Machine Learning (4): Gradient Descent
Introduction to Gradient Descent
After defining a loss function, the figure below illustrates the value of the loss function J at each value of the parameter theta. We need to find the theta at which J attains its minimum (here the plot is a 2D plane, i.e., we have only one parameter).
In the equation of a straight line, the derivative is the slope; in the equation of a curve, the derivative is the slope of the tangent line. The derivative tells us how much J changes per unit change in theta.
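In symbols, each step moves theta against the derivative, scaled by the learning rate η; this is the standard update rule that all the code below implements:

$$\theta \leftarrow \theta - \eta\,\frac{dJ}{d\theta}$$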
If η is too small, convergence is slow.
If η is too large, the algorithm may overshoot and even fail to converge.

Other caveats
- Not every function has a unique extremum.
Solutions:
- Run the algorithm several times with randomized initial points (see the sketch after this list).
- The initial point of gradient descent is itself a hyperparameter.
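A minimal sketch of the multi-restart idea, assuming a scalar loss J and derivative dJ like the ones defined in the simulation below (the helper names run_gd and multi_start_gd and the search interval are hypothetical, not from the original post):

```python
import numpy as np

def run_gd(dJ, theta, eta=0.01, n_iters=1000):
    # Plain gradient descent from a single starting point.
    for _ in range(n_iters):
        theta = theta - eta * dJ(theta)
    return theta

def multi_start_gd(J, dJ, n_starts=10, low=-10., high=10.):
    # Run gradient descent from several random initial points and
    # keep the theta that achieves the lowest loss.
    best_theta, best_loss = None, float('inf')
    for _ in range(n_starts):
        theta0 = np.random.uniform(low, high)  # randomized initial point
        theta = run_gd(dJ, theta0)
        if J(theta) < best_loss:
            best_theta, best_loss = theta, J(theta)
    return best_theta
```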
Simulating Gradient Descent
import numpy as np
import matplotlib.pyplot as plt
plot_x = np.linspace(-1., 6., 141)
plot_x
array([-1. , -0.95, -0.9 , -0.85, -0.8 , -0.75, -0.7 , -0.65, -0.6 ,
-0.55, -0.5 , -0.45, -0.4 , -0.35, -0.3 , -0.25, -0.2 , -0.15,
-0.1 , -0.05, 0. , 0.05, 0.1 , 0.15, 0.2 , 0.25, 0.3 ,
0.35, 0.4 , 0.45, 0.5 , 0.55, 0.6 , 0.65, 0.7 , 0.75,
0.8 , 0.85, 0.9 , 0.95, 1. , 1.05, 1.1 , 1.15, 1.2 ,
1.25, 1.3 , 1.35, 1.4 , 1.45, 1.5 , 1.55, 1.6 , 1.65,
1.7 , 1.75, 1.8 , 1.85, 1.9 , 1.95, 2. , 2.05, 2.1 ,
2.15, 2.2 , 2.25, 2.3 , 2.35, 2.4 , 2.45, 2.5 , 2.55,
2.6 , 2.65, 2.7 , 2.75, 2.8 , 2.85, 2.9 , 2.95, 3. ,
3.05, 3.1 , 3.15, 3.2 , 3.25, 3.3 , 3.35, 3.4 , 3.45,
3.5 , 3.55, 3.6 , 3.65, 3.7 , 3.75, 3.8 , 3.85, 3.9 ,
3.95, 4. , 4.05, 4.1 , 4.15, 4.2 , 4.25, 4.3 , 4.35,
4.4 , 4.45, 4.5 , 4.55, 4.6 , 4.65, 4.7 , 4.75, 4.8 ,
4.85, 4.9 , 4.95, 5. , 5.05, 5.1 , 5.15, 5.2 , 5.25,
5.3 , 5.35, 5.4 , 5.45, 5.5 , 5.55, 5.6 , 5.65, 5.7 ,
5.75, 5.8 , 5.85, 5.9 , 5.95, 6. ])
plot_y = (plot_x-2.5)**2 - 1.
plt.plot(plot_x, plot_y)
plt.show()
epsilon = 1e-8  # 10 to the power of -8
eta = 0.1
def J(theta):
    return (theta-2.5)**2 - 1.

def dJ(theta):
    return 2*(theta-2.5)
theta = 0.0
while True:
    gradient = dJ(theta)
    last_theta = theta
    theta = theta - eta * gradient

    if(abs(J(theta) - J(last_theta)) < epsilon):
        break
print(theta)
print(J(theta))
2.499891109642585
-0.99999998814289
theta = 0.0
theta_history = [theta]
while True:
    gradient = dJ(theta)
    last_theta = theta
    theta = theta - eta * gradient
    theta_history.append(theta)

    if(abs(J(theta) - J(last_theta)) < epsilon):
        break
plt.plot(plot_x, J(plot_x))
plt.plot(np.array(theta_history), J(np.array(theta_history)), color="r", marker='+')
plt.show()
len(theta_history)
46
theta_history = []
def gradient_descent(initial_theta, eta, epsilon=1e-8):
    theta = initial_theta
    theta_history.append(initial_theta)

    while True:
        gradient = dJ(theta)
        last_theta = theta
        theta = theta - eta * gradient
        theta_history.append(theta)

        if(abs(J(theta) - J(last_theta)) < epsilon):
            break

def plot_theta_history():
    plt.plot(plot_x, J(plot_x))
    plt.plot(np.array(theta_history), J(np.array(theta_history)), color="r", marker='+')
    plt.show()
eta = 0.01
theta_history = []
gradient_descent(0, eta)
plot_theta_history()
len(theta_history)
424
eta = 0.001
theta_history = []
gradient_descent(0, eta)
plot_theta_history()
len(theta_history)
3682
eta = 0.8
theta_history = []
gradient_descent(0, eta)
plot_theta_history()
eta = 1.1
theta_history = []
gradient_descent(0, eta)
---------------------------------------------------------------------------
OverflowError Traceback (most recent call last)
<ipython-input-41-81ad09e9d3e0> in <module>()
1 eta = 1.1
2 theta_history = []
----> 3 gradient_descent(0, eta)
<ipython-input-35-37822183f4df> in gradient_descent(initial_theta, eta, epsilon)
11 theta_history.append(theta)
12
---> 13 if(abs(J(theta) - J(last_theta)) < epsilon):
14 break
15
<ipython-input-32-e8b6a6f56c24> in J(theta)
1 def J(theta):
----> 2 return (theta-2.5)**2 - 1.
3
4 def dJ(theta):
5 return 2*(theta-2.5)
OverflowError: (34, 'Result too large')
def J(theta):
    try:
        return (theta-2.5)**2 - 1.
    except:
        return float('inf')

def gradient_descent(initial_theta, eta, n_iters = 1e4, epsilon=1e-8):
    theta = initial_theta
    i_iter = 0
    theta_history.append(initial_theta)

    while i_iter < n_iters:
        gradient = dJ(theta)
        last_theta = theta
        theta = theta - eta * gradient
        theta_history.append(theta)

        if(abs(J(theta) - J(last_theta)) < epsilon):
            break

        i_iter += 1

    return
eta = 1.1
theta_history = []
gradient_descent(0, eta)
len(theta_history)
10001
eta = 1.1
theta_history = []
gradient_descent(0, eta, n_iters=10)
plot_theta_history()
Gradient Descent for Multivariate Linear Regression
[Figure: gradient descent in a 3D space (x and y are the coefficients, z is the loss function)]
Derivation
The expression derived above scales with the number of samples: the larger m is, the larger the result. That is unreasonable; we want the gradient to be independent of m.
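The derivation figures are not reproduced here; assuming the standard MSE objective that the code below implements, the loss and its gradient are:

$$J(\theta) = \frac{1}{m}\sum_{i=1}^{m}\left(y^{(i)} - X_b^{(i)}\theta\right)^2,\qquad
\nabla J(\theta) = \frac{2}{m}\begin{pmatrix}
\sum_{i=1}^{m}\left(X_b^{(i)}\theta - y^{(i)}\right)\\
\sum_{i=1}^{m}\left(X_b^{(i)}\theta - y^{(i)}\right)X_1^{(i)}\\
\vdots\\
\sum_{i=1}^{m}\left(X_b^{(i)}\theta - y^{(i)}\right)X_n^{(i)}
\end{pmatrix}$$

The leading 1/m (rather than a bare sum) is exactly what makes the gradient independent of the sample count.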
Using Gradient Descent in a Linear Regression Model
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(666)
x = 2 * np.random.random(size=100)
y = x * 3. + 4. + np.random.normal(size=100)
X = x.reshape(-1, 1)
X[:20]
array([[ 1.40087424],
[ 1.68837329],
[ 1.35302867],
[ 1.45571611],
[ 1.90291591],
[ 0.02540639],
[ 0.8271754 ],
[ 0.09762559],
[ 0.19985712],
[ 1.01613261],
[ 0.40049508],
[ 1.48830834],
[ 0.38578401],
[ 1.4016895 ],
[ 0.58645621],
[ 1.54895891],
[ 0.01021768],
[ 0.22571531],
[ 0.22190734],
[ 0.49533646]])
y[:20]
array([ 8.91412688, 8.89446981, 8.85921604, 9.04490343, 8.75831915,
4.01914255, 6.84103696, 4.81582242, 3.68561238, 6.46344854,
4.61756153, 8.45774339, 3.21438541, 7.98486624, 4.18885101,
8.46060979, 4.29706975, 4.06803046, 3.58490782, 7.0558176 ])
plt.scatter(x, y)
plt.show()
Training with gradient descent
Since this time we pass in real data, whose feature values differ greatly in scale, overflow can still occur, so J guards against it:
def J(theta, X_b, y):
    try:
        return np.sum((y - X_b.dot(theta))**2) / len(X_b)
    except:
        return float('inf')

def dJ(theta, X_b, y):
    res = np.empty(len(theta))
    res[0] = np.sum(X_b.dot(theta) - y)
    for i in range(1, len(theta)):
        res[i] = (X_b.dot(theta) - y).dot(X_b[:,i])
    return res * 2 / len(X_b)

def gradient_descent(X_b, y, initial_theta, eta, n_iters = 1e4, epsilon=1e-8):
    theta = initial_theta
    cur_iter = 0

    while cur_iter < n_iters:
        gradient = dJ(theta, X_b, y)
        last_theta = theta
        theta = theta - eta * gradient

        if(abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon):
            break

        cur_iter += 1

    return theta
X_b = np.hstack([np.ones((len(x), 1)), x.reshape(-1,1)])
initial_theta = np.zeros(X_b.shape[1])
eta = 0.01
theta = gradient_descent(X_b, y, initial_theta, eta)
theta
array([ 4.02145786, 3.00706277])
Encapsulating Our Linear Regression Algorithm
import numpy as np
from .metrics import r2_score


class LinearRegression:

    def __init__(self):
        """Initialize the Linear Regression model"""
        self.coef_ = None
        self.intercept_ = None
        self._theta = None

    def fit_gd(self, X_train, y_train, eta=0.01, n_iters=1e4):
        """Train the Linear Regression model on X_train, y_train using gradient descent"""
        assert X_train.shape[0] == y_train.shape[0], \
            "the size of X_train must be equal to the size of y_train"

        def J(theta, X_b, y):
            try:
                return np.sum((y - X_b.dot(theta)) ** 2) / len(y)
            except:
                return float('inf')

        def dJ(theta, X_b, y):
            res = np.empty(len(theta))
            res[0] = np.sum(X_b.dot(theta) - y)
            for i in range(1, len(theta)):
                res[i] = (X_b.dot(theta) - y).dot(X_b[:, i])
            return res * 2 / len(X_b)

        def gradient_descent(X_b, y, initial_theta, eta, n_iters=1e4, epsilon=1e-8):
            theta = initial_theta
            cur_iter = 0
            while cur_iter < n_iters:
                gradient = dJ(theta, X_b, y)
                last_theta = theta
                theta = theta - eta * gradient
                if (abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon):
                    break
                cur_iter += 1
            return theta

        X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
        initial_theta = np.zeros(X_b.shape[1])
        self._theta = gradient_descent(X_b, y_train, initial_theta, eta, n_iters)

        self.intercept_ = self._theta[0]
        self.coef_ = self._theta[1:]

        return self

    def predict(self, X_predict):
        """Given a dataset X_predict to predict on, return a vector of predictions"""
        assert self.intercept_ is not None and self.coef_ is not None, \
            "must fit before predict!"
        assert X_predict.shape[1] == len(self.coef_), \
            "the feature number of X_predict must be equal to X_train"

        X_b = np.hstack([np.ones((len(X_predict), 1)), X_predict])
        return X_b.dot(self._theta)

    def score(self, X_test, y_test):
        """Evaluate the model's accuracy on the test set X_test, y_test"""
        y_predict = self.predict(X_test)
        return r2_score(y_test, y_predict)

    def __repr__(self):
        return "LinearRegression()"
Testing Our Linear Regression Algorithm
from playML.LinearRegression import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit_gd(X, y)
LinearRegression()
lin_reg.coef_
array([ 3.00706277])
lin_reg.intercept_
4.021457858204859
Vectorization
The row vector expanded above has m entries, written out one by one from the formula, and the block it is dotted with is X_b; the expanded dot product equals the original expression on the left. This is a 1×m vector dotted with an m×(n+1) matrix, which yields a 1×(n+1) row vector. Although numpy does not distinguish row vectors from column vectors, the final formula here is obtained by transposing the whole vector expression and then expanding it.
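In symbols, this is the vectorized gradient that the revised dJ below implements:

$$\nabla J(\theta) = \frac{2}{m}\,X_b^{T}\left(X_b\,\theta - y\right)$$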
Revising the earlier derivative function
def dJ(theta, X_b, y):
    ## res = np.empty(len(theta))
    ## res[0] = np.sum(X_b.dot(theta) - y)
    ##
    ## for i in range(1, len(theta)):
    ##     res[i] = np.sum((X_b.dot(theta) - y).dot(X_b[:, i]))
    ##
    ## return res * 2 / len(X_b)
    return X_b.T.dot(X_b.dot(theta) - y) * 2. / len(X_b)
Vectorizing Gradient Descent
import numpy as np
from sklearn import datasets
boston = datasets.load_boston()
X = boston.data
y = boston.target
X = X[y < 50.0]
y = y[y < 50.0]
from playML.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, seed=666)
from playML.LinearRegression import LinearRegression
lin_reg1 = LinearRegression()
%time lin_reg1.fit_normal(X_train, y_train)
lin_reg1.score(X_test, y_test)
CPU times: user 45.6 ms, sys: 5.74 ms, total: 51.3 ms
Wall time: 56.4 ms
0.81298026026584658
Using gradient descent
lin_reg2 = LinearRegression()
lin_reg2.fit_gd(X_train, y_train)
/Users/yuanzhang/Dropbox/code/My MOOC/Play-with-Machine-Learning-Algorithms/06-Gradient-Descent/05-Vectorize-Gradient-Descent/playML/LinearRegression.py:32: RuntimeWarning: overflow encountered in square
return np.sum((y - X_b.dot(theta)) ** 2) / len(y)
/Users/yuanzhang/Dropbox/code/My MOOC/Play-with-Machine-Learning-Algorithms/06-Gradient-Descent/05-Vectorize-Gradient-Descent/playML/LinearRegression.py:48: RuntimeWarning: invalid value encountered in double_scalars
if (abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon):
LinearRegression()
lin_reg2.coef_
array([ nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
nan, nan])
lin_reg2.fit_gd(X_train, y_train, eta=0.000001)
LinearRegression()
lin_reg2.score(X_test, y_test)
0.27556634853389195
%time lin_reg2.fit_gd(X_train, y_train, eta=0.000001, n_iters=1e6)
CPU times: user 48.4 s, sys: 265 ms, total: 48.6 s
Wall time: 49.9 s
LinearRegression()
lin_reg2.score(X_test, y_test)
0.75418523539807636
With real data, tuning eta and n_iters runs into a dilemma: an eta small enough to avoid overflow fails to reach the true result within the iteration budget, while raising the iteration count makes training take much longer. The root cause is that the features have very different scales, so we need to normalize the data.
Data Normalization
Normalize the data before applying gradient descent
from sklearn.preprocessing import StandardScaler
standardScaler = StandardScaler()
standardScaler.fit(X_train)
X_train_standard = standardScaler.transform(X_train)
lin_reg3 = LinearRegression()
%time lin_reg3.fit_gd(X_train_standard, y_train)
CPU times: user 237 ms, sys: 4.37 ms, total: 242 ms
Wall time: 258 ms
LinearRegression()
X_test_standard = standardScaler.transform(X_test)
lin_reg3.score(X_test_standard, y_test)
0.81298806201222351
The Advantage of Gradient Descent
m = 1000
n = 5000
big_X = np.random.normal(size=(m, n))
true_theta = np.random.uniform(0.0, 100.0, size=n+1)
big_y = big_X.dot(true_theta[1:]) + true_theta[0] + np.random.normal(0., 10., size=m)
big_reg1 = LinearRegression()
%time big_reg1.fit_normal(big_X, big_y)
CPU times: user 18.8 s, sys: 899 ms, total: 19.7 s
Wall time: 10.9 s
LinearRegression()
big_reg2 = LinearRegression()
%time big_reg2.fit_gd(big_X, big_y)
CPU times: user 9.51 s, sys: 121 ms, total: 9.63 s
Wall time: 5.76 s
LinearRegression()
If the number of samples is very large, even gradient descent becomes slow, because every sample participates in every iteration. In that case we turn to stochastic gradient descent.
Stochastic Gradient Descent
Introduction to stochastic gradient descent
One problem with batch gradient descent is that η must be set fairly small, which makes it quite slow when there are many samples. Looking at the derivative formula of the loss function, we sum over every sample X_b^(i) and then divide by m at the end; we can cancel the summation against the division by m and instead use a single randomly chosen X_b^(i) at each step.
Because we use stochastic gradient descent, the result no longer moves toward the minimum as steadily as in batch gradient descent; it descends along a wandering path. We therefore want η to shrink as we approach the bottom: as the iteration count grows, the steps get smaller, so the result is computed more precisely.
This borrows the idea of simulated annealing; the learning-rate schedule appears below.
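The schedule used in the code later in this section is

$$\eta_t = \frac{t_0}{t + t_1}$$

so the step size decays as the iteration count t grows.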
Stochastic gradient descent (SGD): update the parameters using a single sample at a time, i.e., m equals 1 in the formula. Each update uses one sample, and we update many times. With a very large sample size (say, hundreds of thousands), a few tens of thousands or even a few thousand samples may already bring theta to the optimum; batch gradient descent, by contrast, needs all hundreds of thousands of samples for a single iteration, one iteration cannot reach the optimum, and 10 iterations mean traversing the training set 10 times, which is computationally very expensive.
However, SGD is noisier than BGD, so not every iteration moves toward the overall optimum.
Pros: fast training.
Cons: lower accuracy, not necessarily the global optimum; not easy to parallelize.
In terms of iteration count, SGD needs many iterations, and its search through the solution space looks blind.
Batch gradient descent (BGD) is the most basic form: every parameter update uses all the samples, i.e., m in equation (1) is the total number of samples.
Pros: a global optimum; easy to parallelize.
Cons: training is slow when there are many samples.
Mini-batch gradient descent (MBGD): every parameter update uses a subset of the samples, i.e., m in equation (1) is greater than 1 and less than the total number of samples. It overcomes the drawbacks of both methods while keeping their advantages (a minimal sketch follows this list).
- From the formula, batch gradient descent uses the whole training set at every step, and the final result is a global optimum.
- The stochastic update rule has no summation sign, meaning only one training sample is used per update; the final result may be the global optimum or only a local one.
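A minimal MBGD sketch in the same style as the J/dJ functions used throughout this post (the batch_size parameter, epoch loop, and helper names are my additions, not from the original):

```python
import numpy as np

def dJ_mini_batch(theta, X_b_batch, y_batch):
    # Same vectorized gradient as before, computed on a mini-batch.
    return X_b_batch.T.dot(X_b_batch.dot(theta) - y_batch) * 2. / len(y_batch)

def mbgd(X_b, y, initial_theta, eta=0.01, batch_size=16, n_epochs=5):
    theta = initial_theta
    m = len(X_b)
    for _ in range(n_epochs):
        indexes = np.random.permutation(m)  # reshuffle once per epoch
        for start in range(0, m, batch_size):
            batch = indexes[start:start + batch_size]
            gradient = dJ_mini_batch(theta, X_b[batch], y[batch])
            theta = theta - eta * gradient
    return theta
```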
Stochastic gradient descent in practice
import numpy as np
import matplotlib.pyplot as plt
m = 100000
x = np.random.normal(size=m)
X = x.reshape(-1,1)
y = 4.*x + 3. + np.random.normal(0, 3, size=m)
plt.scatter(x, y)
plt.show()
Plain (batch) gradient descent
def J(theta, X_b, y):
    try:
        return np.sum((y - X_b.dot(theta)) ** 2) / len(y)
    except:
        return float('inf')

def dJ(theta, X_b, y):
    return X_b.T.dot(X_b.dot(theta) - y) * 2. / len(y)

def gradient_descent(X_b, y, initial_theta, eta, n_iters=1e4, epsilon=1e-8):
    theta = initial_theta
    cur_iter = 0
    while cur_iter < n_iters:
        gradient = dJ(theta, X_b, y)
        last_theta = theta
        theta = theta - eta * gradient
        if (abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon):
            break
        cur_iter += 1
    return theta
%%time
X_b = np.hstack([np.ones((len(X), 1)), X])
initial_theta = np.zeros(X_b.shape[1])
eta = 0.01
theta = gradient_descent(X_b, y, initial_theta, eta)
CPU times: user 2.81 s, sys: 134 ms, total: 2.94 s
Wall time: 1.88 s
theta
array([ 3.00383464, 4.01706856])
Stochastic gradient descent
def dJ_sgd(theta, X_b_i, y_i):
    return 2 * X_b_i.T.dot(X_b_i.dot(theta) - y_i)

def sgd(X_b, y, initial_theta, n_iters):
    t0, t1 = 5, 50

    def learning_rate(t):
        return t0 / (t + t1)

    theta = initial_theta
    for cur_iter in range(n_iters):
        rand_i = np.random.randint(len(X_b))
        gradient = dJ_sgd(theta, X_b[rand_i], y[rand_i])
        theta = theta - learning_rate(cur_iter) * gradient

    return theta
%%time
X_b = np.hstack([np.ones((len(X), 1)), X])
initial_theta = np.zeros(X_b.shape[1])
theta = sgd(X_b, y, initial_theta, n_iters=m//3)
CPU times: user 559 ms, sys: 22.6 ms, total: 582 ms
Wall time: 647 ms
theta
array([ 3.04732375, 4.03214249])
Encapsulating and Testing Stochastic Gradient Descent
def fit_sgd(self, X_train, y_train, n_iters=5, t0=5, t1=50):
    """
    Train the Linear Regression model on X_train, y_train using stochastic gradient descent
    :param X_train:
    :param y_train:
    :param n_iters: in SGD, n_iters is the number of passes over the whole sample set
    :param t0:
    :param t1:
    :return:
    """
    assert X_train.shape[0] == y_train.shape[0], \
        "the size of X_train must be equal to the size of y_train"
    assert n_iters >= 1

    def dJ_sgd(theta, X_b_i, y_i):
        """
        Evaluate the derivative formula on one randomly chosen element of X_b, y
        :param theta:
        :param X_b_i:
        :param y_i:
        :return:
        """
        return X_b_i * (X_b_i.dot(theta) - y_i) * 2.

    def sgd(X_b, y, initial_theta, n_iters, t0=5, t1=50):

        def learning_rate(t):
            """
            Compute the learning rate; t1 slows down the decay, t0 sets the initial scale
            :param t: the t-th iteration
            :return:
            """
            return t0 / (t + t1)

        theta = initial_theta
        m = len(X_b)

        for cur_iter in range(n_iters):
            ## Shuffle X_b into a random order
            indexes = np.random.permutation(m)
            X_b_new = X_b[indexes]
            y_new = y[indexes]
            ## One full pass over the dataset
            for i in range(m):
                gradient = dJ_sgd(theta, X_b_new[i], y_new[i])
                theta = theta - learning_rate(cur_iter * m + i) * gradient

        return theta

    X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
    initial_theta = np.random.randn(X_b.shape[1])
    self._theta = sgd(X_b, y_train, initial_theta, n_iters, t0, t1)

    self.intercept_ = self._theta[0]
    self.coef_ = self._theta[1:]

    return self
Packaging Our Own SGD
import numpy as np
import matplotlib.pyplot as plt
m = 100000
x = np.random.normal(size=m)
X = x.reshape(-1,1)
y = 4.*x + 3. + np.random.normal(0, 3, size=m)
from playML.LinearRegression import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit_bgd(X, y)
print(lin_reg.intercept_, lin_reg.coef_)
3.01644183437 [ 3.99995653]
lin_reg = LinearRegression()
lin_reg.fit_sgd(X, y, n_iters=2)
print(lin_reg.intercept_, lin_reg.coef_)
2.99558568395 [ 4.02610767]
Using Our Own SGD on Real Data
from sklearn import datasets
boston = datasets.load_boston()
X = boston.data
y = boston.target
X = X[y < 50.0]
y = y[y < 50.0]
from playML.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, seed=666)
from sklearn.preprocessing import StandardScaler
standardScaler = StandardScaler()
standardScaler.fit(X_train)
X_train_standard = standardScaler.transform(X_train)
X_test_standard = standardScaler.transform(X_test)
from playML.LinearRegression import LinearRegression
lin_reg = LinearRegression()
%time lin_reg.fit_sgd(X_train_standard, y_train, n_iters=2)
lin_reg.score(X_test_standard, y_test)
CPU times: user 11.9 ms, sys: 4.13 ms, total: 16.1 ms
Wall time: 13.5 ms
0.78651716204682975
%time lin_reg.fit_sgd(X_train_standard, y_train, n_iters=50)
lin_reg.score(X_test_standard, y_test)
CPU times: user 155 ms, sys: 8.11 ms, total: 163 ms
Wall time: 158 ms
0.80857287165738345
%time lin_reg.fit_sgd(X_train_standard, y_train, n_iters=100)
lin_reg.score(X_test_standard, y_test)
CPU times: user 282 ms, sys: 5.11 ms, total: 287 ms
Wall time: 287 ms
0.81294846132723497
SGD in scikit-learn
from sklearn.linear_model import SGDRegressor
sgd_reg = SGDRegressor()
%time sgd_reg.fit(X_train_standard, y_train)
sgd_reg.score(X_test_standard, y_test)
CPU times: user 631 µs, sys: 54 µs, total: 685 µs
Wall time: 690 µs
0.80478459701573024
sgd_reg = SGDRegressor(n_iter=50)
%time sgd_reg.fit(X_train_standard, y_train)
sgd_reg.score(X_test_standard, y_test)
CPU times: user 4.5 ms, sys: 1.04 ms, total: 5.54 ms
Wall time: 3.94 ms
0.81199073931878407
Note that sklearn's gradient descent implementation is far more sophisticated than ours and beats it in both performance and accuracy. Our implementation only demonstrates the process; in production, use what sklearn provides.
Debugging Gradient Descent
How gradient-descent debugging works
We may derive the gradient formula and implement it in Python, and the program may run without raising any error, yet the gradient it computes can still be wrong. How do we debug and catch such a mistake?
Start with the 2D plane: the derivative of a curve at a point O is the slope of its tangent there. Take one point on each side of O, call them A and B; the slope of line AB approximately equals the slope of the tangent at O, and the closer A and B are to O, the closer the two slopes become. In fact this is precisely the definition of the derivative: when the argument x of a function y = f(x) gains an increment Δx at a point x0, the limit a of the ratio Δy/Δx as Δx tends to 0, if it exists, is the derivative at x0, written f'(x0) or df(x0)/dx.
Extended to multiple dimensions, the same idea applies to each coordinate of theta, as in the formula below.
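Perturbing one coordinate at a time gives the central-difference approximation (this is exactly what dJ_debug below computes; e_i is the i-th standard basis vector):

$$\frac{\partial J}{\partial \theta_i} \approx \frac{J(\theta + \epsilon e_i) - J(\theta - \epsilon e_i)}{2\epsilon}$$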
Debugging gradient descent
Debugging the gradient computation
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(666)
X = np.random.random(size=(1000, 10))
true_theta = np.arange(1, 12, dtype=float)
X_b = np.hstack([np.ones((len(X), 1)), X])
y = X_b.dot(true_theta) + np.random.normal(size=1000)
true_theta
array([ 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11.])
X.shape
(1000, 10)
y.shape
(1000,)
def J(theta, X_b, y):
    try:
        return np.sum((y - X_b.dot(theta))**2) / len(X_b)
    except:
        return float('inf')

def dJ_math(theta, X_b, y):
    return X_b.T.dot(X_b.dot(theta) - y) * 2. / len(y)

def dJ_debug(theta, X_b, y, epsilon=0.01):
    res = np.empty(len(theta))
    for i in range(len(theta)):
        theta_1 = theta.copy()
        theta_1[i] += epsilon
        theta_2 = theta.copy()
        theta_2[i] -= epsilon
        res[i] = (J(theta_1, X_b, y) - J(theta_2, X_b, y)) / (2 * epsilon)
    return res

def gradient_descent(dJ, X_b, y, initial_theta, eta, n_iters = 1e4, epsilon=1e-8):
    theta = initial_theta
    cur_iter = 0
    while cur_iter < n_iters:
        gradient = dJ(theta, X_b, y)
        last_theta = theta
        theta = theta - eta * gradient
        if(abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon):
            break
        cur_iter += 1
    return theta
X_b = np.hstack([np.ones((len(X), 1)), X])
initial_theta = np.zeros(X_b.shape[1])
eta = 0.01
%time theta = gradient_descent(dJ_debug, X_b, y, initial_theta, eta)
theta
CPU times: user 13.8 s, sys: 283 ms, total: 14.1 s
Wall time: 7.6 s
array([ 1.1251597 , 2.05312521, 2.91522497, 4.11895968,
5.05002117, 5.90494046, 6.97383745, 8.00088367,
8.86213468, 9.98608331, 10.90529198])
%time theta = gradient_descent(dJ_math, X_b, y, initial_theta, eta)
theta
CPU times: user 1.57 s, sys: 30.6 ms, total: 1.6 s
Wall time: 856 ms
array([ 1.1251597 , 2.05312521, 2.91522497, 4.11895968,
5.05002117, 5.90494046, 6.97383745, 8.00088367,
8.86213468, 9.98608331, 10.90529198])
As we can see, dJ_debug and dJ_math produce nearly identical results, so the mathematical derivation in dJ_math is correct.
Before doing the real machine learning, we can use the dJ_debug approach to verify that dJ_math is correct, and only then train.
dJ_debug is generic: it can be used in any gradient-debugging scenario, so it can live in our machine-learning toolbox (a packaged sketch follows).
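One hypothetical way to package it as a toolbox item (the gradient_check name and tolerance handling are my additions, not from the original post):

```python
import numpy as np

def gradient_check(J, dJ_math, theta, *args, eps=0.01, tol=1e-4):
    # Compare an analytic gradient against the central-difference
    # approximation; True means dJ_math agrees numerically with J.
    approx = np.empty(len(theta))
    for i in range(len(theta)):
        theta_1, theta_2 = theta.copy(), theta.copy()
        theta_1[i] += eps
        theta_2[i] -= eps
        approx[i] = (J(theta_1, *args) - J(theta_2, *args)) / (2 * eps)
    return np.allclose(dJ_math(theta, *args), approx, rtol=tol)
```

For the example above: gradient_check(J, dJ_math, initial_theta, X_b, y).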