Python机器学习基础教程学习笔记(5)——线性模型(回归)

2019-10-09  本文已影响0人  neumeng

Python机器学习基础教程学习笔记(5)——线性模型(回归)

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import mglearn

1 线性回归处理wave数据集

1.1 wave数据集

mglearn.plots.plot_linear_regression_wave()
w[0]: 0.393906  b: -0.031804
output_3_1

1.2 线性回归

from sklearn.linear_model import LinearRegression
X,y = mglearn.datasets.make_wave(n_samples=60)
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42)

lr = LinearRegression().fit(X_train,y_train)
# "斜率"参数(w,也叫做权重或系数),被保存到coef_属性中
print("lr.coef_:{}".format(lr.coef_))
# 偏移或者截距(b),被保存到intercept_属性中
print("lr.intercept_:{}".format(lr.intercept_))
lr.coef_:[0.39390555]
lr.intercept_:-0.031804343026759746
print("Train set score:{:.2f}".format(lr.score(X_train,y_train)))
print("Test set score:{:.2f}".format(lr.score(X_test,y_test)))
Train set score:0.67
Test set score:0.66

2 线性回归处理波士顿房价数据集

2.1 波士顿房价数据集

from sklearn.datasets import load_boston
boston = load_boston()
# boston是Bunch类型数据
print("boston.keys :{}".format(boston.keys()))
boston.keys :dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename'])
# 数据集包括506个数据点,13个特征
print("data shape :{}".format(boston.data.shape))
data shape :(506, 13)

扩展数据集:

# 通过load_extended_boston函数加载导出的数据集
X,y = mglearn.datasets.load_extended_boston()
# 最初的13个特征加上这13个特征两两组合(有放回)得到的91个特征,一共有104个特征
print("X.shape:{}".format(X.shape))
X.shape:(506, 104)

2.2 用线性回归处理

X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=0)
lr = LinearRegression().fit(X_train,y_train)
print("Train set score:{:.2f}".format(lr.score(X_train,y_train)))
print("Test set score:{:.2f}".format(lr.score(X_test,y_test)))
Train set score:0.95
Test set score:0.61

3 岭回归(Ridge regression)

from sklearn.linear_model import Ridge
ridge = Ridge().fit(X_train,y_train)
print("Train set score:{:.2f}".format(ridge.score(X_train,y_train)))
print("Test set score:{:.2f}".format(ridge.score(X_test,y_test)))
Train set score:0.89
Test set score:0.75
ridge10=Ridge(alpha=10).fit(X_train,y_train)
print("Train set score:{:.2f}".format(ridge10.score(X_train,y_train)))
print("Test set score:{:.2f}".format(ridge10.score(X_test,y_test)))
Train set score:0.79
Test set score:0.64
ridge01=Ridge(alpha=0.1).fit(X_train,y_train)
print("Train set score:{:.2f}".format(ridge01.score(X_train,y_train)))
print("Test set score:{:.2f}".format(ridge01.score(X_test,y_test)))
Train set score:0.93
Test set score:0.77
ridge0001=Ridge(alpha=0.001).fit(X_train,y_train)
print("Train set score:{:.2f}".format(ridge0001.score(X_train,y_train)))
print("Test set score:{:.2f}".format(ridge0001.score(X_test,y_test)))
Train set score:0.95
Test set score:0.63
# 约束越强(alpha越大),系数(图中的y值)越趋向于0
# 约束强
plt.plot(ridge10.coef_,"^",label="alpha=10")
# 约束中
plt.plot(ridge.coef_,"s",label="alpha=1")
# 约束弱
plt.plot(ridge01.coef_,"v",label="alpha=0.1")
# 无约束
plt.plot(lr.coef_,"o",label="linear")

plt.xlabel("Coefficient index")
plt.ylabel("Coefficient magnitude")
plt.hlines(0,0,len(lr.coef_))
plt.ylim(-25,25)
plt.legend()
plt.show()
output_31_0
mglearn.plots.plot_ridge_n_samples()
output_33_0.png

4 lasso

from sklearn.linear_model import Lasso
lasso = Lasso().fit(X_train,y_train)
print("Train set score:{:.2f}".format(lasso.score(X_train,y_train)))
print("Test set score:{:.2f}".format(lasso.score(X_test,y_test)))
print("Number of features used:{}".format(np.sum(lasso.coef_!=0)))
Train set score:0.29
Test set score:0.21
Number of features used:4
lasso001 = Lasso(alpha=0.01,max_iter=100000).fit(X_train,y_train)
print("Train set score:{:.2f}".format(lasso001.score(X_train,y_train)))
print("Test set score:{:.2f}".format(lasso001.score(X_test,y_test)))
# 模型用了33个特征,性能好了些
print("Number of features used:{}".format(np.sum(lasso001.coef_!=0)))
Train set score:0.90
Test set score:0.77
Number of features used:33
lasso00001 = Lasso(alpha=0.0001,max_iter=100000).fit(X_train,y_train)
print("Train set score:{:.2f}".format(lasso00001.score(X_train,y_train)))
print("Test set score:{:.2f}".format(lasso00001.score(X_test,y_test)))
# 模型用了96个特征,表现类似无正则化的线性回归(LinearRegression),出现过拟合
print("Number of features used:{}".format(np.sum(lasso00001.coef_!=0)))
Train set score:0.95
Test set score:0.64
Number of features used:96
plt.plot(lasso.coef_, 's', label="Lasso alpha=1")
plt.plot(lasso001.coef_, '^', label="Lasso alpha=0.01")
plt.plot(lasso00001.coef_, 'v', label="Lasso alpha=0.0001")

plt.plot(ridge01.coef_, 'o', label="Ridge alpha=0.1")
plt.legend(ncol=2, loc=(0, 1.05))
plt.ylim(-25, 25)
plt.xlabel("Coefficient index")
plt.ylabel("Coefficient magnitude")
plt.show()
output_41_0.png

实践中:

上一篇 下一篇

猜你喜欢

热点阅读