03:6大监督学习模型:毒蘑菇分类
2022-12-21 本文已影响0人
Jachin111
数据EDA
# 导入数据
import pandas as pd
import numpy as np
import plotly_express as px
from matplotlib import pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
data = pd.read_csv("mushrooms.csv")
data.shape
![](https://img.haomeiwen.com/i9248907/36efaa6a30abff2d.png)
data.columns
![](https://img.haomeiwen.com/i9248907/cccab8922ee71f80.png)
# 缺失值
data.isnull().sum()
![](https://img.haomeiwen.com/i9248907/3fd0ed6fbece178e.png)
# 有无毒对比
data["class"].value_counts()
![](https://img.haomeiwen.com/i9248907/f6ab4240a4789b89.png)
可视化分析
# Count how often each cap-color value occurs and expose the result as a
# two-column table ("color", "number") for plotting.
cap = (
    data["cap-color"]
    .value_counts()
    .rename_axis("color")
    .reset_index(name="number")
)
cap
![](https://img.haomeiwen.com/i9248907/439b1ef7d9ece890.png)
fig = px.bar(cap,x="color",
y="number",
color="number",
text="number",
color_continuous_scale="rainbow")
fig.show()
![](https://img.haomeiwen.com/i9248907/5532c9da85781ddc.png)
cap_class = data.groupby(["class","cap-color"]).size().reset_index()
cap_class.columns = ["class","color","number"]
cap_class.head()
![](https://img.haomeiwen.com/i9248907/ce5bd9f44244cfb6.png)
fig = px.bar(cap_class,x="color",
y="number",
color="class",
text="number",
barmode="group")
fig.show()
![](https://img.haomeiwen.com/i9248907/04c641190dbff739.png)
# 菌的气味
odor = data["odor"].value_counts().reset_index()
odor.columns = ["odor","number"]
odor
![](https://img.haomeiwen.com/i9248907/a5437f517687ddab.png)
fig = px.bar(odor,
x="odor",
y="number",
color="number",
text="number",
color_continuous_scale="rainbow")
fig.show()
![](https://img.haomeiwen.com/i9248907/ac5c9a06f94a0cc3.png)
odor_class = data.groupby(["class","odor"]).size().reset_index()
odor_class.columns = ["class","odor","number"]
odor_class.head()
![](https://img.haomeiwen.com/i9248907/597ecc508cee39b5.png)
fig = px.bar(odor_class,
x="odor",
y="number",
color="class",
text="number",
barmode="group")
fig.show()
![](https://img.haomeiwen.com/i9248907/242d333f0576cea2.png)
特征工程
# 特征转换
data.head()
![](https://img.haomeiwen.com/i9248907/af1f740ce6b9bd5b.png)
from sklearn.preprocessing import LabelEncoder

# Replace every column of `data` (the "class" column included) with the
# integer codes produced by LabelEncoder. The encoder is re-fitted on each
# column, so the codes are independent per column.
labelencoder = LabelEncoder()
for col in data.columns:
    # The loop body must be indented; the flattened original was a SyntaxError.
    data[col] = labelencoder.fit_transform(data[col])
data.head()
![](https://img.haomeiwen.com/i9248907/107ea4ae973d2423.png)
data["stalk-color-above-ring"].unique()
![](https://img.haomeiwen.com/i9248907/7c3680aed35b3911.png)
data.groupby("class").size()
![](https://img.haomeiwen.com/i9248907/45a7b851e31395c3.png)
# 数据分布
data["stalk-color-above-ring"].value_counts()
![](https://img.haomeiwen.com/i9248907/a8a58489fcc59eb8.png)
ax = sns.boxplot(x='class',
y='stalk-color-above-ring',
data=data)
ax = sns.stripplot(x='class',
y='stalk-color-above-ring',
data=data,
jitter=True,
edgecolor="gray")
plt.title("Class w.r.t stalkcolor above ring",fontsize=12)
plt.show()
![](https://img.haomeiwen.com/i9248907/dc635f8f42050e7b.png)
# 分离特征和标签
x = data.iloc[:,1:23]
y = data.iloc[:,0]
# 数据标准化
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(x)
X
![](https://img.haomeiwen.com/i9248907/b4944435ff5a6743.png)
特征相关性
corr = data.corr()
sns.heatmap(corr)
plt.show()
![](https://img.haomeiwen.com/i9248907/e89fde08dc01b89d.png)
主成分分析PCA
# PCA过程
from sklearn.decomposition import PCA
pca = PCA()
pca.fit_transform(X)
covariance = pca.get_covariance()
explained_variance = pca.explained_variance_
explained_variance
![](https://img.haomeiwen.com/i9248907/f39358bcd5a2fa65.png)
# Bar chart of the variance captured by each principal component.
with plt.style.context("dark_background"):
    plt.figure(figsize=(6,4))
    # Size the x axis from the data instead of a hard-coded 22 so the plot
    # keeps working if the number of fitted components changes.
    plt.bar(range(len(explained_variance)),
            explained_variance,
            alpha=0.5,
            align="center",
            label="individual explained variance")
    # The plotted values come from pca.explained_variance_ (absolute
    # variance), not explained_variance_ratio_, so the axis is labelled
    # accordingly.
    plt.ylabel('Explained variance')
    plt.xlabel('Principal components')
    plt.legend(loc="best")
    plt.tight_layout()
![](https://img.haomeiwen.com/i9248907/42d67680f01baa6c.png)
# Scatter of the encoded (unscaled) data projected onto 2 principal components.
# NOTE(review): data.values contains every column of `data`, so the encoded
# "class" label (column 0, the one used for `y`) is part of the PCA input —
# confirm this is intended.
N = data.values
pca = PCA(n_components=2)
# NOTE: this rebinds `x`, which previously held the feature columns
# data.iloc[:,1:23]; from here on `x` is the 2-column projection.
x = pca.fit_transform(N)
plt.figure(figsize=(5,5))
plt.scatter(x[:,0],x[:,1])
plt.show()
![](https://img.haomeiwen.com/i9248907/fdb67290e2b3e56d.png)
from sklearn.cluster import KMeans

# Cluster the full value matrix into two groups with k-means, then redraw the
# 2-D PCA scatter with each point coloured by its cluster id.
N = data.values
km = KMeans(random_state=5, n_clusters=2)
X_clustered = km.fit_predict(N)

# cluster id -> matplotlib colour code
label_color_map = {0:"g",1:"y"}
label_color = [label_color_map[cluster_id] for cluster_id in X_clustered]

plt.figure(figsize=(5,5))
plt.scatter(x[:,0], x[:,1], c=label_color)
plt.show()
![](https://img.haomeiwen.com/i9248907/559304d74ed3fbfd.png)
# Modelling on 17 principal components.
pca_modified = PCA(n_components=17)
# Keep the projection: the original discarded the return value, so the
# train/test split that follows still used the 22 scaled features and the PCA
# step had no effect on any model. Rebinding X makes every later use of X
# (split, cross-validation) operate on the 17-component representation.
# NOTE: run this cell once per session; re-running applies PCA to the
# already-projected X.
X = pca_modified.fit_transform(X)
X
![](https://img.haomeiwen.com/i9248907/bc5a03523d8c13d3.png)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=4)
模型1:逻辑回归
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn import metrics
model_LR = LogisticRegression()
model_LR.fit(X_train,y_train)
![](https://img.haomeiwen.com/i9248907/5063171cfef4d141.png)
y_prob = model_LR.predict_proba(X_test)[:,1]
y_prob
![](https://img.haomeiwen.com/i9248907/7401d39fdb786165.png)
y_pred = np.where(y_prob>0.5,1,0)
y_pred
![](https://img.haomeiwen.com/i9248907/ba01cc46b7bfc7e6.png)
# Accuracy on the held-out set. Score against the true labels: comparing the
# model with its own predictions (y_pred) trivially reports a perfect score.
model_LR.score(X_test,y_test)
![](https://img.haomeiwen.com/i9248907/cdd0736b87efc056.png)
confusion_matrix = metrics.confusion_matrix(y_test,y_pred)
confusion_matrix
![](https://img.haomeiwen.com/i9248907/76ac2a5adbaf6c6c.png)
auc_roc = metrics.roc_auc_score(y_test,y_pred)
auc_roc
![](https://img.haomeiwen.com/i9248907/e0ade54cb27e32c5.png)
# 真假阳性
from sklearn.metrics import roc_curve,auc
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test,y_prob)
roc_auc = auc(false_positive_rate,true_positive_rate)
roc_auc
![](https://img.haomeiwen.com/i9248907/489805b733712a1e.png)
# ROC曲线
import matplotlib.pyplot as plt
plt.figure(figsize=(10,10))
plt.title("ROC")
plt.plot(false_positive_rate,true_positive_rate,color="red",label="AUC=%0.2f"%roc_auc)
plt.legend(loc="lower right")
plt.plot([0,1],[0,1],linestyle="--")
plt.axis("tight")
plt.ylabel("True Positive Rate")
plt.xlabel("False Positive Rate")
plt.show()
![](https://img.haomeiwen.com/i9248907/a37385bdcf16a8ee.png)
# Grid search over regularisation strength and penalty type for logistic
# regression, using 10-fold cross-validation on the training split.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn import metrics
# The grid contains both 'l1' and 'l2' penalties. The default 'lbfgs' solver
# does not support 'l1', so those candidates would fail to fit (unnoticed,
# because warnings are suppressed at the top of the file). 'liblinear'
# supports both penalties.
LR_model = LogisticRegression(solver="liblinear")
tuned_parameters = {"C":[0.001,0.01,0.1,1,10,100,1000],
                    "penalty":['l1','l2']}
from sklearn.model_selection import GridSearchCV
LR = GridSearchCV(LR_model,tuned_parameters,cv=10)
LR.fit(X_train,y_train)
print(LR.best_params_)
![](https://img.haomeiwen.com/i9248907/2f717e57bacd824a.png)
# Positive-class probabilities from the best grid-search estimator,
# thresholded at 0.5 to obtain hard labels.
y_prob = LR.predict_proba(X_test)[:,1]
y_pred = np.where(y_prob>0.5,1,0)
# Score against the true labels; scoring against y_pred compares the model
# with itself and always looks perfect.
LR.score(X_test,y_test)
![](https://img.haomeiwen.com/i9248907/ab0c02a3f00efa34.png)
confusion_matrix = metrics.confusion_matrix(y_test,y_pred)
confusion_matrix
![](https://img.haomeiwen.com/i9248907/771f2a1cafe45f81.png)
auc_roc = metrics.classification_report(y_test,y_pred)
print(auc_roc)
![](https://img.haomeiwen.com/i9248907/4885c70a4fc56b7c.png)
auc_roc = metrics.roc_auc_score(y_test,y_pred)
auc_roc
![](https://img.haomeiwen.com/i9248907/6c9a6700aafd585c.png)
# ROC curve for the grid-searched logistic regression.
from sklearn.metrics import roc_curve,auc
false_positive_rate,true_positive_rate,thresholds = roc_curve(y_test,y_prob)
# Recompute the AUC from this curve. Without it the legend shows the stale
# roc_auc value left over from the untuned model.
roc_auc = auc(false_positive_rate,true_positive_rate)
import matplotlib.pyplot as plt
plt.figure(figsize=(10,10))
plt.title("ROC")
plt.plot(false_positive_rate,true_positive_rate,color="red",label="AUC = %0.2f"%roc_auc)
plt.legend(loc="lower right")
# Diagonal reference line (true positive rate equals false positive rate).
plt.plot([0,1],[0,1],linestyle="--")
plt.axis("tight")
plt.ylabel("True Positive Rate")
plt.xlabel("False Positive Rate")
plt.show()
![](https://img.haomeiwen.com/i9248907/32a0ba525fc1cff5.png)
模型2:高斯朴素贝叶斯
# Gaussian naive Bayes with default settings, fitted on the training split.
from sklearn.naive_bayes import GaussianNB
model_naive = GaussianNB()
model_naive.fit(X_train,y_train)
# Positive-class probabilities, thresholded at 0.5 to obtain hard labels.
y_prob = model_naive.predict_proba(X_test)[:,1]
y_pred = np.where(y_prob>0.5,1,0)
# Score against the true labels; scoring against y_pred compares the model
# with itself and always looks perfect.
model_naive.score(X_test,y_test)
![](https://img.haomeiwen.com/i9248907/56b55ff0d720ed90.png)
print(f"Number of mislabeled points from {X_test.shape[0]} points:{(y_test!=y_pred).sum()}")
![](https://img.haomeiwen.com/i9248907/b859720a7f486d25.png)
# 交叉验证
scores = cross_val_score(model_naive,X,y,cv=10,scoring="accuracy")
scores
![](https://img.haomeiwen.com/i9248907/419cb8212c3366f7.png)
scores.mean()
![](https://img.haomeiwen.com/i9248907/7ff7e0ff327f62c6.png)
# 混淆矩阵和AUC
confusion_matrix = metrics.confusion_matrix(y_test,y_pred)
confusion_matrix
![](https://img.haomeiwen.com/i9248907/abbc9dc09ee8a3e4.png)
auc_roc = metrics.classification_report(y_test,y_pred)
print(auc_roc)
![](https://img.haomeiwen.com/i9248907/d7873ce6ca1390da.png)
# 真假阳性
from sklearn.metrics import roc_curve,auc
false_positive_rate,true_positive_rate,thresholds = roc_curve(y_test,y_prob)
roc_auc = auc(false_positive_rate,true_positive_rate)
roc_auc
![](https://img.haomeiwen.com/i9248907/d3009c2d49a268d5.png)
# ROC曲线
import matplotlib.pyplot as plt
plt.figure(figsize=(10,10))
plt.title("ROC")
plt.plot(false_positive_rate,true_positive_rate,color="red",label="AUC = %0.2f"%roc_auc)
plt.legend(loc="lower right")
plt.plot([0,1],[0,1],linestyle='--')
plt.axis("tight")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.show()
![](https://img.haomeiwen.com/i9248907/ad3f0e9dfdf8e7ca.png)
模型3:支持向量机SVM
# Model setup: support vector classifier tuned with a randomized search.
from sklearn.svm import SVC
svm_model = SVC()
# Two sub-grids, given as a list of dicts. The original wrote both into one
# dict literal, where the repeated 'C' and 'kernel' keys silently overwrote
# the first sub-grid, so the linear kernel was never tried.
tuned_parameters = [
    {
        'C':[1,10,100,500,1000],
        'kernel':['linear','rbf'],
    },
    {
        'C':[1,10,100,500,1000],
        'gamma':[1,0.1,0.01,0.001,0.0001],
        'kernel':['rbf'],
    },
]
# Randomized search (RandomizedSearchCV): 20 sampled candidates, 10-fold CV,
# selected by accuracy.
from sklearn.model_selection import RandomizedSearchCV
model_svm = RandomizedSearchCV(svm_model,
                               tuned_parameters,
                               cv=10,
                               scoring="accuracy",
                               n_iter=20)
model_svm.fit(X_train,y_train)
![](https://img.haomeiwen.com/i9248907/ae7cd9c202caf642.png)
print(model_svm.best_score_)
![](https://img.haomeiwen.com/i9248907/f77b15835aa534f3.png)
model_svm.best_params_
![](https://img.haomeiwen.com/i9248907/f07789d9e0bfef5e.png)
y_pred = model_svm.predict(X_test)
metrics.accuracy_score(y_pred,y_test)
![](https://img.haomeiwen.com/i9248907/8f7c8453f2763512.png)
# 混淆矩阵
metrics.confusion_matrix(y_test,y_pred)
![](https://img.haomeiwen.com/i9248907/d51d0b640adf21d9.png)
print(metrics.classification_report(y_test,y_pred))
![](https://img.haomeiwen.com/i9248907/790caa11ea31f0ca.png)
metrics.roc_auc_score(y_test,y_pred)
![](https://img.haomeiwen.com/i9248907/109b5bfe38c644f3.png)
# ROC curve for the tuned SVM.
from sklearn.metrics import roc_curve,auc
# Use the continuous decision-function values as the ranking score. Feeding
# the hard 0/1 predictions into roc_curve collapses the curve to a single
# operating point. (SVC is constructed without probability=True here, so
# predict_proba is not available.)
y_score = model_svm.decision_function(X_test)
false_positive_rate,true_positive_rate,thresholds = roc_curve(y_test,y_score)
roc_auc = auc(false_positive_rate,true_positive_rate)
import matplotlib.pyplot as plt
plt.figure(figsize=(10,10))
plt.title("ROC")
plt.plot(false_positive_rate,true_positive_rate,color="red",label="AUC = %0.2f"%roc_auc)
plt.legend(loc="lower right")
# Diagonal reference line (true positive rate equals false positive rate).
plt.plot([0,1],[0,1],linestyle="--")
plt.axis("tight")
plt.ylabel("True Positive Rate")
plt.xlabel("False Positive Rate")
plt.show()
![](https://img.haomeiwen.com/i9248907/77a1e9a082e38639.png)
模型4:随机森林
# 建模拟合
from sklearn.ensemble import RandomForestClassifier
model_RR = RandomForestClassifier()
model_RR.fit(X_train,y_train)
![](https://img.haomeiwen.com/i9248907/376f45ca3a50e19f.png)
# 预测得分
y_prob = model_RR.predict_proba(X_test)[:,1]
y_prob
![](https://img.haomeiwen.com/i9248907/0ff4b2cc9ce9b483.png)
# Threshold the random-forest probabilities at 0.5 to obtain hard labels.
y_pred = np.where(y_prob>0.5,1,0)
# Score against the true labels; scoring against y_pred compares the model
# with itself and always looks perfect.
model_RR.score(X_test,y_test)
![](https://img.haomeiwen.com/i9248907/97f561f31f12ddf7.png)
# 混淆矩阵
metrics.confusion_matrix(y_test,y_pred)
![](https://img.haomeiwen.com/i9248907/cd495f4268211ca3.png)
print(metrics.classification_report(y_test,y_pred))
![](https://img.haomeiwen.com/i9248907/53137783655b6737.png)
metrics.roc_auc_score(y_test,y_pred)
![](https://img.haomeiwen.com/i9248907/bde60469659988d7.png)
# ROC曲线
from sklearn.metrics import roc_curve,auc
false_positive_rate,true_positive_rate,thresholds = roc_curve(y_test,y_prob)
roc_auc = auc(false_positive_rate,true_positive_rate)
roc_auc
import matplotlib.pyplot as plt
plt.figure(figsize=(10,10))
plt.title("ROC")
plt.plot(false_positive_rate,true_positive_rate,color="red",label="AUC = %0.2f"%roc_auc)
plt.legend(loc="lower right")
plt.plot([0,1],[0,1],linestyle="--")
plt.axis("tight")
plt.ylabel("True Positive Rate")
plt.xlabel("False Positive Rate")
plt.show()
![](https://img.haomeiwen.com/i9248907/1dfa5eaad525a210.png)
模型5:决策树(CART)
# Fit a decision tree classifier with default hyperparameters.
from sklearn.tree import DecisionTreeClassifier
model_tree = DecisionTreeClassifier()
model_tree.fit(X_train,y_train)
# Positive-class probabilities, thresholded at 0.5 to obtain hard labels.
y_prob = model_tree.predict_proba(X_test)[:,1]
y_pred = np.where(y_prob>0.5,1,0)
# Score against the true labels; scoring against y_pred compares the model
# with itself and always looks perfect.
model_tree.score(X_test,y_test)
![](https://img.haomeiwen.com/i9248907/46c5ecb923c9f3e4.png)
# 混淆矩阵
metrics.confusion_matrix(y_test,y_pred)
![](https://img.haomeiwen.com/i9248907/d04eb775a73e01c7.png)
print(metrics.classification_report(y_test,y_pred))
![](https://img.haomeiwen.com/i9248907/051d813073dee860.png)
metrics.roc_auc_score(y_test,y_pred)
![](https://img.haomeiwen.com/i9248907/26e312a38fe18cdc.png)
# ROC曲线
from sklearn.metrics import roc_curve,auc
false_positive_rate,true_positive_rate,thresholds = roc_curve(y_test,y_prob)
roc_auc = auc(false_positive_rate,true_positive_rate)
roc_auc
import matplotlib.pyplot as plt
plt.figure(figsize=(10,10))
plt.title("ROC")
plt.plot(false_positive_rate,true_positive_rate,color="red",label="AUC = %0.2f"%roc_auc)
plt.legend(loc="lower right")
plt.plot([0,1],[0,1],linestyle="--")
plt.axis("tight")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.show()
![](https://img.haomeiwen.com/i9248907/7b5487bbd2529c11.png)
模型6:神经网络ANN
# 建模
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier()
mlp.fit(X_train,y_train)
![](https://img.haomeiwen.com/i9248907/41f33fa018335e73.png)
# Positive-class probabilities from the default MLP, thresholded at 0.5 to
# obtain hard labels.
y_prob = mlp.predict_proba(X_test)[:,1]
y_pred = np.where(y_prob>0.5,1,0)
# Score against the true labels; scoring against y_pred compares the model
# with itself and always looks perfect.
mlp.score(X_test,y_test)
![](https://img.haomeiwen.com/i9248907/ff0845e9cf1644a2.png)
# 混淆矩阵
metrics.confusion_matrix(y_test,y_pred)
![](https://img.haomeiwen.com/i9248907/327cbf7560124d56.png)
print(metrics.classification_report(y_test,y_pred))
![](https://img.haomeiwen.com/i9248907/23fa4d4d7bbcfb78.png)
metrics.roc_auc_score(y_test,y_pred)
![](https://img.haomeiwen.com/i9248907/64e1ec87d3ca29e3.png)
# ROC曲线
from sklearn.metrics import roc_curve,auc
false_positive_rate,true_positive_rate,thresholds = roc_curve(y_test,y_prob)
roc_auc = auc(false_positive_rate,true_positive_rate)
roc_auc
import matplotlib.pyplot as plt
plt.figure(figsize=(10,10))
plt.title("ROC")
plt.plot(false_positive_rate,true_positive_rate,color="red",label="AUC = %0.2f"%roc_auc)
plt.legend(loc="lower right")
plt.plot([0,1],[0,1],linestyle="--")
plt.axis("tight")
plt.ylabel("True Positive Rate")
plt.xlabel("False Positive Rate")
plt.show()
![](https://img.haomeiwen.com/i9248907/b07ccf84932ef699.png)
# Randomized hyperparameter search for the MLP: RandomizedSearchCV samples
# 5 candidates (n_iter=5) from the value lists below and evaluates each with
# 10-fold cross-validation, selecting by accuracy.
from sklearn.neural_network import MLPClassifier
mlp_model = MLPClassifier()
tuned_parameters = {'hidden_layer_sizes':range(1,200,10),
'activation':['tanh','logistic','relu'],
'alpha':[0.0001,0.001,0.01,0.1,1,10],
'max_iter':range(50,200,50)}
model_mlp = RandomizedSearchCV(mlp_model,tuned_parameters,cv=10,scoring='accuracy',n_iter=5,n_jobs=-1,random_state=5)
model_mlp.fit(X_train,y_train)
![](https://img.haomeiwen.com/i9248907/b580fc8bbd9b6613.png)
# 模型属性
model_mlp.best_score_
![](https://img.haomeiwen.com/i9248907/fe25c4633498d6f7.png)
model_mlp.best_params_
![](https://img.haomeiwen.com/i9248907/6a56284b716fb519.png)
# Per-candidate cross-validation results of the MLP search. The original
# inspected model_svm here, a leftover from the SVM section.
model_mlp.cv_results_
![](https://img.haomeiwen.com/i9248907/51e0feef436ce838.png)
# ROC curve for the MLP selected by the randomized search.
from sklearn.metrics import roc_curve,auc
# Take probabilities from the tuned search object. Without this line y_prob
# still holds the output of the untuned `mlp`, and the plot merely repeats
# the previous ROC curve.
y_prob = model_mlp.predict_proba(X_test)[:,1]
false_positive_rate,true_positive_rate,thresholds = roc_curve(y_test,y_prob)
roc_auc = auc(false_positive_rate,true_positive_rate)
roc_auc
import matplotlib.pyplot as plt
plt.figure(figsize=(10,10))
plt.title("ROC")
plt.plot(false_positive_rate,true_positive_rate,color="red",label="AUC = %0.2f"%roc_auc)
plt.legend(loc="lower right")
# Diagonal reference line (true positive rate equals false positive rate).
plt.plot([0,1],[0,1],linestyle="--")
plt.axis("tight")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.show()
![](https://img.haomeiwen.com/i9248907/46f2aa8e6ad263bc.png)