06基于随机森林模型的心脏病患者预测分类

2022-12-31 本文已影响0人 Jachin111

导入库

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import roc_curve,auc
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import eli5
from eli5.sklearn import PermutationImportance
import shap
from pdpbox import pdp,info_plots

np.random.seed(123)
pd.options.mode.chained_assignment = None

import warnings
warnings.filterwarnings("ignore")

数据探索EDA

# 导入数据
df = pd.read_excel("heart.xlsx")
df.head()

image.png

# 缺失值情况
df.isnull().sum()

image.png

字段转化

# 转化编码
df["sex"][df["sex"]==0] = "female"
df["sex"][df["sex"]==1] = "male"

df["cp"][df["cp"]==1] = "typical angina"
df["cp"][df["cp"]==2] = "atypical angina"
df["cp"][df["cp"]==3] = "non-anginal pain"
df["cp"][df["cp"]==4] = "asymptomatic"

df["fbs"][df["fbs"]==0] = "lower than 120mg/ml"
df["fbs"][df["fbs"]==1] = "greater than 120mg/ml"

df["restecg"][df["restecg"]==0] = "normal"
df["restecg"][df["restecg"]==1] = "ST-T wave abnormality"
df["restecg"][df["restecg"]==2] = "left ventricular hypertrophy"

df["exang"][df["exang"]==0] = "no"
df["exang"][df["exang"]==1] = "yes"

df["slope"][df["slope"]==1] = "upsloping"
df["slope"][df["slope"]==2] = "flat"
df["slope"][df["slope"]==3] = "downsloping"

df["thal"][df["thal"]==1] = "normal"
df["thal"][df["thal"]==2] = "fixed defect"
df["thal"][df["thal"]==3] = "reversable defect"

# 字段类型转化
df["sex"] = df["sex"].astype("object")
df["cp"] = df["cp"].astype("object")
df["fbs"] = df["fbs"].astype("object")
df["restecg"] = df["restecg"].astype("object")
df["exang"] = df["exang"].astype("object")
df["slope"] = df["slope"].astype("object")
df["thal"] = df["thal"].astype("object")

# 生成哑变量
df = pd.get_dummies(df,drop_first=True)
df

image.png

随机森林RandomForest

# 切分数据
X = df.drop("target",1)
y = df["target"]

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=10)
X_train

image.png

# 建模
rf = RandomForestClassifier(max_depth=5)
rf.fit(X_train,y_train)

image.png
3个重要属性
查看森林中树的状况：estimators_
袋外估计准确率得分：oob_score_，必须是oob_score参数选择True的时候才可用
变量的重要性：feature_importances_

# 决策树可视化
estimator = rf.estimators_[1]
feature_names = [i for i in X_train.columns]

y_train_str = y_train.astype("str")
y_train_str[y_train_str=="0"] = "no disease"
y_train_str[y_train_str=="1"] = "disease"
y_train_str = y_train_str.values
y_train_str[:5]

image.png

export_graphviz(
    estimator,   
    out_file='tree.dot',   
    feature_names = feature_names,  
    class_names = y_train_str,  
    rounded = True, 
    proportion = True, 
    label='root',
    precision = 2, 
    filled = True
)

from subprocess import call
call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])

from IPython.display import Image
Image(filename = 'tree.png')

# 模型得分验证
y_predict = rf.predict(X_test)
y_pred_quant = rf.predict_proba(X_test)[:,1]
y_pred_bin = rf.predict(X_test)

confusion_matrix = confusion_matrix(y_test,y_pred_bin)
confusion_matrix

image.png

total = sum(sum(confusion_matrix))
sensitivity = confusion_matrix[0,0]/(confusion_matrix[0,0]+confusion_matrix[1,0])
print("Sensitivity: ",sensitivity)

image.png

specificity = confusion_matrix[1,1]/(confusion_matrix[1,1]+confusion_matrix[0,1])
print("Specificity: ",specificity)

image.png

# 绘制ROC曲线
fpr, tpr, thresholds = roc_curve(y_test,y_pred_quant)

fig, ax = plt.subplots()
ax.plot(fpr,tpr)
ax.plot([0,1],[0,1],transform=ax.transAxes,ls="--",c=".3")
plt.xlim([0.0,1.0])
plt.ylim([0.0,1.0])
plt.rcParams["font.size"] = 12
plt.title("ROC Curve")
plt.xlabel("False Positive Rate (1 - Specificity)")
plt.ylabel("True Positive Rate (Sensitivity)")
plt.grid(True)

image.png

auc(fpr,tpr)

image.png
常见的评价指标：
1、ACC：classification accuracy，描述分类器的分类准确率,计算公式为：ACC=(TP+TN)/(TP+FP+FN+TN)
2、BER：balanced error rate 计算公式为：BER=1/2*(FPR+FN/(FN+TP))
3、TPR：true positive rate，描述识别出的所有正例占所有正例的比例计算公式为：TPR=TP / (TP+ FN)
4、FPR：false positive rate，描述将负例识别为正例的情况占所有负例的比例计算公式为：FPR= FP / (FP + TN)
5、TNR：true negative rate，描述识别出的负例占所有负例的比例计算公式为：TNR= TN / (FP + TN)
6、PPV：Positive predictive value计算公式为：PPV=TP / (TP + FP)
7、NPV：Negative predictive value计算公式：NPV=TN / (FN + TN)
其中TPR即为敏感度（sensitivity），TNR即为特异度（specificity）

可解释性

# 排列重要性-Permutation Importance
perm = PermutationImportance(rf,random_state=10).fit(X_test,y_test)
perm

image.png

eli5.show_weights(perm,feature_names=X_test.columns.tolist())

image.png

部分依赖图(Partial dependence plots,PDP)

Partial Dependence就是用来解释某个特征和目标值y的关系的，一般是通过画出Partial Dependence Plot(PDP)来体现。也就是说PDP在X1的值，就是把训练集中第一个变量换成X1之后，原模型预测出来的平均值。

# 字段ca
base_features = df.columns.values.tolist()
base_features.remove("target")

feat_name = "ca"
pdp_dist = pdp.pdp_isolate(model=rf,dataset=X_test,model_features=base_features,feature=feat_name)
pdp.pdp_plot(pdp_dist,feat_name)

plt.show()

image.png

# 字段age
feat_name = "age"

pdp_dist = pdp.pdp_isolate(model=rf,dataset=X_test,model_features=base_features,feature=feat_name)
pdp.pdp_plot(pdp_dist,feat_name)

plt.show()

image.png

# 字段oldpeak
feat_name = "oldpeak"

pdp_dist = pdp.pdp_isolate(model=rf,dataset=X_test,model_features=base_features,feature=feat_name)
pdp.pdp_plot(pdp_dist,feat_name)

plt.show()

image.png

# 2D-PDP图
# 查看的是 slope_upsloping 、slope_flat和 oldpeak的关系：
inter1 = pdp.pdp_interact(model=rf,dataset=X_test,model_features=base_features,features=["slope_upsloping","oldpeak"])
pdp.pdp_interact_plot(pdp_interact_out=inter1,feature_names=["slope_upsloping","oldpeak"],plot_type="contour")

plt.show()

image.png

inter1 = pdp.pdp_interact(model=rf,dataset=X_test,model_features=base_features,features=["slope_flat","oldpeak"])
pdp.pdp_interact_plot(pdp_interact_out=inter1,feature_names=["slope_flat","oldpeak"],plot_type="contour")

plt.show()

image.png

SHAP可视化

# Explainer
explainer = shap.TreeExplainer(rf)
shap_values = explainer.shap_values(X_test)
shap_values

image.png

# Feature Importance
shap.summary_plot(shap_values[1],X_test,plot_type="bar")

image.png

# summary_plot
shap.summary_plot(shap_values[1],X_test)

image.png

# 个体差异
# 查看单个病人的不同特征属性对其结果的影响
def heart_disease_risk_factors(model,patient):
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(patient)
    shap.initjs()
    return shap.force_plot(explainer.expected_value[1],shap_values[1],patient)

data_for_prediction = X_test.iloc[1,:].astype(float)
heart_disease_risk_factors(rf,data_for_prediction)

image.png

data_for_prediction = X_test.iloc[3,:].astype(float)
heart_disease_risk_factors(rf,data_for_prediction)

image.png

# dependence_plot
# 为了理解单个feature如何影响模型的输出，我们可以将该feature的SHAP值与数据集中所有样本的feature值进行比较：
ax2 = fig.add_subplot(224)
shap.dependence_plot("ca",shap_values[1],X_test,interaction_index="oldpeak")

image.png

# 多样本可视化探索
shap_values = explainer.shap_values(X_train.iloc[:50])
shap.force_plot(explainer.expected_value[1],shap_values[1],X_test.iloc[:50])

image.png

06基于随机森林模型的心脏病患者预测分类

导入库

数据探索EDA

字段转化

随机森林RandomForest

可解释性

部分依赖图(Partial dependence plots,PDP)

SHAP可视化

猜你喜欢

热点阅读