学习笔记:sklearn-决策树

2021-06-16  本文已影响0人  zeolite

分类树
数据准备

# Load the wine dataset and train an entropy-based decision tree classifier.
from sklearn import tree
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split

wine = load_wine()

# Hold out 20% of the samples as a test split.
X_train, X_test, Y_train, Y_test = train_test_split(
    wine.data, wine.target, test_size=0.2
)

clf = tree.DecisionTreeClassifier(criterion='entropy')
clf.fit(X_train, Y_train)          # fit() returns self, so rebinding is unnecessary
score = clf.score(X_test, Y_test)  # mean accuracy on the held-out split
pred = clf.predict(X_test)         # predicted class labels for the test split

查看特征名称

wine.feature_names  # list of the feature (column) names

查看类别名称

wine.target_names  # names of the target classes

查看特征重要性

clf.feature_importances_  # impurity-based importance of each feature (sums to 1)

查看特征对应的重要性

[*zip(wine.feature_names, clf.feature_importances_)]  # (feature name, importance) pairs

决策树随机分支 splitter = best/random
最大深度 max_depth
节点可再划分所需的最少样本数 min_samples_split
叶节点所需的最少样本数 min_samples_leaf
限制特征个数 max_features

# DecisionTreeClassifier with the pruning / regularization parameters listed above.
clf=tree.DecisionTreeClassifier(criterion='entropy',  # split quality: information gain
                                random_state=0,       # reproducible randomness
                                splitter='random',    # choose among random candidate splits
                                max_depth=3,          # cap the depth of the tree
                                min_samples_leaf=2,   # every leaf must keep >= 2 samples
                                min_samples_split=3,  # a node needs >= 3 samples to split
                                max_features=10       # consider at most 10 features per split
                               )

回归树
交叉验证 cross_val_score
负的均方误差 -MSE neg_mean_squared_error

# Regression tree on the Boston housing data, evaluated with 10-fold CV.
# NOTE(review): load_boston was deprecated in sklearn 1.0 and removed in 1.2;
# on modern versions use fetch_california_housing instead.
from sklearn.tree import DecisionTreeRegressor
from sklearn.datasets import load_boston
from sklearn.model_selection import cross_val_score

boston=load_boston()

regr=DecisionTreeRegressor(random_state=0)
# scoring='neg_mean_squared_error' returns -MSE (sklearn convention: higher is better).
score=cross_val_score(regr,boston.data, boston.target, cv=10, scoring='neg_mean_squared_error')
score.mean()  # average -MSE over the 10 folds

GridSearchCV使用 泰坦尼克号数据

# Titanic survival: preprocess the data and tune a decision tree with GridSearchCV.
import numpy as np   # fix: np.linspace is used below but numpy was never imported
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
# fix: train_test_split is used below but was missing from this snippet's imports
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split

# index_col=0 makes the first column (PassengerId) the index.
data=pd.read_csv(r'./train.csv', index_col=0)

# Drop text columns that are mostly unique or mostly missing.
data.drop(['Cabin','Name','Ticket'], inplace=True, axis=1)

# Impute missing ages with the column mean.
data['Age']=data['Age'].fillna(data['Age'].mean())

# Encode sex as 1 (male) / 0 (female).
data['Sex']=(data['Sex']=='male').astype('int')

# Ordinal-encode the embarkation port by its position in the unique-value list.
# NOTE(review): 'Embarked' contains NaN in the standard Titanic train.csv; this
# relies on list.index finding the identical NaN object — consider fillna first.
labels=data['Embarked'].unique().tolist()
data['Embarked']=data['Embarked'].apply(lambda x:labels.index(x))

X=data.drop(columns='Survived')
y=data['Survived']

X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2)

clf=DecisionTreeClassifier(random_state=0)

# Hyper-parameter grid searched with 10-fold CV on the training split.
params={'splitter':('best', 'random'),
       'criterion':('gini', 'entropy'),
       'max_depth':[*range(1,10)],
       'min_samples_leaf':[*range(1,50,5)],
       'min_impurity_decrease':[*np.linspace(0,0.5,20)]}

GS=GridSearchCV(clf, params, cv=10)
GS.fit(X_train, y_train)

查看参数

GS.best_params_  # best parameter combination found by the grid search

查看分数

GS.best_score_  # mean cross-validated score of best_params_

交叉验证

# Re-check one chosen configuration with 10-fold cross-validation.
clf=DecisionTreeClassifier(random_state=0, criterion='entropy', max_depth=5, min_samples_leaf=1, splitter='random')
score=cross_val_score(clf,X_train, y_train, cv=10)
score.mean()  # mean accuracy across the 10 folds

随机森林分类

# Random forest classification on the wine data.
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_wine
from sklearn.model_selection import cross_val_score

wine=load_wine()

# 25 trees; no random_state is set, so results vary between runs.
rfc=RandomForestClassifier(n_estimators=25)
rfc_s=cross_val_score(rfc, wine.data, wine.target, cv=10)
rfc_s.mean()  # mean 10-fold accuracy

查看随机森林中树的参数

# Fix: cross_val_score fits internal clones, so `rfc` itself is still unfitted
# and accessing `estimators_` would raise NotFittedError/AttributeError.
# Fit the forest first, then inspect the fitted sub-estimators.
rfc=rfc.fit(wine.data, wine.target)
rfc.estimators_     # list of the fitted DecisionTreeClassifier objects
rfc.estimators_[0]  # first tree in the forest

bootstrap默认True 采用有放回随机抽样技术
oob_score=True 使用袋外数据进行模型测试

# oob_score=True scores the model on the out-of-bag samples each bootstrap leaves out.
rfc=RandomForestClassifier(n_estimators=25, oob_score=True)
rfc=rfc.fit(wine.data, wine.target)
rfc.oob_score_  # out-of-bag accuracy estimate (no separate test split needed)

查看重要特征

[*zip(wine.feature_names, rfc.feature_importances_)]  # (feature name, importance) pairs

查看样本概率

rfc.predict_proba(wine.data)  # per-class probability for every sample

随机森林回归

# Random forest regression with 10-fold cross-validation.
# NOTE(review): load_boston was removed in sklearn 1.2; use fetch_california_housing
# on modern versions.
from sklearn.datasets import load_boston
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

boston=load_boston()
regressor=RandomForestRegressor(n_estimators=50, random_state=0)
# Default scoring for a regressor is R^2.
cross_val_score(regressor, boston.data, boston.target, cv=10)

SimpleImputer填充nan值

# Fill NaN values with SimpleImputer.
import numpy as np   # fix: np.nan is used below but numpy was never imported here
from sklearn.impute import SimpleImputer

# NOTE(review): X_missing is not defined in this note — it stands for any
# feature matrix containing NaNs.
# Replace each NaN with its column mean.
SimpleImputer(missing_values=np.nan, strategy='mean').fit_transform(X_missing)

# Replace each NaN with the constant 0.
SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0).fit_transform(X_missing)

随机森林分类 GridSearch使用

# Grid-search a random forest on the breast-cancer dataset.
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import pandas as pd
import numpy as np

data=load_breast_cancer()

rfc=RandomForestClassifier(n_estimators=50, random_state=10)
# NOTE(review): this grid has 2*9*10*8*5*25 = 180,000 combinations, each fit
# 10 times by cv=10 — impractically slow; tune one or two parameters at a time.
param_grid={'criterion':['gini', 'entropy'],
           'max_depth':np.arange(1,10,1),
           'n_estimators': np.arange(1,100,10),  # overrides the n_estimators=50 above
           'min_samples_leaf':np.arange(2,10,1),
           'min_samples_split':np.arange(2,7,1),
           'max_leaf_nodes':np.arange(25,50,1)}
GS=GridSearchCV(rfc, param_grid, cv=10)
GS.fit(data.data, data.target)

GS.best_params_  # best parameter combination found

GS.best_score_   # its mean cross-validated score

上一篇 下一篇

猜你喜欢

热点阅读