Data Analysis Tools -- Python Series

sklearn Model Evaluation -- Cross-validation

2019-03-05  粉红狐狸_dhf

Cross-validation: evaluating estimator performance

Source: https://scikit-learn.org/stable/modules/cross_validation.html

import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import svm, datasets
from sklearn import preprocessing

'''Official documentation: https://scikit-learn.org/stable/modules/cross_validation.html'''

iris = datasets.load_iris()
iris.data.shape, iris.target.shape
# ((150, 4), (150,))

'''Cross-validation -- Method 1: train_test_split'''

X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.4, random_state=0)
X_train.shape, y_train.shape
# ((90, 4), (90,))
X_test.shape, y_test.shape
# ((60, 4), (60,))
clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)
clf.score(X_test, y_test)

'''The simplest way to use cross-validation is to call the cross_val_score helper function on the estimator and the dataset.'''

# Cross-validation -- Method 2: cross_val_score -- feed in the full dataset; the number of folds is set via cv

clf = svm.SVC(kernel='linear', C=1)
scores = cross_val_score(clf, iris.data, iris.target, cv=5, scoring='f1_macro')
# scoring defaults to None, which uses the estimator's own .score method (accuracy for classifiers)
print(scores)
print("F1 macro: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
# F1 macro: 0.98 (+/- 0.03)
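To show what the default gives you, a minimal sketch reusing the clf defined above: with no scoring argument, cross_val_score falls back to the estimator's .score method, i.e. mean accuracy for a classifier.

scores_default = cross_val_score(clf, iris.data, iris.target, cv=5)  # no scoring -> classifier accuracy
print("Accuracy: %0.2f (+/- %0.2f)" % (scores_default.mean(), scores_default.std() * 2))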

'''Standardizing the training and test sets consistently: fit the scaler on the training set only, then transform both'''

X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.4, random_state=0)
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_transformed = scaler.transform(X_train)
clf = svm.SVC(C=1).fit(X_train_transformed, y_train)
X_test_transformed = scaler.transform(X_test)
clf.score(X_test_transformed, y_test)  # 0.9333...

'''Use a Pipeline to chain the standardization and the estimator, so scaling happens inside cross-validation'''

from sklearn.pipeline import make_pipeline
clf = make_pipeline(preprocessing.StandardScaler(), svm.SVC(C=1))
cross_val_score(clf, iris.data, iris.target, cv=5)

#-----------cross_validate----------

'''The cross_validate function and multiple metric evaluation: unlike cross_val_score, it allows evaluating several metrics at once'''

from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score
scoring = ['precision_macro', 'recall_macro']
clf = svm.SVC(kernel='linear', C=1, random_state=0)
scores = cross_validate(clf, iris.data, iris.target, scoring=scoring, cv=5, return_train_score=False)
sorted(scores.keys())
# ['fit_time', 'score_time', 'test_precision_macro', 'test_recall_macro']
print(scores['test_recall_macro'])
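scoring can also be a dict mapping scorer names to scorers, which is handy when a metric needs extra arguments; a minimal sketch using make_scorer (the key names 'prec_macro' and 'rec_macro' are arbitrary labels chosen for illustration):

from sklearn.metrics import make_scorer
scoring_dict = {'prec_macro': 'precision_macro',
                'rec_macro': make_scorer(recall_score, average='macro')}
scores2 = cross_validate(clf, iris.data, iris.target, scoring=scoring_dict, cv=5)
sorted(scores2.keys())
# ['fit_time', 'score_time', 'test_prec_macro', 'test_rec_macro']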

#--------Cross validation iterators------------

'''Generate dataset splits according to different cross-validation strategies. The iterators below assume i.i.d. data.'''

'''1. KFold -- divides the samples into k folds of equal size (if possible); note that KFold is not affected by classes or groups'''

from sklearn.model_selection import KFold
X = ["a", "b", "c", "d"]
kf = KFold(n_splits=2)
for train, test in kf.split(X):
    print("%s %s" % (train, test))

'''2. RepeatedKFold -- repeats K-Fold n times with different randomization in each repetition'''

from sklearn.model_selection import RepeatedKFold
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
random_state = 12883823
rkf = RepeatedKFold(n_splits=2, n_repeats=2, random_state=random_state)
for train, test in rkf.split(X):
    print("%s %s" % (train, test))

'''3. LeaveOneOut (LOO) -- each learning set is created by taking all the samples except one, the test set being the sample left out'''

'''As a general rule, most authors and empirical evidence suggest that 5- or 10-fold cross-validation should be preferred to LOO.'''

from sklearn.model_selection import LeaveOneOut
X = [1, 2, 3, 4]
loo = LeaveOneOut()
for train, test in loo.split(X):
    print("%s %s" % (train, test))

'''Cross-validation iterators with stratification based on class labels -- for imbalanced class distributions'''

'''1. StratifiedKFold is a variation of k-fold which returns stratified folds: each set contains approximately the same percentage of samples of each target class as the complete set.'''

from sklearn.model_selection import StratifiedKFold
X = np.ones(10)
y = [0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
skf = StratifiedKFold(n_splits=3)
for train, test in skf.split(X, y):
    print("%s %s" % (train, test))

# RepeatedStratifiedKFold can be used to repeat Stratified K-Fold n times with different randomization in each repetition.
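A minimal sketch of RepeatedStratifiedKFold, mirroring the RepeatedKFold example above (the random_state value here is arbitrary):

from sklearn.model_selection import RepeatedStratifiedKFold
rskf = RepeatedStratifiedKFold(n_splits=3, n_repeats=2, random_state=42)
for train, test in rskf.split(X, y):  # X, y from the StratifiedKFold example above
    print("%s %s" % (train, test))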

'''Cross-validation iterators for grouped data'''

'''GroupKFold is a variation of k-fold which ensures that the same group is not represented in both testing and training sets.'''

from sklearn.model_selection import GroupKFold
X = [0.1, 0.2, 2.2, 2.4, 2.3, 4.55, 5.8, 8.8, 9, 10]
y = ["a", "b", "b", "b", "c", "c", "c", "d", "d", "d"]
groups = [1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
gkf = GroupKFold(n_splits=3)
for train, test in gkf.split(X, y, groups=groups):
    print("%s %s" % (train, test))
