# sklearn model evaluation -- Cross-validation
# Cross-validation: evaluating estimator performance
# Source: https://scikit-learn.org/stable/modules/cross_validation.html
import numpy as np

from sklearn import datasets, preprocessing, svm
from sklearn.model_selection import cross_val_score, train_test_split

# Official documentation: https://scikit-learn.org/stable/modules/cross_validation.html
# Load the iris dataset: 150 samples, 4 features, 3 classes.
iris = datasets.load_iris()
# iris.data.shape == (150, 4); iris.target.shape == (150,)

# --- Cross-validation, method 1: a single train/test split ---
# Hold out 40% of the data for testing; random_state fixes the shuffle
# so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.4, random_state=0)
# X_train.shape == (90, 4); y_train.shape == (90,)
# X_test.shape == (60, 4); y_test.shape == (60,)

# Fit a linear SVM on the training split and report accuracy on the held-out split.
clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)
print(clf.score(X_test, y_test))
# --- Cross-validation, method 2: cross_val_score ---
# "The simplest way to use cross-validation is to call the cross_val_score
# helper function on the estimator and the dataset."
# Pass the whole dataset; `cv=5` performs the 5-fold splitting internally.
# (If `scoring` is omitted, the estimator's default .score metric is used.)
clf = svm.SVC(kernel='linear', C=1)
scores = cross_val_score(clf, iris.data, iris.target, cv=5, scoring='f1_macro')
print(scores)
fold_mean = scores.mean()
fold_spread = scores.std() * 2
print("Accuracy: %0.2f (+/- %0.2f)" % (fold_mean, fold_spread))
# Accuracy: 0.98 (+/- 0.03)
# --- Standardize both the training and the test split ---
# The scaler is fitted on X_train only and then reused on X_test, so no
# test-set statistics leak into preprocessing.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.4, random_state=0)
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_transformed = scaler.transform(X_train)
clf = svm.SVC(C=1).fit(X_train_transformed, y_train)
X_test_transformed = scaler.transform(X_test)
print(clf.score(X_test_transformed, y_test))  # ~0.9333
# --- Pipeline: combine standardization with the estimator so the scaler is
# re-fitted inside every CV fold (no train/test leakage during CV) ---
from sklearn.pipeline import make_pipeline

clf = make_pipeline(preprocessing.StandardScaler(), svm.SVC(C=1))
print(cross_val_score(clf, iris.data, iris.target, cv=5))
# ----------- cross_validate -----------
# Unlike cross_val_score, cross_validate can evaluate multiple metrics at
# once and also reports fit/score timing.
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score  # imported in the original example; unused here

scoring = ['precision_macro', 'recall_macro']
clf = svm.SVC(kernel='linear', C=1, random_state=0)
scores = cross_validate(clf, iris.data, iris.target,
                        scoring=scoring, cv=5, return_train_score=False)
print(sorted(scores.keys()))
# ['fit_time', 'score_time', 'test_precision_macro', 'test_recall_macro']
print(scores['test_recall_macro'])
# -------- Cross-validation iterators --------
# Strategies for generating dataset splits; these assume i.i.d. data.

# 1. KFold: splits into k consecutive folds of (nearly) equal size.
#    Note that KFold is not affected by class labels or groups.
from sklearn.model_selection import KFold

X = ["a", "b", "c", "d"]
kf = KFold(n_splits=2)
for train, test in kf.split(X):
    print("%s %s" % (train, test))
# 2. RepeatedKFold: repeats K-Fold n times with different randomization
#    in each repetition.
from sklearn.model_selection import RepeatedKFold

X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
random_state = 12883823
rkf = RepeatedKFold(n_splits=2, n_repeats=2, random_state=random_state)
for train, test in rkf.split(X):
    print("%s %s" % (train, test))
# 3. LeaveOneOut (LOO): each learning set is created by taking all the
#    samples except one, the test set being the sample left out.
#    As a general rule, most authors and empirical evidence suggest that
#    5- or 10-fold cross-validation should be preferred over LOO.
from sklearn.model_selection import LeaveOneOut

X = [1, 2, 3, 4]
loo = LeaveOneOut()
for train, test in loo.split(X):
    print("%s %s" % (train, test))
# --- Iterators with stratification based on class labels (for imbalanced data) ---
# 1. StratifiedKFold: a K-Fold variant that returns stratified folds — each
#    fold preserves (approximately) the class proportions of the full set.
from sklearn.model_selection import StratifiedKFold

X = np.ones(10)
y = [0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
skf = StratifiedKFold(n_splits=3)
for train, test in skf.split(X, y):
    print("%s %s" % (train, test))
# RepeatedStratifiedKFold can be used to repeat Stratified K-Fold n times
# with different randomization in each repetition.
# --- Cross-validation iterators for grouped data ---
# GroupKFold: a K-Fold variant that ensures the same group is never
# represented in both the testing and the training set.
from sklearn.model_selection import GroupKFold

X = [0.1, 0.2, 2.2, 2.4, 2.3, 4.55, 5.8, 8.8, 9, 10]
y = ["a", "b", "b", "b", "c", "c", "c", "d", "d", "d"]
groups = [1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
gkf = GroupKFold(n_splits=3)
for train, test in gkf.split(X, y, groups=groups):
    print("%s %s" % (train, test))