sklearn常用函数
2018-11-08 本文已影响0人
DDDDavid
数据标准化
# Import StandardScaler from sklearn.preprocessing.
from sklearn.preprocessing import StandardScaler

# Standardize every feature dimension to zero mean and unit variance so that
# features with large magnitudes do not dominate the predictions.
# The scaling statistics are learned from the training data only and then
# reused unchanged for the test data.
ss = StandardScaler().fit(X_train)
X_train = ss.transform(X_train)
X_test = ss.transform(X_test)
数据分割函数
# train_test_split lives in sklearn.model_selection; the former
# sklearn.cross_validation module was deprecated in scikit-learn 0.18 and
# removed in 0.20, so importing from it fails on current releases.
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as the test set (test_size=0.2); the remainder
# becomes the training set.
x_train, x_test, y_train, y_test = train_test_split(x_array, y_list, test_size=0.2)
模型训练、预测、计算准确率
from sklearn.linear_model import SGDClassifier

# Linear classifier trained with stochastic gradient descent, using the
# 'log' loss and an L1 penalty.
# NOTE(review): scikit-learn 1.1 renamed this loss to 'log_loss' and 1.3
# removed the 'log' spelling -- confirm against the installed version.
lr = SGDClassifier(penalty='l1', loss='log')

# Train the model on the training split.
lr.fit(x_train, y_train)

# Predict labels for the test split.
lr_pre = lr.predict(x_test)

# Compute the accuracy on the test split and print it with six decimals.
accuracy = lr.score(x_test, y_test)
print('Test Accuracy: %.6f' % accuracy)
计算精确率、召回率、F1分
# Import the classification_report function from sklearn.metrics.
from sklearn.metrics import classification_report

# Build the text report with precision, recall and F1-score for the
# predictions made above. target_names is optional.
# NOTE(review): target_names are matched positionally to the label order used
# by the report -- confirm that this descending list matches the actual labels.
report = classification_report(
    y_test,
    lr_pre,
    target_names=['5', '4', '3', '2', '1'],
)
print(report)
Pipelines
# Example interactive session: build a Pipeline from a list of (name, estimator)
# steps -- a PCA step named 'reduce_dim' followed by an SVC step named 'clf'.
# The lines after `>>> pipe` are the echoed repr of the pipeline, abbreviated
# with '...'.
>>> from sklearn.pipeline import Pipeline
>>> from sklearn.svm import SVC
>>> from sklearn.decomposition import PCA
>>> estimators = [('reduce_dim', PCA()), ('clf', SVC())]
>>> pipe = Pipeline(estimators)
>>> pipe
Pipeline(memory=None,
steps=[('reduce_dim', PCA(copy=True,...)),
('clf', SVC(C=1.0,...))])