数据预处理部分代码
2020-02-03 本文已影响0人
bokli_dw
step1. 导库
import numpy as np
import pandas as pd
step2. 导入数据集
# Load the raw table from disk into a DataFrame.
datasets = pd.read_csv('C:/Users/CCLU/Desktop/100-Days-Of-ML-Code-master/100-Days-Of-ML-Code-master/datasets/Data.csv')

# Features are every column except the last one.
X = datasets.iloc[:, :-1].values
# Target is the last column. Using -1 (instead of the hard-coded index 3)
# keeps this slice consistent with the feature slice above, so the target
# can never also appear inside X if the CSV has a different column count.
Y = datasets.iloc[:, -1].values

print(X)
print(Y)
print('\n')
step3. handle the missing data
# `sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22;
# `sklearn.impute.SimpleImputer` is its replacement. It always imputes
# column-wise, which matches the old `axis=0` setting.
from sklearn.impute import SimpleImputer

# Missing entries are real NaN values (not the string "NaN"), and each one is
# replaced by the mean of its column.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Fit on columns 1 and 2 of X, then write the filled values back in place.
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])
print(X)
step4. Encoding categorical data
from sklearn.preprocessing import LabelEncoder,OneHotEncoder

# Map the categorical values in column 0 of X to integer codes.
labelencoder_x = LabelEncoder()
X[:, 0] = labelencoder_x.fit_transform(X[:, 0])

# The `categorical_features` argument of OneHotEncoder was removed in
# scikit-learn 0.22, so column 0 is one-hot encoded on its own and then
# stacked in front of the remaining columns. This reproduces the old layout:
# dummy columns first, untouched columns after.
onehotencoder = OneHotEncoder()
encoded_first_column = onehotencoder.fit_transform(X[:, [0]]).toarray()
# NOTE(review): assumes columns 1.. of X are already numeric (e.g. after
# imputation) so they can be cast to float — confirm against the dataset.
X = np.hstack((encoded_first_column, X[:, 1:].astype(float)))

# Encode the target labels as integers.
labelencoder_y = LabelEncoder()
Y = labelencoder_y.fit_transform(Y)

print(X)
print(Y)
# Printing the arrays shows that every value in the table has been converted
# to numeric form.
print('\n')
step5. 划分训练集和测试集
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing.
# random_state fixes the shuffle so every run produces the same train/test
# split; otherwise the same model would be trained and evaluated on different
# subsets each time and its results would not be comparable between runs.
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size = 0.3,random_state = 0)
print(x_train)
print(y_train)
print('\n')
step6. 特征缩放
from sklearn.preprocessing import StandardScaler

# Standardize the features. The scaler's statistics are learned from the
# training rows only and then applied unchanged to both splits, so the test
# split is scaled with the training split's parameters.
sc_X = StandardScaler()
sc_X.fit(x_train)
x_train = sc_X.transform(x_train)
x_test = sc_X.transform(x_test)

# Show the scaled training split first, then the scaled test split.
for scaled_split in (x_train, x_test):
    print(scaled_split)