1. Data preprocessing

2019-11-10  本文已影响0人  Hack_J

Shown below is a Data Preprocessing Template

    # -*- coding: utf-8 -*-
"""
Data preprocessing template.

Loads ``Data.csv``, fills in missing values, encodes the categorical
data, splits the samples into a training set and a test set, and
standardises the features.

(Started life as a temporary script file created by the Spyder editor.)
"""

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 


# Import the dataset.
# X holds every column except the last one (the features); y holds the last
# column (the target).
dataset = pd.read_csv('Data.csv')
X = dataset.iloc[:, :-1].values
# Select the target with a scalar index (-1) rather than a slice (-1:), so y
# is a 1-D vector. A slice yields an (n, 1) column, which makes LabelEncoder
# emit a DataConversionWarning.
y = dataset.iloc[:, -1].values

# Handle missing data: replace every NaN in feature columns 1 and 2 with the
# mean of its column (assumes those two columns are numeric -- TODO confirm
# against Data.csv).
# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22; its
# replacement, SimpleImputer, always works column-wise (the old axis=0) and
# takes the np.nan sentinel instead of the string "NaN".
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

# Handle categorical variables.
# Feature column 0 is one-hot encoded; all other feature columns are passed
# through unchanged and placed after the dummy columns.
# OneHotEncoder(categorical_features=...) was removed in scikit-learn 0.22;
# ColumnTransformer is the supported way to encode a subset of columns.
# Current OneHotEncoder versions accept string categories directly, so the
# former LabelEncoder pass over X[:, 0] is no longer needed.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
columnTransformer = ColumnTransformer(
    [("one_hot", OneHotEncoder(), [0])],
    remainder="passthrough",
    sparse_threshold=0,  # always return a dense array, like the old .toarray()
)
# The passthrough columns come from an object array, so cast explicitly to
# get the all-float matrix the previous encoder produced.
X = np.array(columnTransformer.fit_transform(X), dtype=float)
# Keep the fitted encoder reachable under its original name.
oneHotEncoder = columnTransformer.named_transformers_["one_hot"]

# Encode the target labels as integers. np.ravel accepts either a 1-D vector
# or an (n, 1) column, and LabelEncoder expects 1-D input.
labelEncoder_y = LabelEncoder()
y = labelEncoder_y.fit_transform(np.ravel(y))

# Split the dataset into a training set and a test set.
# test_size=0.2 holds out 20% of the samples for testing; the fixed
# random_state makes the split reproducible between runs.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)


# Feature scaling (standardisation or normalisation).
#   Standardisation: x' = (x - mean(x)) / std(x)
#     i.e. the deviation from the mean divided by the standard deviation.
#     The standard deviation is the square root of the variance, and the
#     variance is the mean of the squared deviations from the mean.
#   Normalisation:   x' = (x - min(x)) / (max(x) - min(x))
# Standardisation is used here. The scaler is fitted on the training set
# only, and the same fitted scaler is then applied to the test set.
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler().fit(x_train)
x_train = sc_X.transform(x_train)
x_test = sc_X.transform(x_test)
上一篇 下一篇

猜你喜欢

热点阅读