PyTorch Deep Learning in Practice 3 - Multilayer Perceptron - HR Dataset Analysis

2023-03-10  薛东弗斯

https://zhuanlan.zhihu.com/p/64781896

1. Dataset Preprocessing

import torch
import pandas as pd
from torch import nn
import numpy as np
import matplotlib.pyplot as plt
# %matplotlib inline  # Magics start with %; this one renders plots directly in the Jupyter notebook page. When porting to PyCharm, use plt.show() instead.

# 1. Data preprocessing
data = pd.read_csv('./HR.csv')
# print(data.info())  # show the columns and their dtypes
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 14999 entries, 0 to 14998
# Data columns (total 10 columns):
#  #   Column                 Non-Null Count  Dtype
# ---  ------                 --------------  -----
#  0   satisfaction_level     14999 non-null  float64
#  1   last_evaluation        14999 non-null  float64
#  2   number_project         14999 non-null  int64
#  3   average_montly_hours   14999 non-null  int64
#  4   time_spend_company     14999 non-null  int64
#  5   Work_accident          14999 non-null  int64
#  6   left                   14999 non-null  int64
#  7   promotion_last_5years  14999 non-null  int64
#  8   part                   14999 non-null  object    # object means a Python object, i.e. text
#  9   salary                 14999 non-null  object
# dtypes: float64(2), int64(6), object(2)
# memory usage: 1.1+ MB
# None

# print(data.part.unique())   # show the distinct departments
# ['sales' 'accounting' 'hr' 'technical' 'support' 'management' 'IT' 'product_mng' 'marketing' 'RandD']
# object columns hold text and must be converted to numeric form.

# print(data.salary.unique())   # show the distinct salary levels
# ['low' 'medium' 'high']

# print(data.groupby(['salary','part']).size())   # group by salary first, then by part (department), and count the rows
# salary  part
# high    IT               83
#         RandD            51
#         accounting       74
#         hr               45
#         management      225
#         marketing        80
#         product_mng      68
#         sales           269
#         support         141
#         technical       201
# low     IT              609
#         RandD           364
#         accounting      358
#         hr              335
#         management      180
#         marketing       402
#         product_mng     451
#         sales          2099
#         support        1146
#         technical      1372
# medium  IT              535
#         RandD           372
#         accounting      335
#         hr              359
#         management      225
#         marketing       376
#         product_mng     383
#         sales          1772
#         support         942
#         technical      1147
# dtype: int64

# pd.get_dummies() converts an object column into one-hot encoded columns
# print(pd.get_dummies(data.salary))
#        high  low  medium
# 0         0    1       0
# 1         0    0       1
# 2         0    0       1
# 3         0    1       0
# 4         0    1       0
# ...     ...  ...     ...
# 14994     0    1       0
# 14995     0    1       0
# 14996     0    1       0
# 14997     0    1       0
# 14998     0    1       0
#
# [14999 rows x 3 columns]

data = data.join(pd.get_dummies(data.salary))   # join the 3 one-hot columns onto the data DataFrame
# print(data.info)   # 3 new columns appended to the original data: high/low/medium
# bound method DataFrame.info of        satisfaction_level  last_evaluation  number_project  ...  high  low  medium
# 0                    0.38             0.53               2  ...     0    1       0
# 1                    0.80             0.86               5  ...     0    0       1
# 2                    0.11             0.88               7  ...     0    0       1
# 3                    0.72             0.87               5  ...     0    1       0
# 4                    0.37             0.52               2  ...     0    1       0
# ...                   ...              ...             ...  ...   ...  ...     ...
# 14994                0.40             0.57               2  ...     0    1       0
# 14995                0.37             0.48               2  ...     0    1       0
# 14996                0.37             0.53               2  ...     0    1       0
# 14997                0.11             0.96               6  ...     0    1       0
# 14998                0.37             0.52               2  ...     0    1       0
del data['salary']  # drop the original salary column
data = data.join(pd.get_dummies(data.part)) # handle the part column the same way
del data['part']
# print(data.info)  #[14999 rows x 21 columns]>

# Build Y: whether the employee left
# print(data.left.value_counts())   # The classes are imbalanced: employees who left are only a small minority.
# A model that always predicts "stays" is already right 11428/14999 of the time, purely by chance,
# so unless a model's accuracy exceeds 11428/14999 (about 76%), it is useless.
# 0    11428
# 1     3571
# Name: left, dtype: int64
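
# A quick sanity check of that baseline, computed straight from the labels
# (a minimal sketch; assumes the data DataFrame loaded above):
# print((data.left == 0).mean())   # 0.7619..., i.e. 11428/14999, the accuracy any useful model must beat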

Y_data = data.left.values.reshape(-1,1)   # second dim is 1; the first dim is inferred automatically
# print(Y_data.shape)   # (14999, 1)
Y = torch.from_numpy(Y_data).type(torch.FloatTensor)
X_data = data[[c for c in data.columns if c != 'left']].values   # select every column except the label 'left' (a list inside the brackets selects multiple columns)
X = torch.from_numpy(X_data).type(torch.FloatTensor)
# print(X.shape)  #torch.Size([14999, 20])

2. Building the Multilayer Perceptron Model

# 2. Custom model
# Create the model as a custom class:
# nn.Module: the class to inherit from
# __init__: initializes all the layers
# forward:  defines the computation (the forward pass)
class Model(nn.Module):
    def __init__(self):
        super().__init__()  # inherit all attributes of the parent class
        self.linear_1 = nn.Linear(20,64)   # X has 20 columns, i.e. 20 input features; the hidden layer is given 64 units
        self.linear_2 = nn.Linear(64,64)   # layer 2's input is layer 1's output (64 features); keep 64 hidden units
        self.linear_3 = nn.Linear(64,1)    # layer 3 takes those 64 features and outputs 1 value for binary (logistic) classification
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
    def forward(self,input):   # forward calls these layers in order
        x = self.linear_1(input)     # apply the first layer
        x = self.relu(x)             # then pass through the ReLU activation
        x = self.linear_2(x)
        x = self.relu(x)
        x = self.linear_3(x)
        x = self.sigmoid(x)
        return x

model = Model()
# print(model)
# # Model(
# #   (linear_1): Linear(in_features=20, out_features=64, bias=True)
# #   (linear_2): Linear(in_features=64, out_features=64, bias=True)
# #   (linear_3): Linear(in_features=64, out_features=1, bias=True)
# #   (relu): ReLU()
# #   (sigmoid): Sigmoid()
# # )
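
# A quick forward-pass sanity check before training (a sketch; uses the model and X defined above):
# with torch.no_grad():
#     print(model(X[:5]).shape)   # torch.Size([5, 1]); each value lies in (0, 1) after the sigmoid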

3. Rewriting and Explaining the Model

# 3. Model rewrite and explanation

import torch.nn.functional as F
# F.relu(x)      calls the activation directly as a function; F also provides functional forms of many layers
# F.sigmoid(x)
# F is a slightly lower-level API than nn
# the rewritten code below is more concise
class Model(nn.Module):
    def __init__(self):
        super().__init__()  # inherit all attributes of the parent class
        self.linear_1 = nn.Linear(20,64)   # 20 input features; 64 hidden units in the first layer
        self.linear_2 = nn.Linear(64,64)   # layer 2: 64 in, 64 out
        self.linear_3 = nn.Linear(64,1)    # layer 3: 64 in, 1 out for binary classification
    def forward(self,input):   # forward calls these layers in order
        x = F.relu(self.linear_1(input))     # apply the first layer, then ReLU
        x = F.relu(self.linear_2(x))
        x = F.sigmoid(self.linear_3(x))
        return x

model = Model()
# print(model)   # compared with the previous version, the relu and sigmoid layers no longer show up; still 20 input features and 64 hidden units
# # Model(
# #   (linear_1): Linear(in_features=20, out_features=64, bias=True)
# #   (linear_2): Linear(in_features=64, out_features=64, bias=True)
# #   (linear_3): Linear(in_features=64, out_features=1, bias=True)
# # )     
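
# Note: recent PyTorch versions deprecate F.sigmoid in favor of torch.sigmoid, a drop-in replacement:
#     x = torch.sigmoid(self.linear_3(x))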

lr = 0.0001

def get_model():
    model = Model()
    opt = torch.optim.Adam(model.parameters(),lr=lr)   # optimization method
    return model, opt

model,optim = get_model()

4. Defining the Loss Function and Training

# 4. Define the loss function
loss_fn = nn.BCELoss()
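
# BCELoss is binary cross-entropy: for a predicted probability p and a 0/1 label y,
# loss = -(y*log(p) + (1-y)*log(1-p)), averaged over the batch.
# A tiny numeric check (a sketch):
# print(loss_fn(torch.tensor([0.9]), torch.tensor([1.0])))   # tensor(0.1054), i.e. -log(0.9)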
batch = 64
no_of_batches = len(data)//batch
epochs = 100
for epoch in range(epochs):
    for i in range(no_of_batches):
        start = i*batch
        end = start + batch
        x = X[start:end]
        y = Y[start:end]
        y_pred = model(x)
        loss = loss_fn(y_pred,y)
        optim.zero_grad()
        loss.backward()
        optim.step()
    with torch.no_grad():
        print('epoch: ',epoch, 'loss: ',loss_fn(model(X),Y).data.item())


# epoch:  0 loss:  0.7169897556304932
# epoch:  1 loss:  0.7474398612976074
# epoch:  2 loss:  0.7510539889335632
# epoch:  3 loss:  0.7487813830375671
# epoch:  4 loss:  0.7463597655296326
# epoch:  5 loss:  0.7404094934463501
# epoch:  6 loss:  0.7263101935386658
# epoch:  7 loss:  0.7180029153823853
# epoch:  8 loss:  0.703711211681366
# epoch:  9 loss:  0.686420738697052
# epoch:  10 loss:  0.6773249506950378
# epoch:  11 loss:  0.6685119271278381
# epoch:  12 loss:  0.6593860387802124
# epoch:  13 loss:  0.6474876999855042
# epoch:  14 loss:  0.6377690434455872
# epoch:  15 loss:  0.6245443820953369
# epoch:  16 loss:  0.6224199533462524
# epoch:  17 loss:  0.6104769110679626
# epoch:  18 loss:  0.6039217114448547
# epoch:  19 loss:  0.5974984169006348
# epoch:  20 loss:  0.591426432132721
# epoch:  21 loss:  0.5852314829826355
# epoch:  22 loss:  0.5801721215248108
# epoch:  23 loss:  0.6171563267707825
# epoch:  24 loss:  0.5970718860626221
# epoch:  25 loss:  0.5865799784660339
# epoch:  26 loss:  0.5799813866615295
# epoch:  27 loss:  0.5736305713653564
# epoch:  28 loss:  0.5689525008201599
# epoch:  29 loss:  0.5653705596923828
# epoch:  30 loss:  0.5621749758720398
# epoch:  31 loss:  0.5603551268577576
# epoch:  32 loss:  0.5558130741119385
# epoch:  33 loss:  0.554741621017456
# epoch:  34 loss:  0.5537745952606201
# epoch:  35 loss:  0.5537812113761902
# epoch:  36 loss:  0.5566486716270447
# epoch:  37 loss:  0.5538275241851807
# epoch:  38 loss:  0.5533592700958252
# epoch:  39 loss:  0.5531929731369019
# epoch:  40 loss:  0.5532180666923523
# epoch:  41 loss:  0.5534858107566833
# epoch:  42 loss:  0.5535756349563599
# epoch:  43 loss:  0.5538690090179443
# epoch:  44 loss:  0.5541990995407104
# epoch:  45 loss:  0.5545604228973389
# epoch:  46 loss:  0.555756688117981
# epoch:  47 loss:  0.5562994480133057
# epoch:  48 loss:  0.5558972358703613
# epoch:  49 loss:  0.5573424100875854
# epoch:  50 loss:  0.5569452047348022
# epoch:  51 loss:  0.5557860136032104
# epoch:  52 loss:  0.5561386346817017
# epoch:  53 loss:  0.5558918118476868
# epoch:  54 loss:  0.5567163228988647
# epoch:  55 loss:  0.5555370450019836
# epoch:  56 loss:  0.5590798854827881
# epoch:  57 loss:  0.5561874508857727
# epoch:  58 loss:  0.5567173361778259
# epoch:  59 loss:  0.5563571453094482
# epoch:  60 loss:  0.5565804243087769
# epoch:  61 loss:  0.5580364465713501
# epoch:  62 loss:  0.5550894141197205
# epoch:  63 loss:  0.5556345582008362
# epoch:  64 loss:  0.5544098615646362
# epoch:  65 loss:  0.554726243019104
# epoch:  66 loss:  0.5676969885826111
# epoch:  67 loss:  0.5639616847038269
# epoch:  68 loss:  0.5534506440162659
# epoch:  69 loss:  0.5482341647148132
# epoch:  70 loss:  0.5490630269050598
# epoch:  71 loss:  0.550289511680603
# epoch:  72 loss:  0.558487057685852
# epoch:  73 loss:  0.5513705611228943
# epoch:  74 loss:  0.5461589097976685
# epoch:  75 loss:  0.5472497940063477
# epoch:  76 loss:  0.5445483922958374
# epoch:  77 loss:  0.5462327599525452
# epoch:  78 loss:  0.5444859862327576
# epoch:  79 loss:  0.5440321564674377
# epoch:  80 loss:  0.5433013439178467
# epoch:  81 loss:  0.5420659184455872
# epoch:  82 loss:  0.5504337549209595
# epoch:  83 loss:  0.5655510425567627
# epoch:  84 loss:  0.5413082242012024
# epoch:  85 loss:  0.5385186672210693
# epoch:  86 loss:  0.5376941561698914
# epoch:  87 loss:  0.5360188484191895
# epoch:  88 loss:  0.5352768898010254
# epoch:  89 loss:  0.5340187549591064
# epoch:  90 loss:  0.532996416091919
# epoch:  91 loss:  0.5334175229072571
# epoch:  92 loss:  0.5310117602348328
# epoch:  93 loss:  0.5338006615638733
# epoch:  94 loss:  0.5294851064682007
# epoch:  95 loss:  0.5323388576507568
# epoch:  96 loss:  0.5423325300216675
# epoch:  97 loss:  0.5269168615341187
# epoch:  98 loss:  0.5257655382156372
# epoch:  99 loss:  0.5224730372428894
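
# The raw loss is hard to interpret on its own; a hedged sketch of training-set accuracy at a
# 0.5 threshold (uses model, X, Y from above; compare against the 0.762 majority-class baseline):
# with torch.no_grad():
#     acc = ((model(X) > 0.5).float() == Y).float().mean()
#     print('accuracy:', acc.item())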

5. Refactoring with Dataset

# Refactor using Dataset
# PyTorch has an abstract Dataset class. A Dataset can be anything with a __len__ method and a __getitem__ method for indexing into it. Here the HR data will be wrapped in such a Dataset.

# PyTorch's TensorDataset is a Dataset that wraps tensors. By defining how it is indexed and its length, it also gives us a way to iterate, index, and slice along the first dimension of the tensors. That makes it easier to access the independent and dependent variables in the same line during training.
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset
import pandas as pd

data = pd.read_csv('./HR.csv')
data = data.join(pd.get_dummies(data.salary))   # join the 3 one-hot columns onto the data DataFrame
del data['salary']  # drop the original salary column
data = data.join(pd.get_dummies(data.part)) # handle the part column the same way
del data['part']

Y_data = data.left.values.reshape(-1,1)   # second dim is 1; the first dim is inferred automatically
Y = torch.from_numpy(Y_data).type(torch.FloatTensor)
X_data = data[[c for c in data.columns if c != 'left']].values   # select every column except the label 'left'
X = torch.from_numpy(X_data).type(torch.FloatTensor)
HRdataset = TensorDataset(X, Y)       # Create HR Dataset
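
# TensorDataset supports indexing and slicing along the first dimension, returning
# (features, label) tuples (a quick sketch):
# x0, y0 = HRdataset[0]        # one sample: a 20-feature tensor and its 1-element label
# xb, yb = HRdataset[0:64]     # a slice returns a whole batch of features and labels at once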

class Model(nn.Module):
    def __init__(self):
        super().__init__()  # inherit all attributes of the parent class
        self.linear_1 = nn.Linear(20,64)   # 20 input features; 64 hidden units in the first layer
        self.linear_2 = nn.Linear(64,64)   # layer 2: 64 in, 64 out
        self.linear_3 = nn.Linear(64,1)    # layer 3: 64 in, 1 out for binary classification
    def forward(self,input):   # forward calls these layers in order
        x = F.relu(self.linear_1(input))     # apply the first layer, then ReLU
        x = F.relu(self.linear_2(x))
        x = F.sigmoid(self.linear_3(x))
        return x

model = Model()
lr = 0.0001

def get_model():
    model = Model()
    opt = torch.optim.Adam(model.parameters(),lr=lr)   # optimization method
    return model, opt

model,opt = get_model()

loss_fn = nn.BCELoss()
batch = 64
no_of_batches = len(data)//batch
epochs = 100

for epoch in range(epochs):
    for i in range(no_of_batches):
        x, y = HRdataset[i * batch: i * batch + batch]
        y_pred = model(x)
        loss = loss_fn(y_pred, y)
        opt.zero_grad()
        loss.backward()
        opt.step()
    with torch.no_grad():
        print('epoch:', epoch, '   ', 'loss:', loss_fn(model(X), Y))

6. Refactoring with DataLoader

# Refactor using DataLoader
# PyTorch's DataLoader is responsible for managing batches.
# A DataLoader is created from a Dataset.
# It makes iterating over batches easier: it automatically hands us each mini-batch,
# so there is no need for manual slicing like HRdataset[i * batch : i * batch + batch]
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
import pandas as pd

data = pd.read_csv('./HR.csv')
data = data.join(pd.get_dummies(data.salary))   # join the 3 one-hot columns onto the data DataFrame
del data['salary']  # drop the original salary column
data = data.join(pd.get_dummies(data.part)) # handle the part column the same way
del data['part']

Y_data = data.left.values.reshape(-1,1)   # second dim is 1; the first dim is inferred automatically
Y = torch.from_numpy(Y_data).type(torch.FloatTensor)
X_data = data[[c for c in data.columns if c != 'left']].values   # select every column except the label 'left'
X = torch.from_numpy(X_data).type(torch.FloatTensor)

class Model(nn.Module):
    def __init__(self):
        super().__init__()  # inherit all attributes of the parent class
        self.linear_1 = nn.Linear(20,64)   # 20 input features; 64 hidden units in the first layer
        self.linear_2 = nn.Linear(64,64)   # layer 2: 64 in, 64 out
        self.linear_3 = nn.Linear(64,1)    # layer 3: 64 in, 1 out for binary classification
    def forward(self,input):   # forward calls these layers in order
        x = F.relu(self.linear_1(input))     # apply the first layer, then ReLU
        x = F.relu(self.linear_2(x))
        x = F.sigmoid(self.linear_3(x))
        return x

model = Model()
lr = 0.0001

def get_model():
    model = Model()
    opt = torch.optim.Adam(model.parameters(),lr=lr)   # optimization method
    return model, opt

model,opt = get_model()

loss_fn = nn.BCELoss()
batch = 64
no_of_batches = len(data)//batch
epochs = 100

# use a DataLoader
HR_ds = TensorDataset(X, Y)
# HR_dl = DataLoader(HR_ds, batch_size=batch, shuffle=True)  # shuffling the data each epoch helps the loss converge to a lower value
HR_dl = DataLoader(HR_ds, batch_size=batch, shuffle=False)
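
# A single mini-batch can be pulled out for inspection (a sketch):
# xb, yb = next(iter(HR_dl))
# print(xb.shape, yb.shape)   # torch.Size([64, 20]) torch.Size([64, 1])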
for epoch in range(epochs):
    for x, y in HR_dl:
        y_pred = model(x)
        loss = loss_fn(y_pred, y)
        opt.zero_grad()
        loss.backward()
        opt.step()
    with torch.no_grad():
        print('epoch:', epoch, '   ', 'loss:', loss_fn(model(X), Y))
# Dataset version, final loss:                epoch: 99     loss: tensor(0.5242)
# DataLoader with shuffle=False, final loss:  epoch: 99     loss: tensor(0.4920)
# DataLoader with shuffle=True, final loss:   epoch: 99     loss: tensor(0.2838)