2020-07-17 暑期学习日更计划 (李宏毅2020-hw1)
2020-07-16 本文已影响0人
Reza_
ML2020spring - hw1
该作业kaggle地址:ML2020spring - hw1 Regression - PM2.5 Prediction
pytorch入门系列,把之前用numpy实现的神经网络部分改成了pytorch,数据预处理的方法没有改。
数据预处理部分
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Load the training CSV (read with Big5 encoding).
data = pd.read_csv('./lhy_DL_Hw/train.csv', encoding='big5')
# Drop the first three columns and keep the rest. .copy() makes the slice an
# independent frame so the masked assignment below never targets a view.
data = data.iloc[:, 3:].copy()
# Replace the 'NR' marker with 0 so every cell can be converted to float.
data[data == 'NR'] = 0
raw_data = data.to_numpy()

# Group the rows by month. Each month contributes 20 days; each day is a block
# of 18 rows with 24 hourly columns. The 20 day-blocks are laid side by side,
# giving one 18 x 480 array per month.
month_data = {}
for month in range(12):
    sample = np.empty([18, 480])
    for day in range(20):
        first_row = 18 * (20 * month + day)
        sample[:, day * 24:(day + 1) * 24] = raw_data[first_row:first_row + 18, :]
    month_data[month] = sample

# Slide a 10-hour window over each month: the first 9 hours (all 18 rows,
# flattened row by row) are the features, and row 9 of the 10th hour is the
# label. 480 hours yield 480 - 9 = 471 windows per month.
x = np.empty([12 * 471, 18 * 9], dtype=float)
y = np.empty([12 * 471, 1], dtype=float)
for month in range(12):
    sample = month_data[month]
    for start in range(471):
        row = month * 471 + start
        x[row, :] = sample[:, start:start + 9].reshape(-1)
        # Row index 9 is the PM2.5 reading (per the original author's note).
        y[row, 0] = sample[9, start + 9]

# Per-column statistics; x has 18 * 9 columns.
mean_x = np.mean(x, axis=0)
std_x = np.std(x, axis=0)
# Label statistics are computed here but not used in the visible script.
mean_y = np.mean(y, axis=0)
std_y = np.std(y, axis=0)

# Z-score standardisation of the features, done in one vectorised step.
# Columns whose standard deviation is 0 are left untouched to avoid a
# division by zero.
varying = std_x != 0
x[:, varying] = (x[:, varying] - mean_x[varying]) / std_x[varying]
搭建网络部分
# Hyper-parameters.
learning_rate = 0.03
input_size = 162  # 18 * 9 feature columns
output_size = 1
num_epoches = 4000
batch_size = 1  # defined but not referenced by the visible training loop

# Convert the NumPy feature/label arrays into float32 tensors.
x_train = torch.from_numpy(x).to(torch.float32)
y_train = torch.from_numpy(y).to(torch.float32)
print(x_train.shape, y_train.shape)

# A model consisting of a single linear layer, with an MSE criterion and an
# Adam optimizer over the layer's parameters.
model = nn.Linear(in_features=input_size, out_features=output_size)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Collects the loss value of every epoch for plotting.
loss_list = []
开始训练
# Full-batch training: every epoch runs one forward/backward pass over the
# whole training tensor and takes one optimizer step.
for epoch in range(num_epoches):
    optimizer.zero_grad()
    outputs = model(x_train)
    # The square root of the MSE criterion is what gets back-propagated.
    loss = criterion(outputs, y_train).sqrt()
    loss.backward()
    optimizer.step()

    epoch_loss = loss.item()
    loss_list.append(epoch_loss)
    # Report progress every 100 epochs.
    if not epoch % 100:
        print(epoch, epoch_loss)

# Plot the recorded loss against the epoch index.
epoch_axis = np.arange(num_epoches)
plt.plot(epoch_axis, loss_list[:num_epoches])
plt.show()
print("Finsh Training")
开始测试:
# Load the test CSV (no header row, read with Big5 encoding).
testdata = pd.read_csv('./lhy_DL_Hw/test.csv', header=None, encoding='big5')
# Drop the first two columns. .copy() gives an independent frame, so the
# masked assignment below does not write into a slice of `testdata`
# (the pattern pandas reports as SettingWithCopyWarning).
test_data = testdata.iloc[:, 2:].copy()
# Replace the 'NR' marker with 0 so every cell can be converted to float.
test_data[test_data == 'NR'] = 0
test_data = test_data.to_numpy()

# Each of the 240 test samples is a block of 18 consecutive rows; flattening
# a block row by row gives its 18 * 9 feature vector. Reshaping the first
# 240 * 18 rows does that for all samples at once.
test_x = test_data[:240 * 18].astype(float).reshape(240, 18 * 9)

# Standardise with the TRAINING statistics (mean_x / std_x), skipping columns
# whose training standard deviation is 0 to avoid a division by zero.
varying = std_x != 0
test_x[:, varying] = (test_x[:, varying] - mean_x[varying]) / std_x[varying]

test_x = torch.from_numpy(test_x).float()
# Inference only: no_grad() avoids recording an autograd graph for the outputs.
with torch.no_grad():
    test_outputs = model(test_x)
把测试集中得到的值写入到csv文件中
import csv

# Write the predictions to submit.csv: a header row followed by one
# (id, value) row for each of the 240 test samples.
with open('submit.csv', mode='w', newline='') as submit_file:
    csv_writer = csv.writer(submit_file)
    csv_writer.writerow(['id', 'value'])
    csv_writer.writerows(
        (f'id_{idx}', test_outputs[idx][0].item()) for idx in range(240)
    )
线性回归模型实现!