神经网络学习:LSTM对沪深300收益率预测
2019-11-27 本文已影响0人
黄yy家的jby
1-导库
# 常见库
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import os
# 数据预处理,sts是为了adf检验,acf是用来看滞后几期比较合适
import statsmodels.tsa.stattools as sts
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.stattools import acf
# 构建LSTM 所需库
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.utils import to_categorical
from tensorflow import keras
from sklearn.metrics import classification_report
2-数据导入
df_hs : 沪深300的相关数据:收盘价,换手率,成交量,股息率,最大振幅
df_marco : 各宏观指标
考虑披露时效,指标滞后一期(季度滞后3个月,月度滞后一个月)
def read_data(open_path):
    """Load the CSI 300 price data and the macro indicators, lagged one period.

    Reads ``hs300_zhenfu.xlsx`` and ``宏观数据.xls`` from ``open_path``.  The
    path is used as a plain string prefix, so it must end with a separator.

    Args:
        open_path: Directory prefix that contains the two Excel files.

    Returns:
        A tuple ``(df_hs, df, df_marco)``:

        * ``df_hs`` - index data with ``zhenfu`` (high-low range as % of close)
          and ``chg`` (daily % change of close); every column except ``chg``
          is shifted down one row.
        * ``df`` - daily, monthly and quarterly macro series, each shifted by
          one period of its own frequency, forward-filled to daily frequency
          and reindexed to ``df_hs.index``.
        * ``df_marco`` - the raw macro frame plus the derived spread columns.
    """
    df_hs = pd.read_excel(open_path + 'hs300_zhenfu.xlsx', index_col=0)
    df_hs.columns = ['g', 'close', 'vol', 'ts', 'high', 'low']
    df_hs['zhenfu'] = 100 * (df_hs['high'] - df_hs['low']) / df_hs['close']
    df_hs['chg'] = 100 * df_hs['close'].pct_change(1)
    # Lag every column except the target so that the return on a row is
    # paired with the previous row's features.
    is_feature = df_hs.columns != 'chg'
    df_hs.loc[:, is_feature] = df_hs.loc[:, is_feature].shift(1)
    df_hs.dropna(inplace=True, axis=0)

    df_marco = pd.read_excel(open_path + '宏观数据.xls', index_col=0)
    df_marco.columns = [
        'gdp', 'ppi', 'cpi', 'pmi', 'industry_add', 'investment',
        'current_deposit', 'fixed_deposit_3', 'fixed_deposit_6', 'mlf', 'slf',
        'shibor', 'rf_1', 'rf_10', 'm1', 'm2', 'inter_borrow', 'dollar',
        'dollar_rmb', 'yen_rmb', 'social_financing', 'foreign_reserve', 'au',
        'rare', 'aa+', 'aa',
    ]
    df_marco['m1_m2'] = df_marco['m1'] - df_marco['m2']
    df_marco['qixian'] = df_marco['rf_10'] - df_marco['rf_1']
    df_marco['xinyong_+'] = df_marco['aa+'] - df_marco['rf_1']
    df_marco['xinyong'] = df_marco['aa'] - df_marco['rf_1']

    day_cols = ['shibor', 'rf_1', 'rf_10', 'dollar', 'dollar_rmb', 'yen_rmb',
                'au', 'aa+', 'aa', 'qixian', 'xinyong_+', 'xinyong']
    month_cols = ['ppi', 'cpi', 'pmi', 'industry_add', 'investment',
                  'current_deposit', 'fixed_deposit_3', 'fixed_deposit_6',
                  'slf', 'm1', 'm2', 'm1_m2', 'inter_borrow',
                  'social_financing', 'rare']
    quarter_cols = ['gdp', 'foreign_reserve']

    # Daily series: explicit copy so the column assignment below does not
    # write into a slice of df_marco (chained assignment).
    day_ = df_marco[day_cols].copy()
    pct_cols = ['dollar', 'dollar_rmb', 'yen_rmb', 'au']
    day_[pct_cols] = 100 * day_[pct_cols].pct_change(1)
    day_ = day_.shift(1)

    # Monthly series: resample to month, lag one month, expand back to days.
    month_ = df_marco[month_cols].resample('M').mean().ffill()
    month_['pmi'] = 100 * month_['pmi'].pct_change(12)
    month_['social_financing'] = 100 * month_['social_financing'].pct_change(1)
    month_ = month_.shift(1).resample('D').ffill()

    # Quarterly series: same procedure with a three-month period.
    quarter_ = df_marco[quarter_cols].resample('3M').mean().ffill()
    quarter_['foreign_reserve'] = 100 * quarter_['foreign_reserve'].pct_change(1)
    quarter_ = quarter_.shift(1).resample('D').ffill()

    df = pd.concat([day_, month_, quarter_], axis=1).ffill()
    df = df.loc[df_hs.index]
    return df_hs, df, df_marco
3- 数据整合
计算技术指标数据
定义x(宏观指标,资产配置指标,利率指标,外汇指标,量价指标,滞后项)和y(沪深300日收益率)
将 y 按六分位数划分为 6 个类别(标签 0~5)
def cal_ema(df, N):
    """Return the exponential moving average of ``df['close']``.

    Uses the recursion ``ema[0] = close[0]`` and
    ``ema[t] = a * close[t] + (1 - a) * ema[t-1]`` with ``a = 2 / (N + 1)``,
    which is what ``ewm(alpha=a, adjust=False)`` computes.

    Args:
        df: Frame with a ``close`` column.
        N: Smoothing span; the weight of the newest value is ``2 / (N + 1)``.

    Returns:
        A float DataFrame with the single column ``close`` on ``df.index``.
    """
    a = 2 / (N + 1)
    # astype(float) also accepts object-dtype input.
    close = df['close'].astype(float)
    return close.ewm(alpha=a, adjust=False).mean().to_frame('close')
def cal_dea(df, short_t=12, long_t=26 ,avg_t=9):
    """Add a ``macd`` column to ``df`` and return ``df``.

    The value stored is ``2 * (DIF - DEA)``, where DIF is the short EMA minus
    the long EMA of ``close`` and DEA is the EMA of DIF over ``avg_t``.
    The input frame is modified in place.
    """
    fast = cal_ema(df, short_t)
    slow = cal_ema(df, long_t)
    dif = fast - slow
    # cal_ema reads a 'close' column, which dif carries over from its inputs.
    signal = cal_ema(dif, avg_t)
    histogram = (dif - signal) * 2
    df['macd'] = histogram
    return df
def cal_adx(df, N=14, M=6):
    """Compute an ADX-style trend-strength series from high/low/close.

    Args:
        df: Frame with ``high``, ``low`` and ``close`` columns.
        N: Window over which the directional movements are summed.
        M: Window of the final rolling mean applied to DX.

    Returns:
        A DataFrame with the single column ``adx``; rows for which the rolling
        windows are incomplete (or DX is undefined) are dropped.
    """
    hd = df['high'].diff().dropna()
    ld = -df['low'].diff().dropna()

    # Directional movement: keep the move only on rows that satisfy the
    # condition, zero elsewhere, then sum over N rows.
    up_move = (hd > 0) & (ld < 0)
    down_move = (hd < 0) & (ld > 0)
    dmp = hd.where(up_move, 0.0).rolling(N).sum().dropna()
    dmm = ld.where(down_move, 0.0).rolling(N).sum().dropna()

    # True range: largest of the three classic candidates.
    prev_close = df['close'].shift(1)
    candidates = pd.concat(
        [df['high'] - df['low'],
         (df['high'] - prev_close).abs(),
         (df['low'] - prev_close).abs()],
        axis=1,
    ).dropna()
    tr = candidates.max(axis=1).dropna()
    # NOTE(review): the directional movements are N-row sums while the true
    # range is a single-row value; kept exactly as in the original.

    # Index.intersection replaces `idx & idx`, which is no longer a set
    # operation in current pandas.
    s_index = dmm.index.intersection(tr.index).intersection(dmp.index)
    tr = tr.loc[s_index]
    pdi = 100 * dmp.loc[s_index] / tr
    mdi = dmm.loc[s_index] * 100 / tr

    dx = (pdi - mdi).abs() / (pdi + mdi) * 100
    adx = dx.rolling(M).mean().dropna()
    return adx.to_frame('adx')
def handle_hs(df_hs):
    """Attach the ``macd`` and ``adx`` indicators and drop incomplete rows.

    The frame is modified in place (by ``cal_dea`` and the in-place
    ``dropna``) and also returned.
    """
    enriched = cal_dea(df_hs)
    enriched['adx'] = cal_adx(enriched)
    enriched.dropna(inplace=True)
    return enriched
def handle_data(df_hs, df_marco, save_path=None):
    """Build the factor matrix ``x`` and the 6-class return label ``y``.

    Args:
        df_hs: Index data with ``close``/``high``/``low``/``chg`` and the raw
            price-volume columns; indicators are added via ``handle_hs``.
        df_marco: Macro factors; must contain every date in ``df_hs.index``
            after the indicators have been added.
        save_path: Output directory prefix for the Excel reports.  When
            omitted, the module-level ``save_path`` is used, which is how the
            original code obtained it.

    Returns:
        ``(x, y_new)``: the selected factor columns as floats, and the daily
        return mapped to class codes 0..5 (as floats).

    Raises:
        NameError: If no ``save_path`` is given and none exists at module level.
        KeyError: If a factor in the fixed target list has an absolute
            correlation with the return of 0.01 or less.
    """
    out_dir = save_path if save_path is not None else globals().get('save_path')
    if out_dir is None:
        raise NameError("name 'save_path' is not defined; pass save_path=...")

    df_hs = handle_hs(df_hs)
    df_marco = df_marco.loc[df_hs.index]
    df_hs = df_hs[df_hs.index.year >= 2009]
    df_marco = df_marco[df_marco.index.year >= 2009]

    y = df_hs.loc[:, 'chg']
    # Print the sextile boundaries; the thresholds below were read off them.
    for i in range(1, 6):
        print(y.quantile(i / 6))
        print(i / 6)
        print("")

    # Left-closed bins [lo, hi) -> class codes 0..5.  One pd.cut call replaces
    # six in-place assignments whose correctness depended on their order.
    edges = [-np.inf, -1.04, -0.37, 0.06, 0.46, 1.24, np.inf]
    y_new = pd.cut(y, bins=edges, right=False, labels=False).astype('float')

    y_new_count = y_new.groupby(y_new).count()
    # NOTE(review): the third label reads '-0.37~-0.06' although the bin is
    # [-0.37, 0.06); the text is kept because it is written to the report.
    bin_names = pd.Series(['<-1.04', '-1.04~-0.37', '-0.37~-0.06',
                           '0.06~0.46', '0.46~1.24', '>1.24'])
    y_new_count = pd.concat([bin_names, y_new_count], axis=1)
    y_new_count.columns = ['分位数定义', 'num']
    y_new_count.to_excel(out_dir + 'lstm的y分类.xls')

    # Correlation of every candidate factor with the raw return.
    x = pd.concat([df_hs[['g', 'vol', 'ts', 'zhenfu', 'macd', 'adx']], df_marco],
                  axis=1)
    x = x.astype('float')
    dic = {}
    for col in x.columns:
        dic[col] = x[col].corr(y)
        print(col)
    df = pd.DataFrame.from_dict(dic, orient='index')
    df.to_excel(out_dir + 'LSTM因子相关性.xls')

    df_temp = df[df.abs() > 0.01].dropna()
    target = ['g', 'vol', 'ts', 'zhenfu', 'macd', 'adx', 'shibor', 'rf_10',
              'dollar', 'aa', 'xinyong', 'cpi', 'm1_m2']
    df_temp2 = df_temp.loc[target]
    df_temp2.to_excel(out_dir + 'lstm筛选后因子相关性.xls')
    x = x[target]
    return x, y_new
4-检验数据
相关性过滤--单位根检验--PCA取主成因子(不存在共线性)
def again_handle_data(x, y, df_backup):
    """Make the factors stationary, standardise them and one-hot encode ``y``.

    Args:
        x: Factor frame; must contain ``cpi`` and ``m1_m2``.
        y: Series of class codes aligned with ``x``.
        df_backup: Raw macro frame from which ``cpi`` and ``m1_m2`` are
            re-derived as lagged first differences.

    Returns:
        ``(x_new, y_new)``: the column-standardised factors and the one-hot
        labels, both on the common index.  The caller's ``x`` is not modified.
    """
    def adf_test(x, y):
        """Return ADF p-values and the names whose p-value exceeds 0.01."""
        dic = {'y': sts.adfuller(y)[1]}
        for col in x.columns:
            dic[col] = sts.adfuller(x[col])[1]
        df = pd.DataFrame.from_dict(dic, orient='index')
        target = list(df[df > 0.01].dropna().index)
        if target:
            print(target)
        else:
            print('所有因子通过单位根检验')
        return df, target

    def adf_data(x, y, df_backup):
        """Difference the non-stationary factors and realign ``x`` with ``y``."""
        x = x.copy()  # work on a copy so the caller's frame stays intact
        df1, target1 = adf_test(x, y)
        for col in target1:
            # The failing list can contain 'y', which is not a column of x.
            if col in x.columns:
                x[col] = x[col].diff(1)
        x.dropna(inplace=True)
        y = y[x.index]

        # Re-derive cpi and m1_m2: lag one row, then first-difference.
        month_ = df_backup[['cpi', 'm1_m2']].dropna().shift(1)
        month_ = month_.diff(1).dropna()
        for col in month_.columns:
            print(sts.adfuller(month_[col])[1])

        others = [c for c in x.columns if c not in ('cpi', 'm1_m2')]
        # NOTE(review): back-filling copies later observations onto earlier
        # dates; confirm this does not leak future information.
        x = pd.concat([x[others], month_], axis=1).bfill()
        x = x[x.index.year >= 2009].dropna()

        # Index.intersection replaces `idx & idx`, which is no longer a set
        # operation in current pandas.
        s_index = x.index.intersection(y.index)
        return x.loc[s_index], y[s_index]

    x, y = adf_data(x, y, df_backup)
    x_new = pd.DataFrame(preprocessing.scale(x, axis=0),
                         index=x.index, columns=x.columns)

    # Report how many principal components explain more than 99% of the
    # spectrum of X'X (informational only; x_new is not reduced).
    mat = np.dot(x_new.T, x_new)
    _, eig, _ = np.linalg.svd(mat)
    eig = (eig / eig.sum()).cumsum()
    n_pca = next((k for k, share in enumerate(eig, 1) if share > 0.99), len(eig))
    print('最大特征数目:' + str(n_pca))

    y_new = pd.DataFrame(to_categorical(y), index=y.index)
    return x_new, y_new
5-数据准备
LSTM 要求输入为三维数组(样本数, 时间步, 特征数);时间步 n_steps=2 是根据 ACF 的结果选定的
def pre_data(x, y, n_steps=2, train_pos=0.9):
    """Cut ``x``/``y`` into sliding windows and split them chronologically.

    Window ``s`` holds feature rows ``s .. s + n_steps - 1``; its label is the
    ``y`` row of the window's last step.

    Args:
        x: 2-D features, shape ``(n_rows, n_feature)``.
        y: 2-D one-hot labels, shape ``(n_rows, n_class)``; any class count.
        n_steps: Number of consecutive rows per window.
        train_pos: Fraction of the windows (taken from the start) used for
            training.

    Returns:
        ``(x_train, x_test, y_train, y_test)`` with shapes
        ``(n, n_steps, n_feature)`` for the features and ``(n, n_class)`` for
        the labels.  Empty arrays are returned when there are too few rows.
    """
    data = np.hstack([np.array(x), np.array(y)])
    n_feature = x.shape[1]

    # NOTE(review): `len(data) - n_steps` windows are built, as in the
    # original, so the window ending on the very last row is not produced.
    n_windows = max(len(data) - n_steps, 0)
    offsets = np.arange(n_windows)[:, None] + np.arange(n_steps)[None, :]
    result = data[offsets]  # (n_windows, n_steps, n_feature + n_class)

    row = round(train_pos * n_windows)
    # Split on the feature width instead of a hard-coded class count of 6.
    features = result[:, :, :n_feature]
    labels = result[:, -1, n_feature:]
    return features[:row], features[row:], labels[:row], labels[row:]
6-模型建立
LSTM 层要求三维输入;若要堆叠两层 LSTM,第一层需要设置 return_sequences=True,以便把完整序列传给第二层(下面的代码默认只使用单层,两层的写法见注释)
def bulid_model(x_train, neurons=[128,128,6], dropout=0.2):
    """Build and compile a single-layer LSTM classifier.

    Args:
        x_train: 3-D training array; only ``shape[1]`` (time steps) and
            ``shape[2]`` (features) are read to define the input shape.
        neurons: ``[lstm_units, second_lstm_units, n_classes]``.  The middle
            entry is used only by the two-layer variant sketched below.
        dropout: Dropout rate applied after the LSTM layer.

    Returns:
        The compiled Keras ``Sequential`` model (its summary is printed).
    """
    model = Sequential()
    model.add(LSTM(neurons[0],
                   input_shape=(x_train.shape[1], x_train.shape[2]),
                   return_sequences=False))
    model.add(Dropout(dropout))
    model.add(Dense(neurons[2], activation='softmax'))
    # The model is compiled with the default 'adam' optimizer.  The original
    # also constructed Adam(decay=0.2) without ever using it; that dead
    # object is removed.
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    model.summary()
    # Two-layer variant: the first LSTM must return the full sequence.
    # model = Sequential()
    # model.add(LSTM(neurons[0], input_shape=(x_train.shape[1], x_train.shape[2]), return_sequences=True))
    # model.add(Dropout(dropout))
    # model.add(LSTM(neurons[1]))
    # model.add(Dropout(dropout))
    # model.add(Dense(neurons[2], activation='softmax'))
    # model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    # model.summary()
    return model
7-运行模型
def run_model(x_train,y_train,save_path, neurons=[128,128,9], dropout=0.2):
    """Train the LSTM classifier, plot its training curves and return it.

    Args:
        x_train: 3-D training features.
        y_train: 2-D one-hot training labels.
        save_path: Directory prefix for ``LSTM_loss.jpg``.
        neurons: Hidden sizes; only the first two entries are used.  The
            output size is taken from ``y_train.shape[1]`` so that it always
            matches the labels.
        dropout: Dropout rate forwarded to ``bulid_model``.

    Returns:
        The fitted model.  (Earlier versions returned ``None`` and discarded
        the trained model.)
    """
    layer_sizes = [neurons[0], neurons[1], y_train.shape[1]]
    model = bulid_model(x_train, neurons=layer_sizes, dropout=dropout)
    history = model.fit(x_train, y_train, epochs=32, batch_size=64, verbose=2)

    # Loss on the left axis, accuracy on a twin right axis, one shared legend.
    fig = plt.figure()
    ax = fig.add_subplot(111)
    loss_line = ax.plot(history.history['loss'], label='loss', color='b')
    ax2 = ax.twinx()
    acc_line = ax2.plot(history.history['accuracy'], label='acc', color='r')
    lines = loss_line + acc_line
    ax.legend(lines, [line.get_label() for line in lines], loc=2)
    plt.savefig(save_path + 'LSTM_loss.jpg', dpi=300)
    plt.show()
    plt.close()
    return model
def model_score(model, x_train, y_train, x_test, y_test):
    """Evaluate ``model`` on the train and test sets and print the results.

    ``model.evaluate`` is expected to return ``[loss, accuracy]``, which is
    what a model compiled with ``metrics=['accuracy']`` yields.

    Returns:
        ``(trainScore, testScore)`` exactly as returned by ``model.evaluate``.
    """
    trainScore = model.evaluate(x_train, y_train, verbose=0)
    # The loss is a cross-entropy, so it is reported as such rather than
    # under the MSE/RMSE labels the original printed.
    print('Train Score: loss %.5f, accuracy %.4f' % (trainScore[0], trainScore[1]))
    testScore = model.evaluate(x_test, y_test, verbose=0)
    print('Test Score: loss %.5f, accuracy %.4f' % (testScore[0], testScore[1]))
    return trainScore, testScore
8- 预测模型
def predict_model(model,x_test,y_test,save_path,x,train_pos=0.9):
    """Predict the test set and print a per-class classification report.

    Args:
        model: Fitted classifier exposing ``predict``.
        x_test: 3-D test features.
        y_test: One-hot true labels for ``x_test``.
        save_path, x, train_pos: Not used by this version.
    """
    target = ['<-1.04', '-1.04~-0.37', '-0.37~-0.06', '0.06~0.46', '0.46~1.24', '>1.24']
    y_pred_prob = model.predict(x_test)
    # `Sequential.predict_classes` is gone from current tf.keras; the class is
    # the arg-max of the predicted probabilities.
    y_class = np.argmax(y_pred_prob, axis=-1)
    # Fix the width so a never-predicted top class cannot shrink the matrix.
    y_pred = to_categorical(y_class, num_classes=len(target))
    print(classification_report(y_test, y_pred, target_names=target))
9-保存模型
读取模型时,如果直接使用
from keras.models import load_model 有可能会报错,查询后发现似乎是 keras 版本太高的问题;
改用 from tensorflow.keras.models import load_model 即可
model.save(save_path+'lstm.h5')
# Load the model back
# from tensorflow.keras.models import load_model
# model = load_model(save_path+'lstm.h5')
完整代码
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import os
import statsmodels.tsa.stattools as sts
from sklearn import preprocessing
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import statsmodels.api as sm
# from statsmodels.tsa.api import VAR
from statsmodels.tsa.vector_ar.var_model import VAR
from statsmodels.tsa.stattools import acf
from arch import arch_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.utils import to_categorical
from tensorflow import keras
from sklearn.metrics import classification_report
import ReadData as RD
import PreHandle as PH
import warnings
warnings.filterwarnings('ignore')
#%%
# 提取原始信息
def read_data(open_path):
    """Load the CSI 300 price data and the macro indicators, lagged one period.

    Reads ``hs300_zhenfu.xlsx`` and ``宏观数据.xls`` from ``open_path``.  The
    path is used as a plain string prefix, so it must end with a separator.

    Args:
        open_path: Directory prefix that contains the two Excel files.

    Returns:
        A tuple ``(df_hs, df, df_marco)``:

        * ``df_hs`` - index data with ``zhenfu`` (high-low range as % of close)
          and ``chg`` (daily % change of close); every column except ``chg``
          is shifted down one row.
        * ``df`` - daily, monthly and quarterly macro series, each shifted by
          one period of its own frequency, forward-filled to daily frequency
          and reindexed to ``df_hs.index``.
        * ``df_marco`` - the raw macro frame plus the derived spread columns.
    """
    df_hs = pd.read_excel(open_path + 'hs300_zhenfu.xlsx', index_col=0)
    df_hs.columns = ['g', 'close', 'vol', 'ts', 'high', 'low']
    df_hs['zhenfu'] = 100 * (df_hs['high'] - df_hs['low']) / df_hs['close']
    df_hs['chg'] = 100 * df_hs['close'].pct_change(1)
    # Lag every column except the target so that the return on a row is
    # paired with the previous row's features.
    is_feature = df_hs.columns != 'chg'
    df_hs.loc[:, is_feature] = df_hs.loc[:, is_feature].shift(1)
    df_hs.dropna(inplace=True, axis=0)

    df_marco = pd.read_excel(open_path + '宏观数据.xls', index_col=0)
    df_marco.columns = [
        'gdp', 'ppi', 'cpi', 'pmi', 'industry_add', 'investment',
        'current_deposit', 'fixed_deposit_3', 'fixed_deposit_6', 'mlf', 'slf',
        'shibor', 'rf_1', 'rf_10', 'm1', 'm2', 'inter_borrow', 'dollar',
        'dollar_rmb', 'yen_rmb', 'social_financing', 'foreign_reserve', 'au',
        'rare', 'aa+', 'aa',
    ]
    df_marco['m1_m2'] = df_marco['m1'] - df_marco['m2']
    df_marco['qixian'] = df_marco['rf_10'] - df_marco['rf_1']
    df_marco['xinyong_+'] = df_marco['aa+'] - df_marco['rf_1']
    df_marco['xinyong'] = df_marco['aa'] - df_marco['rf_1']

    day_cols = ['shibor', 'rf_1', 'rf_10', 'dollar', 'dollar_rmb', 'yen_rmb',
                'au', 'aa+', 'aa', 'qixian', 'xinyong_+', 'xinyong']
    month_cols = ['ppi', 'cpi', 'pmi', 'industry_add', 'investment',
                  'current_deposit', 'fixed_deposit_3', 'fixed_deposit_6',
                  'slf', 'm1', 'm2', 'm1_m2', 'inter_borrow',
                  'social_financing', 'rare']
    quarter_cols = ['gdp', 'foreign_reserve']

    # Daily series: explicit copy so the column assignment below does not
    # write into a slice of df_marco (chained assignment).
    day_ = df_marco[day_cols].copy()
    pct_cols = ['dollar', 'dollar_rmb', 'yen_rmb', 'au']
    day_[pct_cols] = 100 * day_[pct_cols].pct_change(1)
    day_ = day_.shift(1)

    # Monthly series: resample to month, lag one month, expand back to days.
    month_ = df_marco[month_cols].resample('M').mean().ffill()
    month_['pmi'] = 100 * month_['pmi'].pct_change(12)
    month_['social_financing'] = 100 * month_['social_financing'].pct_change(1)
    month_ = month_.shift(1).resample('D').ffill()

    # Quarterly series: same procedure with a three-month period.
    quarter_ = df_marco[quarter_cols].resample('3M').mean().ffill()
    quarter_['foreign_reserve'] = 100 * quarter_['foreign_reserve'].pct_change(1)
    quarter_ = quarter_.shift(1).resample('D').ffill()

    df = pd.concat([day_, month_, quarter_], axis=1).ffill()
    df = df.loc[df_hs.index]
    return df_hs, df, df_marco
# 提取技术性指标
def cal_ema(df, N):
    """Return the exponential moving average of ``df['close']``.

    Uses the recursion ``ema[0] = close[0]`` and
    ``ema[t] = a * close[t] + (1 - a) * ema[t-1]`` with ``a = 2 / (N + 1)``,
    which is what ``ewm(alpha=a, adjust=False)`` computes.

    Args:
        df: Frame with a ``close`` column.
        N: Smoothing span; the weight of the newest value is ``2 / (N + 1)``.

    Returns:
        A float DataFrame with the single column ``close`` on ``df.index``.
    """
    a = 2 / (N + 1)
    # astype(float) also accepts object-dtype input.
    close = df['close'].astype(float)
    return close.ewm(alpha=a, adjust=False).mean().to_frame('close')
def cal_dea(df, short_t=12, long_t=26 ,avg_t=9):
    """Add a ``macd`` column to ``df`` and return ``df``.

    The value stored is ``2 * (DIF - DEA)``, where DIF is the short EMA minus
    the long EMA of ``close`` and DEA is the EMA of DIF over ``avg_t``.
    The input frame is modified in place.
    """
    fast = cal_ema(df, short_t)
    slow = cal_ema(df, long_t)
    dif = fast - slow
    # cal_ema reads a 'close' column, which dif carries over from its inputs.
    signal = cal_ema(dif, avg_t)
    histogram = (dif - signal) * 2
    df['macd'] = histogram
    return df
def cal_adx(df, N=14, M=6):
    """Compute an ADX-style trend-strength series from high/low/close.

    Args:
        df: Frame with ``high``, ``low`` and ``close`` columns.
        N: Window over which the directional movements are summed.
        M: Window of the final rolling mean applied to DX.

    Returns:
        A DataFrame with the single column ``adx``; rows for which the rolling
        windows are incomplete (or DX is undefined) are dropped.
    """
    hd = df['high'].diff().dropna()
    ld = -df['low'].diff().dropna()

    # Directional movement: keep the move only on rows that satisfy the
    # condition, zero elsewhere, then sum over N rows.
    up_move = (hd > 0) & (ld < 0)
    down_move = (hd < 0) & (ld > 0)
    dmp = hd.where(up_move, 0.0).rolling(N).sum().dropna()
    dmm = ld.where(down_move, 0.0).rolling(N).sum().dropna()

    # True range: largest of the three classic candidates.
    prev_close = df['close'].shift(1)
    candidates = pd.concat(
        [df['high'] - df['low'],
         (df['high'] - prev_close).abs(),
         (df['low'] - prev_close).abs()],
        axis=1,
    ).dropna()
    tr = candidates.max(axis=1).dropna()
    # NOTE(review): the directional movements are N-row sums while the true
    # range is a single-row value; kept exactly as in the original.

    # Index.intersection replaces `idx & idx`, which is no longer a set
    # operation in current pandas.
    s_index = dmm.index.intersection(tr.index).intersection(dmp.index)
    tr = tr.loc[s_index]
    pdi = 100 * dmp.loc[s_index] / tr
    mdi = dmm.loc[s_index] * 100 / tr

    dx = (pdi - mdi).abs() / (pdi + mdi) * 100
    adx = dx.rolling(M).mean().dropna()
    return adx.to_frame('adx')
def handle_hs(df_hs):
    """Attach the ``macd`` and ``adx`` indicators and drop incomplete rows.

    The frame is modified in place (by ``cal_dea`` and the in-place
    ``dropna``) and also returned.
    """
    enriched = cal_dea(df_hs)
    enriched['adx'] = cal_adx(enriched)
    enriched.dropna(inplace=True)
    return enriched
def handle_data(df_hs, df_marco, save_path=None):
    """Build the factor matrix ``x`` and the 6-class return label ``y``.

    Args:
        df_hs: Index data with ``close``/``high``/``low``/``chg`` and the raw
            price-volume columns; indicators are added via ``handle_hs``.
        df_marco: Macro factors; must contain every date in ``df_hs.index``
            after the indicators have been added.
        save_path: Output directory prefix for the Excel reports.  When
            omitted, the module-level ``save_path`` is used, which is how the
            original code obtained it.

    Returns:
        ``(x, y_new)``: the selected factor columns as floats, and the daily
        return mapped to class codes 0..5 (as floats).

    Raises:
        NameError: If no ``save_path`` is given and none exists at module level.
        KeyError: If a factor in the fixed target list has an absolute
            correlation with the return of 0.01 or less.
    """
    out_dir = save_path if save_path is not None else globals().get('save_path')
    if out_dir is None:
        raise NameError("name 'save_path' is not defined; pass save_path=...")

    df_hs = handle_hs(df_hs)
    df_marco = df_marco.loc[df_hs.index]
    df_hs = df_hs[df_hs.index.year >= 2009]
    df_marco = df_marco[df_marco.index.year >= 2009]

    y = df_hs.loc[:, 'chg']
    # Print the sextile boundaries; the thresholds below were read off them.
    for i in range(1, 6):
        print(y.quantile(i / 6))
        print(i / 6)
        print("")

    # Left-closed bins [lo, hi) -> class codes 0..5.  One pd.cut call replaces
    # six in-place assignments whose correctness depended on their order.
    edges = [-np.inf, -1.04, -0.37, 0.06, 0.46, 1.24, np.inf]
    y_new = pd.cut(y, bins=edges, right=False, labels=False).astype('float')

    y_new_count = y_new.groupby(y_new).count()
    # NOTE(review): the third label reads '-0.37~-0.06' although the bin is
    # [-0.37, 0.06); the text is kept because it is written to the report.
    bin_names = pd.Series(['<-1.04', '-1.04~-0.37', '-0.37~-0.06',
                           '0.06~0.46', '0.46~1.24', '>1.24'])
    y_new_count = pd.concat([bin_names, y_new_count], axis=1)
    y_new_count.columns = ['分位数定义', 'num']
    y_new_count.to_excel(out_dir + 'lstm的y分类.xls')

    # Correlation of every candidate factor with the raw return.
    x = pd.concat([df_hs[['g', 'vol', 'ts', 'zhenfu', 'macd', 'adx']], df_marco],
                  axis=1)
    x = x.astype('float')
    dic = {}
    for col in x.columns:
        dic[col] = x[col].corr(y)
        print(col)
    df = pd.DataFrame.from_dict(dic, orient='index')
    df.to_excel(out_dir + 'LSTM因子相关性.xls')

    df_temp = df[df.abs() > 0.01].dropna()
    target = ['g', 'vol', 'ts', 'zhenfu', 'macd', 'adx', 'shibor', 'rf_10',
              'dollar', 'aa', 'xinyong', 'cpi', 'm1_m2']
    df_temp2 = df_temp.loc[target]
    df_temp2.to_excel(out_dir + 'lstm筛选后因子相关性.xls')
    x = x[target]
    return x, y_new
def again_handle_data(x, y, df_backup):
    """Make the factors stationary, standardise them and one-hot encode ``y``.

    Args:
        x: Factor frame; must contain ``cpi`` and ``m1_m2``.
        y: Series of class codes aligned with ``x``.
        df_backup: Raw macro frame from which ``cpi`` and ``m1_m2`` are
            re-derived as lagged first differences.

    Returns:
        ``(x_new, y_new)``: the column-standardised factors and the one-hot
        labels, both on the common index.  The caller's ``x`` is not modified.
    """
    def adf_test(x, y):
        """Return ADF p-values and the names whose p-value exceeds 0.01."""
        dic = {'y': sts.adfuller(y)[1]}
        for col in x.columns:
            dic[col] = sts.adfuller(x[col])[1]
        df = pd.DataFrame.from_dict(dic, orient='index')
        target = list(df[df > 0.01].dropna().index)
        if target:
            print(target)
        else:
            print('所有因子通过单位根检验')
        return df, target

    def adf_data(x, y, df_backup):
        """Difference the non-stationary factors and realign ``x`` with ``y``."""
        x = x.copy()  # work on a copy so the caller's frame stays intact
        df1, target1 = adf_test(x, y)
        for col in target1:
            # The failing list can contain 'y', which is not a column of x.
            if col in x.columns:
                x[col] = x[col].diff(1)
        x.dropna(inplace=True)
        y = y[x.index]

        # Re-derive cpi and m1_m2: lag one row, then first-difference.
        month_ = df_backup[['cpi', 'm1_m2']].dropna().shift(1)
        month_ = month_.diff(1).dropna()
        for col in month_.columns:
            print(sts.adfuller(month_[col])[1])

        others = [c for c in x.columns if c not in ('cpi', 'm1_m2')]
        # NOTE(review): back-filling copies later observations onto earlier
        # dates; confirm this does not leak future information.
        x = pd.concat([x[others], month_], axis=1).bfill()
        x = x[x.index.year >= 2009].dropna()

        # Index.intersection replaces `idx & idx`, which is no longer a set
        # operation in current pandas.
        s_index = x.index.intersection(y.index)
        return x.loc[s_index], y[s_index]

    x, y = adf_data(x, y, df_backup)
    x_new = pd.DataFrame(preprocessing.scale(x, axis=0),
                         index=x.index, columns=x.columns)

    # Report how many principal components explain more than 99% of the
    # spectrum of X'X (informational only; x_new is not reduced).
    mat = np.dot(x_new.T, x_new)
    _, eig, _ = np.linalg.svd(mat)
    eig = (eig / eig.sum()).cumsum()
    n_pca = next((k for k, share in enumerate(eig, 1) if share > 0.99), len(eig))
    print('最大特征数目:' + str(n_pca))

    y_new = pd.DataFrame(to_categorical(y), index=y.index)
    return x_new, y_new
def pre_data(x, y, n_steps=2, train_pos=0.9):
    """Cut ``x``/``y`` into sliding windows and split them chronologically.

    Window ``s`` holds feature rows ``s .. s + n_steps - 1``; its label is the
    ``y`` row of the window's last step.

    Args:
        x: 2-D features, shape ``(n_rows, n_feature)``.
        y: 2-D one-hot labels, shape ``(n_rows, n_class)``; any class count.
        n_steps: Number of consecutive rows per window.
        train_pos: Fraction of the windows (taken from the start) used for
            training.

    Returns:
        ``(x_train, x_test, y_train, y_test)`` with shapes
        ``(n, n_steps, n_feature)`` for the features and ``(n, n_class)`` for
        the labels.  Empty arrays are returned when there are too few rows.
    """
    data = np.hstack([np.array(x), np.array(y)])
    n_feature = x.shape[1]

    # NOTE(review): `len(data) - n_steps` windows are built, as in the
    # original, so the window ending on the very last row is not produced.
    n_windows = max(len(data) - n_steps, 0)
    offsets = np.arange(n_windows)[:, None] + np.arange(n_steps)[None, :]
    result = data[offsets]  # (n_windows, n_steps, n_feature + n_class)

    row = round(train_pos * n_windows)
    # Split on the feature width instead of a hard-coded class count of 6.
    features = result[:, :, :n_feature]
    labels = result[:, -1, n_feature:]
    return features[:row], features[row:], labels[:row], labels[row:]
def bulid_model(x_train, neurons=[128,128,6], dropout=0.2):
    """Build and compile a single-layer LSTM classifier.

    Args:
        x_train: 3-D training array; only ``shape[1]`` (time steps) and
            ``shape[2]`` (features) are read to define the input shape.
        neurons: ``[lstm_units, unused, n_classes]``; the middle entry is not
            used by this single-layer model.
        dropout: Dropout rate applied after the LSTM layer.

    Returns:
        The compiled Keras ``Sequential`` model (its summary is printed).
    """
    model = Sequential()
    model.add(LSTM(neurons[0],
                   input_shape=(x_train.shape[1], x_train.shape[2]),
                   return_sequences=False))
    model.add(Dropout(dropout))
    model.add(Dense(neurons[2], activation='softmax'))
    # The model is compiled with the default 'adam' optimizer.  The original
    # also constructed Adam(decay=0.2) without ever using it; that dead
    # object is removed.
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    model.summary()
    # The `model_score(...)` call that followed the return statement could
    # never run (and would fail at import time if it were module-level), so
    # it is dropped.
    return model
def run_model(x_train,y_train,save_path, neurons=[128,128,9], dropout=0.2):
    """Train the LSTM classifier, plot its training curves and return it.

    Args:
        x_train: 3-D training features.
        y_train: 2-D one-hot training labels.
        save_path: Directory prefix for ``LSTM_loss.jpg``.
        neurons: Hidden sizes; only the first two entries are used.  The
            output size is taken from ``y_train.shape[1]`` so that it always
            matches the labels.
        dropout: Dropout rate forwarded to ``bulid_model``.

    Returns:
        The fitted model.  (Earlier versions returned ``None`` and discarded
        the trained model.)
    """
    layer_sizes = [neurons[0], neurons[1], y_train.shape[1]]
    model = bulid_model(x_train, neurons=layer_sizes, dropout=dropout)
    history = model.fit(x_train, y_train, epochs=32, batch_size=64, verbose=2)

    # Loss on the left axis, accuracy on a twin right axis, one shared legend.
    fig = plt.figure()
    ax = fig.add_subplot(111)
    loss_line = ax.plot(history.history['loss'], label='loss', color='b')
    ax2 = ax.twinx()
    acc_line = ax2.plot(history.history['accuracy'], label='acc', color='r')
    lines = loss_line + acc_line
    ax.legend(lines, [line.get_label() for line in lines], loc=2)
    plt.savefig(save_path + 'LSTM_loss.jpg', dpi=300)
    plt.show()
    plt.close()
    return model
def model_score(model, x_train, y_train, x_test, y_test):
    """Evaluate ``model`` on the train and test sets and print the results.

    ``model.evaluate`` is expected to return ``[loss, accuracy]``, which is
    what a model compiled with ``metrics=['accuracy']`` yields.

    Returns:
        ``(trainScore, testScore)`` exactly as returned by ``model.evaluate``.
    """
    trainScore = model.evaluate(x_train, y_train, verbose=0)
    # The loss is a cross-entropy, so it is reported as such rather than
    # under the MSE/RMSE labels the original printed.
    print('Train Score: loss %.5f, accuracy %.4f' % (trainScore[0], trainScore[1]))
    testScore = model.evaluate(x_test, y_test, verbose=0)
    print('Test Score: loss %.5f, accuracy %.4f' % (testScore[0], testScore[1]))
    return trainScore, testScore
def predict_model(model,x_test,y_test,save_path,x,train_pos=0.9):
    """Predict the test set, print a report and export the results to Excel.

    Args:
        model: Fitted classifier exposing ``predict``.
        x_test: 3-D test features as produced by ``pre_data``.
        y_test: One-hot true labels for ``x_test``.
        save_path: Directory prefix for the four exported workbooks.
        x: The factor frame that was passed to ``pre_data``; only its index is
            used, to date the test rows.
        train_pos: Kept for backward compatibility; no longer used because the
            test dates are derived from the number of test rows.

    Returns:
        ``(y_pred_prob, y_pred)`` as DataFrames indexed by date, with one
        column per class.
    """
    target = ['<-1.04', '-1.04~-0.37', '-0.37~-0.06', '0.06~0.46', '0.46~1.24', '>1.24']
    y_pred_prob = model.predict(x_test)
    # `Sequential.predict_classes` is gone from current tf.keras; the class is
    # the arg-max of the predicted probabilities.
    y_class = np.argmax(y_pred_prob, axis=-1)
    # Fix the width so a never-predicted top class cannot shrink the matrix.
    y_pred = to_categorical(y_class, num_classes=len(target))
    print(classification_report(y_test, y_pred, target_names=target))

    # pre_data labels each window with its last row and never builds the
    # window ending on the final row of x, so the test labels are the
    # n_test rows immediately before the last one.  Recomputing the split
    # from train_pos * len(x) could give the wrong length and shifted dates.
    n_test = len(y_pred_prob)
    last = len(x.index) - 1
    s_index = x.index[last - n_test:last]

    y_pred_prob = pd.DataFrame(y_pred_prob, columns=target, index=s_index)
    y_pred = pd.DataFrame(y_pred, columns=target, index=s_index)
    y_test = pd.DataFrame(y_test, columns=target, index=s_index)
    y_class = pd.DataFrame(y_class, index=s_index)
    y_pred_prob.to_excel(save_path + 'LSTM预测概率.xls')
    y_pred.to_excel(save_path + 'LSTM预测类别dummy.xls')
    y_test.to_excel(save_path + 'LSTM真实值类别.xls')
    y_class.to_excel(save_path + 'LSTM预测类别class.xls')
    return y_pred_prob, y_pred
def tran_porb(y_pred_prob):
    """Convert a probability matrix into a one-hot matrix of the row maxima.

    Args:
        y_pred_prob: 2-D array, one row per sample and one column per class.

    Returns:
        A float array of the same shape with 1 at each row's arg-max (the
        first maximum on ties) and 0 elsewhere.
    """
    y_pred = np.zeros((y_pred_prob.shape[0], y_pred_prob.shape[1]))
    x_max = y_pred_prob.argmax(axis=1)
    # One vectorised assignment instead of a Python loop over the rows.
    y_pred[np.arange(y_pred_prob.shape[0]), x_max] = 1
    return y_pred
def pic_lstm(df_hs, y_class, save_path=None):
    """Export and plot the true versus predicted return classes.

    Args:
        df_hs: Index data containing the ``chg`` daily-return column.
        y_class: Predicted class codes indexed by date.
        save_path: Output directory prefix.  When omitted, the module-level
            ``save_path`` is used, which is how the original code obtained it.

    Raises:
        NameError: If no ``save_path`` is given and none exists at module level.
    """
    out_dir = save_path if save_path is not None else globals().get('save_path')
    if out_dir is None:
        raise NameError("name 'save_path' is not defined; pass save_path=...")

    # Left-closed bins [lo, hi) -> class codes 0..5, the same thresholds as
    # used when the labels were built.
    edges = [-np.inf, -1.04, -0.37, 0.06, 0.46, 1.24, np.inf]
    y_new = pd.cut(df_hs['chg'], bins=edges, right=False, labels=False)
    y_new = y_new.astype('float')[y_class.index]

    df = pd.concat([y_new, y_class], axis=1)
    df.columns = ['真实值', '预测值']
    df.to_excel(out_dir + '预测汇总/lstm画图.xls')

    plt.figure(figsize=(12, 8))
    plt.plot(df['真实值'])
    plt.plot(df['预测值'])
    plt.show()
    plt.close()
#%%
if __name__ == '__main__':
    # Input / output locations (string prefixes, hence the trailing slash).
    open_path = 'data1115/'
    view_path = 'view_path/'
    save_path = 'save_path/'
    # Font settings so that Chinese labels and minus signs render in plots.
    matplotlib.rcParams['axes.unicode_minus'] = False
    plt.rcParams['font.sans-serif'] = ['SimHei']

    df_hs, df_marco, df_backup = read_data(open_path)
    x, y = handle_data(df_hs, df_marco)
    x, y = again_handle_data(x, y, df_backup)
    x_train, x_test, y_train, y_test = pre_data(x, y)

    # The original built one model, saved it untrained, let run_model train a
    # different one, and then predicted with the untrained instance.  Use the
    # model that run_model trained when it hands one back; otherwise train
    # one here so that what is saved and evaluated is a fitted model.
    model = run_model(x_train, y_train, save_path)
    if model is None:
        model = bulid_model(x_train)
        model.fit(x_train, y_train, epochs=32, batch_size=64, verbose=2)
    # Lower-case '.h5' so Keras recognises the HDF5 format by its suffix.
    model.save(save_path + 'lstm.h5')
    # trainScore, testScore = model_score(model, x_train, y_train, x_test, y_test)
    y_pred_prob, y_pred = predict_model(model, x_test, y_test, save_path, x, train_pos=0.9)