神经网络学习:LSTM对沪深300收益率预测

2019-11-27  本文已影响0人  黄yy家的jby

1-导库

# 常见库
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import os
# 数据预处理,sts是为了adf检验,acf是用来看滞后几期比较合适
import statsmodels.tsa.stattools as sts
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.stattools import acf

# 构建LSTM 所需库
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.utils import to_categorical
from tensorflow import keras
from sklearn.metrics import classification_report

2-数据导入

df_hs : 沪深300的相关数据:收盘价,换手率,成交量,股息率,最大振幅
df_marco : 各宏观指标
考虑披露时效,指标滞后一期(季度滞后3个月,月度滞后一个月)

def read_data(open_path):
    """Load the index and macro workbooks and build lagged daily features.

    Args:
        open_path: Directory prefix (including the trailing separator) that is
            concatenated with the two workbook file names.

    Returns:
        A tuple ``(df_hs, df, df_marco)``:
        ``df_hs`` is the index sheet with ``zhenfu`` and ``chg`` added and all
        non-target columns lagged one row; ``df`` holds the lagged daily,
        monthly and quarterly macro features re-indexed to ``df_hs.index``;
        ``df_marco`` is the macro sheet with the derived spread columns.
    """
    df_hs = pd.read_excel(open_path + 'hs300_zhenfu.xlsx', index_col=0)
    df_hs.columns = ['g', 'close', 'vol', 'ts', 'high', 'low']
    df_hs['zhenfu'] = 100 * (df_hs['high'] - df_hs['low']) / df_hs['close']
    df_hs['chg'] = 100 * df_hs['close'].pct_change(1)
    # Lag every column except the target so a row only carries values from
    # the previous row next to the current return.
    feature_cols = df_hs.columns != 'chg'
    df_hs.loc[:, feature_cols] = df_hs.loc[:, feature_cols].shift(1)
    df_hs.dropna(inplace=True, axis=0)

    df_marco = pd.read_excel(open_path + '宏观数据.xls', index_col=0)
    df_marco.columns = ['gdp', 'ppi', 'cpi', 'pmi', 'industry_add', 'investment', 'current_deposit',
                        'fixed_deposit_3', 'fixed_deposit_6', 'mlf', 'slf', 'shibor', 'rf_1', 'rf_10',
                        'm1', 'm2', 'inter_borrow', 'dollar', 'dollar_rmb', 'yen_rmb', 'social_financing',
                        'foreign_reserve', 'au', 'rare', 'aa+', 'aa']
    df_marco['m1_m2'] = df_marco['m1'] - df_marco['m2']
    df_marco['qixian'] = df_marco['rf_10'] - df_marco['rf_1']
    df_marco['xinyong_+'] = df_marco['aa+'] - df_marco['rf_1']
    df_marco['xinyong'] = df_marco['aa'] - df_marco['rf_1']

    # Split by sampling frequency.  Explicit copies keep the later column
    # assignments away from df_marco, which is returned unchanged.
    day_ = df_marco[['shibor', 'rf_1', 'rf_10', 'dollar', 'dollar_rmb', 'yen_rmb', 'au',
                     'aa+', 'aa', 'qixian', 'xinyong_+', 'xinyong']].copy()
    month_ = df_marco[['ppi', 'cpi', 'pmi', 'industry_add', 'investment', 'current_deposit',
                       'fixed_deposit_3', 'fixed_deposit_6', 'slf', 'm1', 'm2', 'm1_m2',
                       'inter_borrow', 'social_financing', 'rare']].copy()
    quarter_ = df_marco[['gdp', 'foreign_reserve']].copy()

    pct_cols = ['dollar', 'dollar_rmb', 'yen_rmb', 'au']
    day_[pct_cols] = 100 * day_[pct_cols].pct_change(1)
    day_ = day_.shift(1)

    # Lag one period: resample to month ends, shift one month, then expand
    # back to a daily grid by forward filling.
    # NOTE(review): resampling assumes the sheet index is a DatetimeIndex.
    month_ = month_.resample('M').mean().ffill()
    month_['pmi'] = 100 * month_['pmi'].pct_change(12)
    month_['social_financing'] = 100 * month_['social_financing'].pct_change(1)
    month_ = month_.shift(1)
    month_ = month_.resample('D').ffill()

    quarter_ = quarter_.resample('3M').mean().ffill()
    quarter_['foreign_reserve'] = 100 * quarter_['foreign_reserve'].pct_change(1)
    quarter_ = quarter_.shift(1)
    quarter_ = quarter_.resample('D').ffill()

    df = pd.concat([day_, month_, quarter_], axis=1)
    df = df.ffill()
    df = df.loc[df_hs.index]
    return df_hs, df, df_marco

3- 数据整合

计算技术指标数据
定义x(宏观指标,资产配置指标,利率指标,外汇指标,量价指标,滞后项)和y(沪深300日收益率)
将y按照分位数进行6等分类

def cal_ema(df, N):
    """Return the exponential moving average of ``df['close']``.

    The recursion is ``ema[0] = close[0]`` and
    ``ema[i] = a * close[i] + (1 - a) * ema[i - 1]`` with ``a = 2 / (N + 1)``.

    Args:
        df: Frame with a ``close`` column.
        N: Span of the average.

    Returns:
        A float DataFrame with a single ``close`` column on ``df.index``, so
        the result can be passed to ``cal_ema`` again.
    """
    a = 2 / (N + 1)
    prices = df['close'].to_numpy(dtype=float)
    smoothed = np.empty(len(prices), dtype=float)
    # Recurse over a plain float array instead of assigning DataFrame rows
    # one by one; a NaN input still propagates to every later value.
    for i, price in enumerate(prices):
        smoothed[i] = price if i == 0 else a * price + (1 - a) * smoothed[i - 1]
    return pd.DataFrame({'close': smoothed}, index=df.index)

def cal_dea(df, short_t=12, long_t=26 ,avg_t=9):
    """Add a ``macd`` column to ``df`` in place and return ``df``.

    The column is twice the gap between the fast/slow EMA difference and the
    ``avg_t``-span EMA of that difference.

    Args:
        df: Frame with a ``close`` column; it is modified in place.
        short_t: Span of the fast EMA.
        long_t: Span of the slow EMA.
        avg_t: Span of the EMA applied to the fast-minus-slow difference.

    Returns:
        The same ``df`` object with the ``macd`` column set.
    """
    fast_ema = cal_ema(df, short_t)
    slow_ema = cal_ema(df, long_t)
    # cal_ema returns a frame with a 'close' column, so the difference can be
    # fed straight back into it.
    spread = fast_ema - slow_ema
    signal = cal_ema(spread, avg_t)
    histogram = (spread - signal) * 2
    df['macd'] = histogram
    return df

def cal_adx(df, N=14, M=6):
    """Compute the ADX-style trend-strength series used as a feature.

    Args:
        df: Frame with ``high``, ``low`` and ``close`` columns.
        N: Window over which the directional movements are summed.
        M: Window of the final rolling mean.

    Returns:
        A DataFrame with a single ``adx`` column; rows without a full window
        (or with an undefined ratio) are dropped.
    """
    hd = df['high'].diff().dropna()
    ld = -df['low'].diff().dropna()

    # Directional movement: keep the move only where the mask holds, else 0.
    dmp = hd.where((hd > 0) & (ld < 0), 0.0).rolling(N).sum().dropna()
    dmm = ld.where((hd < 0) & (ld > 0), 0.0).rolling(N).sum().dropna()

    # True range: largest of the three candidate ranges for each row.
    prev_close = df['close'].shift(1)
    tr = pd.concat([df['high'] - df['low'],
                    (df['high'] - prev_close).abs(),
                    (df['low'] - prev_close).abs()], axis=1).dropna().max(axis=1)

    # Index.intersection is the explicit set operation; `&` on two Index
    # objects is not a set intersection in current pandas.
    s_index = dmm.index.intersection(tr.index).intersection(dmp.index)
    dmp = dmp.loc[s_index]
    dmm = dmm.loc[s_index]
    tr = tr.loc[s_index]

    pdi = 100 * dmp / tr
    mdi = 100 * dmm / tr
    dx = (pdi - mdi).abs() / (pdi + mdi) * 100
    adx = dx.rolling(M).mean().dropna()
    return adx.to_frame('adx')

def handle_hs(df_hs):
    """Attach the ``macd`` and ``adx`` indicators and drop incomplete rows.

    Args:
        df_hs: Index frame with ``close``, ``high`` and ``low`` columns; it is
            modified in place.

    Returns:
        The same frame with both indicator columns, without NaN rows.
    """
    enriched = cal_dea(df_hs)
    # cal_adx returns fewer rows than the frame; the assignment aligns on the
    # index and the unmatched rows are removed by dropna below.
    enriched['adx'] = cal_adx(enriched)
    enriched.dropna(inplace=True)
    return enriched


def handle_data(df_hs,df_marco):
    """Build the factor matrix ``x`` and the 6-class return label ``y``.

    Besides returning the data, this writes three Excel reports under the
    module-level ``save_path`` (class counts, factor correlations, and the
    correlations of the selected factors) and prints the return quantiles.

    Args:
        df_hs: Index frame as returned by ``read_data``.
        df_marco: Lagged macro features indexed like ``df_hs``.

    Returns:
        A tuple ``(x, y_new)``: the selected factor columns and the class
        label (0.0 to 5.0) of each day's return, both from 2009 onwards.
    """
    df_hs = handle_hs(df_hs)
    df_marco = df_marco.loc[df_hs.index]

    df_hs = df_hs[df_hs.index.year >= 2009]
    df_marco = df_marco[df_marco.index.year >= 2009]

    # Target: daily return, then its sextile class.
    y = df_hs.loc[:, 'chg']
    # Print the quantiles for reference; the cut points below are hard-coded.
    for i in range(1, 6):
        print(y.quantile(i / 6))
        print(i / 6)
        print("")

    # Left-closed bins: [-inf,-1.04) -> 0, [-1.04,-0.37) -> 1, ... [1.24,inf) -> 5.
    # A single cut avoids relabelling the series in place step by step.
    edges = [-np.inf, -1.04, -0.37, 0.06, 0.46, 1.24, np.inf]
    y_new = pd.cut(y, bins=edges, right=False, labels=False).astype(float)

    bin_names = pd.Series(['<-1.04', '-1.04~-0.37', '-0.37~0.06',
                           '0.06~0.46', '0.46~1.24', '>1.24'])
    y_new_count = pd.concat([bin_names, y_new.groupby(y_new).count()], axis=1)
    y_new_count.columns = ['分位数定义', 'num']
    y_new_count.to_excel(save_path + 'lstm的y分类.xls')

    # Correlation of every candidate factor with the raw return.
    x = pd.concat([df_hs[['g', 'vol', 'ts', 'zhenfu', 'macd', 'adx']], df_marco], axis=1)
    x = x.astype('float')
    dic = {}
    for i in x.columns:
        dic[i] = x[i].corr(y)
        print(i)
    df = pd.DataFrame.from_dict(dic, orient='index')
    df.to_excel(save_path + 'LSTM因子相关性.xls')
    df_temp = df[abs(df) > 0.01].dropna()

    # The final factor list is fixed by hand; only the report uses df_temp.
    target = ['g', 'vol', 'ts', 'zhenfu', 'macd', 'adx', 'shibor', 'rf_10',
              'dollar', 'aa', 'xinyong', 'cpi', 'm1_m2']
    df_temp2 = df_temp.loc[target]
    df_temp2.to_excel(save_path + 'lstm筛选后因子相关性.xls')
    x = x[target]
    return x, y_new

4-检验数据

相关性过滤--单位根检验--PCA取主成分因子(不存在共线性)

def again_handle_data(x,y,df_backup):
    """Make the factors stationary, standardise them and one-hot encode ``y``.

    Args:
        x: Factor frame containing at least ``cpi`` and ``m1_m2``.
        y: Class label series indexed like ``x``.
        df_backup: Macro sheet with ``cpi`` and ``m1_m2`` columns, used to
            rebuild the differenced versions of those two factors.

    Returns:
        A tuple ``(x_new, y_new)``: the standardised factor frame and the
        one-hot label frame on the same index.  The PCA component count is
        only printed; ``x_new`` keeps all factor columns.
    """
    def adf_test(x, y):
        """Return the ADF p-values and the names whose p-value exceeds 0.01."""
        dic = {'y': sts.adfuller(y)[1]}
        for col in x.columns:
            dic[col] = sts.adfuller(x[col])[1]
        df = pd.DataFrame.from_dict(dic, orient='index')
        target = list(df[df > 0.01].dropna().index)
        if target:
            print(target)
        else:
            print('所有因子通过单位根检验')
        return df, target

    def adf_data(x, y, df_backup):
        """Difference the non-stationary factors and realign ``x`` and ``y``."""
        # Work on a copy so the caller's frame is not modified.
        x = x.copy()
        _, target1 = adf_test(x, y)
        for col in target1:
            # 'y' is tested too but is not a column of x; only factors are differenced.
            if col in x.columns:
                x[col] = x[col].diff(1)
        x.dropna(inplace=True)
        y = y[x.index]

        # Rebuild cpi and m1_m2 from the backup sheet: lag, difference, and
        # replace the columns in x.
        month_ = df_backup[['cpi', 'm1_m2']].dropna()
        month_ = month_.shift(1)
        month_[['cpi', 'm1_m2']] = month_[['cpi', 'm1_m2']].diff(1)
        month_ = month_.dropna()
        for col in month_.columns:
            print(sts.adfuller(month_[col])[1])

        others = x[[c for c in x.columns if c not in ('cpi', 'm1_m2')]]
        merged = pd.concat([others, month_], axis=1)
        # NOTE(review): back-filling copies each later observation onto the
        # earlier rows; confirm this is intended for a forecasting feature.
        x = merged.bfill()
        x = x[x.index.year >= 2009]
        x = x.dropna()
        s_index = x.index.intersection(y.index)
        x = x.loc[s_index]
        y = y[s_index]
        return x, y

    x, y = adf_data(x, y, df_backup)
    x_new = pd.DataFrame(preprocessing.scale(x, axis=0), index=x.index, columns=x.columns)

    # Cumulative share of the singular values of X'X, used to report how many
    # components are needed to pass 99%.
    eig = np.linalg.svd(np.dot(x_new.T, x_new), compute_uv=False)
    eig = (eig / eig.sum()).cumsum()
    n_pca = next((k for k, share in enumerate(eig, 1) if share > 0.99), len(eig))
    print('最大特征数目:' + str(n_pca))

    y_new = pd.DataFrame(to_categorical(y), index=y.index)
    return x_new, y_new

5-数据准备

lstm要求数据输入有特定格式,n_steps=2是根据acf得到

def pre_data(x,y,n_steps=2):
    """Cut the data into windows of ``n_steps`` rows and split train/test.

    Window ``s`` holds feature rows ``s .. s + n_steps - 1`` and is labelled
    with the label row of its last step.  The first 90% of the windows form
    the training set.

    Args:
        x: 2-D features, shape ``(n_rows, n_features)``.
        y: 2-D labels (one column per class) with the same number of rows.
        n_steps: Window length.

    Returns:
        ``(x_train, x_test, y_train, y_test)`` where the ``x`` arrays have
        shape ``(n, n_steps, n_features)`` and the ``y`` arrays ``(n, n_classes)``.

    Raises:
        ValueError: If there are not more than ``n_steps`` rows.
    """
    features = np.array(x)
    labels = np.array(y)
    data = np.hstack([features, labels])

    # Split by the real feature count, so any number of label columns works.
    n_feature = features.shape[1]
    train_pos = 0.9

    n_windows = len(data) - n_steps
    if n_windows <= 0:
        raise ValueError('pre_data needs more than n_steps rows of data')
    result = np.array([data[s:s + n_steps] for s in range(n_windows)])

    row = round(train_pos * result.shape[0])
    x_train = result[:row, :, :n_feature]
    x_test = result[row:, :, :n_feature]
    y_train = result[:row, -1, n_feature:]
    y_test = result[row:, -1, n_feature:]
    return x_train, x_test, y_train, y_test

6-模型建立

LSTM 对输入数据有结构要求:若要堆叠两层 LSTM,第一层需要设置 return_sequences=True;下面启用的是单层模型,因此为 False。

def bulid_model(x_train, neurons=[128,128,6], dropout=0.2):
    """Build and compile a single-layer LSTM classifier.

    Args:
        x_train: Training windows; only ``shape[1]`` and ``shape[2]`` are
            read, to set the input shape.
        neurons: ``neurons[0]`` is the LSTM width and ``neurons[2]`` the
            softmax width; ``neurons[1]`` is only used by the two-layer
            variant sketched below.
        dropout: Dropout rate applied after the LSTM layer.

    Returns:
        The compiled, untrained ``Sequential`` model.
    """
    model = Sequential()
    model.add(LSTM(neurons[0], input_shape=(x_train.shape[1], x_train.shape[2]), return_sequences=False))
    model.add(Dropout(dropout))
    model.add(Dense(neurons[2], activation='softmax'))
    # The optimizer is selected by name, so no optimizer object is created.
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()

    # Two-layer variant: the first LSTM must return the full sequence.
    # model = Sequential()
    # model.add(LSTM(neurons[0], input_shape=(x_train.shape[1], x_train.shape[2]), return_sequences=True))
    # model.add(Dropout(dropout))
    # model.add(LSTM(neurons[1]))
    # model.add(Dropout(dropout))
    # model.add(Dense(neurons[2], activation='softmax'))
    # model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    # model.summary()

    return model

7-运行模型

def run_model(x_train,y_train,save_path, neurons=[128,128,9], dropout=0.2):
    """Build and fit the LSTM classifier, plot its training curve, return it.

    Args:
        x_train: Training windows passed to ``bulid_model`` and ``fit``.
        y_train: One-hot training labels, one column per class.
        save_path: Directory prefix for ``LSTM_loss.jpg``.
        neurons: Layer sizes forwarded to ``bulid_model``.  The last entry is
            replaced by the number of label columns so the softmax width
            always matches ``y_train``.
        dropout: Dropout rate forwarded to ``bulid_model``.

    Returns:
        The fitted model.
    """
    layer_sizes = list(neurons)
    layer_sizes[-1] = np.asarray(y_train).shape[1]
    model = bulid_model(x_train, neurons=layer_sizes, dropout=dropout)
    history = model.fit(x_train, y_train, epochs=32, batch_size=64, verbose=2)

    # Loss on the left axis, accuracy on a twin right axis, one shared legend.
    fig = plt.figure()
    ax = fig.add_subplot(111)
    pic1 = ax.plot(history.history['loss'], label='loss', color='b')
    ax2 = ax.twinx()
    pic2 = ax2.plot(history.history['accuracy'], label='acc', color='r')
    pic = pic1 + pic2
    labs = [line.get_label() for line in pic]
    ax.legend(pic, labs, loc=2)
    plt.savefig(save_path + 'LSTM_loss.jpg', dpi=300)
    plt.show()
    plt.close()
    return model


def model_score(model, x_train, y_train, x_test, y_test):
    """Evaluate ``model`` on the training and test sets and print the scores.

    Args:
        model: Object whose ``evaluate(x, y, verbose=0)`` returns a sequence
            with the loss first and the accuracy second.
        x_train, y_train: Training inputs and labels.
        x_test, y_test: Test inputs and labels.

    Returns:
        ``(trainScore, testScore)`` exactly as returned by ``evaluate``.
    """
    trainScore = model.evaluate(x_train, y_train, verbose=0)
    # The first entry is the compiled loss, not a mean squared error, so it is
    # reported as loss next to the accuracy.
    print('Train Score: %.5f loss (%.4f accuracy)' % (trainScore[0], trainScore[1]))

    testScore = model.evaluate(x_test, y_test, verbose=0)
    print('Test Score: %.5f loss (%.4f accuracy)' % (testScore[0], testScore[1]))
    return trainScore, testScore

8- 预测模型

def predict_model(model,x_test,y_test,save_path,x,train_pos=0.9):
    """Predict the test windows and print a per-class classification report.

    Args:
        model: Fitted classifier whose ``predict`` returns class probabilities.
        x_test: Test windows.
        y_test: One-hot test labels.
        save_path: Unused in this version.
        x: Unused in this version.
        train_pos: Unused in this version.
    """
    target = ['<-1.04', '-1.04~-0.37', '-0.37~0.06', '0.06~0.46', '0.46~1.24', '>1.24']

    y_pred_prob = model.predict(x_test)
    # Take the most probable class directly from the probabilities.
    y_class = np.argmax(y_pred_prob, axis=1)
    # Fix the width so the one-hot matrix keeps every class column even when
    # the highest class is never predicted.
    y_pred = to_categorical(y_class, num_classes=y_pred_prob.shape[1])
    print(classification_report(y_test, y_pred, target_names=target))

9-保存模型

读取模型时,直接使用 from keras.models import load_model 有可能会报错,查询后发现可能是 keras 版本过高导致的;
改用 from tensorflow.keras.models import load_model 即可。

# Save the model under save_path as 'lstm.h5'.
model.save(save_path+'lstm.h5')

# To read the model back later:
#from tensorflow.keras.models import load_model

#model = load_model(save_path+'lstm.h5')

完整代码

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import os
import statsmodels.tsa.stattools as sts
from sklearn import preprocessing
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import statsmodels.api as sm
# from statsmodels.tsa.api import VAR
from statsmodels.tsa.vector_ar.var_model import VAR
from statsmodels.tsa.stattools import acf
from arch import arch_model

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.utils import to_categorical
from tensorflow import keras
from sklearn.metrics import classification_report



import ReadData as RD
import PreHandle as PH
import warnings

warnings.filterwarnings('ignore')

#%%
# 提取原始信息
def read_data(open_path):
    """Load the index and macro workbooks and build lagged daily features.

    Args:
        open_path: Directory prefix (including the trailing separator) that is
            concatenated with the two workbook file names.

    Returns:
        A tuple ``(df_hs, df, df_marco)``:
        ``df_hs`` is the index sheet with ``zhenfu`` and ``chg`` added and all
        non-target columns lagged one row; ``df`` holds the lagged daily,
        monthly and quarterly macro features re-indexed to ``df_hs.index``;
        ``df_marco`` is the macro sheet with the derived spread columns.
    """
    df_hs = pd.read_excel(open_path + 'hs300_zhenfu.xlsx', index_col=0)
    df_hs.columns = ['g', 'close', 'vol', 'ts', 'high', 'low']
    df_hs['zhenfu'] = 100 * (df_hs['high'] - df_hs['low']) / df_hs['close']
    df_hs['chg'] = 100 * df_hs['close'].pct_change(1)
    # Lag every column except the target so a row only carries values from
    # the previous row next to the current return.
    feature_cols = df_hs.columns != 'chg'
    df_hs.loc[:, feature_cols] = df_hs.loc[:, feature_cols].shift(1)
    df_hs.dropna(inplace=True, axis=0)

    df_marco = pd.read_excel(open_path + '宏观数据.xls', index_col=0)
    df_marco.columns = ['gdp', 'ppi', 'cpi', 'pmi', 'industry_add', 'investment', 'current_deposit',
                        'fixed_deposit_3', 'fixed_deposit_6', 'mlf', 'slf', 'shibor', 'rf_1', 'rf_10',
                        'm1', 'm2', 'inter_borrow', 'dollar', 'dollar_rmb', 'yen_rmb', 'social_financing',
                        'foreign_reserve', 'au', 'rare', 'aa+', 'aa']
    df_marco['m1_m2'] = df_marco['m1'] - df_marco['m2']
    df_marco['qixian'] = df_marco['rf_10'] - df_marco['rf_1']
    df_marco['xinyong_+'] = df_marco['aa+'] - df_marco['rf_1']
    df_marco['xinyong'] = df_marco['aa'] - df_marco['rf_1']

    # Split by sampling frequency.  Explicit copies keep the later column
    # assignments away from df_marco, which is returned unchanged.
    day_ = df_marco[['shibor', 'rf_1', 'rf_10', 'dollar', 'dollar_rmb', 'yen_rmb', 'au',
                     'aa+', 'aa', 'qixian', 'xinyong_+', 'xinyong']].copy()
    month_ = df_marco[['ppi', 'cpi', 'pmi', 'industry_add', 'investment', 'current_deposit',
                       'fixed_deposit_3', 'fixed_deposit_6', 'slf', 'm1', 'm2', 'm1_m2',
                       'inter_borrow', 'social_financing', 'rare']].copy()
    quarter_ = df_marco[['gdp', 'foreign_reserve']].copy()

    pct_cols = ['dollar', 'dollar_rmb', 'yen_rmb', 'au']
    day_[pct_cols] = 100 * day_[pct_cols].pct_change(1)
    day_ = day_.shift(1)

    # Lag one period: resample to month ends, shift one month, then expand
    # back to a daily grid by forward filling.
    # NOTE(review): resampling assumes the sheet index is a DatetimeIndex.
    month_ = month_.resample('M').mean().ffill()
    month_['pmi'] = 100 * month_['pmi'].pct_change(12)
    month_['social_financing'] = 100 * month_['social_financing'].pct_change(1)
    month_ = month_.shift(1)
    month_ = month_.resample('D').ffill()

    quarter_ = quarter_.resample('3M').mean().ffill()
    quarter_['foreign_reserve'] = 100 * quarter_['foreign_reserve'].pct_change(1)
    quarter_ = quarter_.shift(1)
    quarter_ = quarter_.resample('D').ffill()

    df = pd.concat([day_, month_, quarter_], axis=1)
    df = df.ffill()
    df = df.loc[df_hs.index]
    return df_hs, df, df_marco


# 提取技术性指标
def cal_ema(df, N):
    """Return the exponential moving average of ``df['close']``.

    The recursion is ``ema[0] = close[0]`` and
    ``ema[i] = a * close[i] + (1 - a) * ema[i - 1]`` with ``a = 2 / (N + 1)``.

    Args:
        df: Frame with a ``close`` column.
        N: Span of the average.

    Returns:
        A float DataFrame with a single ``close`` column on ``df.index``, so
        the result can be passed to ``cal_ema`` again.
    """
    a = 2 / (N + 1)
    prices = df['close'].to_numpy(dtype=float)
    smoothed = np.empty(len(prices), dtype=float)
    # Recurse over a plain float array instead of assigning DataFrame rows
    # one by one; a NaN input still propagates to every later value.
    for i, price in enumerate(prices):
        smoothed[i] = price if i == 0 else a * price + (1 - a) * smoothed[i - 1]
    return pd.DataFrame({'close': smoothed}, index=df.index)

def cal_dea(df, short_t=12, long_t=26 ,avg_t=9):
    """Add a ``macd`` column to ``df`` in place and return ``df``.

    The column is twice the gap between the fast/slow EMA difference and the
    ``avg_t``-span EMA of that difference.

    Args:
        df: Frame with a ``close`` column; it is modified in place.
        short_t: Span of the fast EMA.
        long_t: Span of the slow EMA.
        avg_t: Span of the EMA applied to the fast-minus-slow difference.

    Returns:
        The same ``df`` object with the ``macd`` column set.
    """
    fast_ema = cal_ema(df, short_t)
    slow_ema = cal_ema(df, long_t)
    # cal_ema returns a frame with a 'close' column, so the difference can be
    # fed straight back into it.
    spread = fast_ema - slow_ema
    signal = cal_ema(spread, avg_t)
    histogram = (spread - signal) * 2
    df['macd'] = histogram
    return df

def cal_adx(df, N=14, M=6):
    """Compute the ADX-style trend-strength series used as a feature.

    Args:
        df: Frame with ``high``, ``low`` and ``close`` columns.
        N: Window over which the directional movements are summed.
        M: Window of the final rolling mean.

    Returns:
        A DataFrame with a single ``adx`` column; rows without a full window
        (or with an undefined ratio) are dropped.
    """
    hd = df['high'].diff().dropna()
    ld = -df['low'].diff().dropna()

    # Directional movement: keep the move only where the mask holds, else 0.
    dmp = hd.where((hd > 0) & (ld < 0), 0.0).rolling(N).sum().dropna()
    dmm = ld.where((hd < 0) & (ld > 0), 0.0).rolling(N).sum().dropna()

    # True range: largest of the three candidate ranges for each row.
    prev_close = df['close'].shift(1)
    tr = pd.concat([df['high'] - df['low'],
                    (df['high'] - prev_close).abs(),
                    (df['low'] - prev_close).abs()], axis=1).dropna().max(axis=1)

    # Index.intersection is the explicit set operation; `&` on two Index
    # objects is not a set intersection in current pandas.
    s_index = dmm.index.intersection(tr.index).intersection(dmp.index)
    dmp = dmp.loc[s_index]
    dmm = dmm.loc[s_index]
    tr = tr.loc[s_index]

    pdi = 100 * dmp / tr
    mdi = 100 * dmm / tr
    dx = (pdi - mdi).abs() / (pdi + mdi) * 100
    adx = dx.rolling(M).mean().dropna()
    return adx.to_frame('adx')

def handle_hs(df_hs):
    """Attach the ``macd`` and ``adx`` indicators and drop incomplete rows.

    Args:
        df_hs: Index frame with ``close``, ``high`` and ``low`` columns; it is
            modified in place.

    Returns:
        The same frame with both indicator columns, without NaN rows.
    """
    enriched = cal_dea(df_hs)
    # cal_adx returns fewer rows than the frame; the assignment aligns on the
    # index and the unmatched rows are removed by dropna below.
    enriched['adx'] = cal_adx(enriched)
    enriched.dropna(inplace=True)
    return enriched



def handle_data(df_hs,df_marco):
    """Build the factor matrix ``x`` and the 6-class return label ``y``.

    Besides returning the data, this writes three Excel reports under the
    module-level ``save_path`` (class counts, factor correlations, and the
    correlations of the selected factors) and prints the return quantiles.

    Args:
        df_hs: Index frame as returned by ``read_data``.
        df_marco: Lagged macro features indexed like ``df_hs``.

    Returns:
        A tuple ``(x, y_new)``: the selected factor columns and the class
        label (0.0 to 5.0) of each day's return, both from 2009 onwards.
    """
    df_hs = handle_hs(df_hs)
    df_marco = df_marco.loc[df_hs.index]

    df_hs = df_hs[df_hs.index.year >= 2009]
    df_marco = df_marco[df_marco.index.year >= 2009]

    # Target: daily return, then its sextile class.
    y = df_hs.loc[:, 'chg']
    # Print the quantiles for reference; the cut points below are hard-coded.
    for i in range(1, 6):
        print(y.quantile(i / 6))
        print(i / 6)
        print("")

    # Left-closed bins: [-inf,-1.04) -> 0, [-1.04,-0.37) -> 1, ... [1.24,inf) -> 5.
    # A single cut avoids relabelling the series in place step by step.
    edges = [-np.inf, -1.04, -0.37, 0.06, 0.46, 1.24, np.inf]
    y_new = pd.cut(y, bins=edges, right=False, labels=False).astype(float)

    bin_names = pd.Series(['<-1.04', '-1.04~-0.37', '-0.37~0.06',
                           '0.06~0.46', '0.46~1.24', '>1.24'])
    y_new_count = pd.concat([bin_names, y_new.groupby(y_new).count()], axis=1)
    y_new_count.columns = ['分位数定义', 'num']
    y_new_count.to_excel(save_path + 'lstm的y分类.xls')

    # Correlation of every candidate factor with the raw return.
    x = pd.concat([df_hs[['g', 'vol', 'ts', 'zhenfu', 'macd', 'adx']], df_marco], axis=1)
    x = x.astype('float')
    dic = {}
    for i in x.columns:
        dic[i] = x[i].corr(y)
        print(i)
    df = pd.DataFrame.from_dict(dic, orient='index')
    df.to_excel(save_path + 'LSTM因子相关性.xls')
    df_temp = df[abs(df) > 0.01].dropna()

    # The final factor list is fixed by hand; only the report uses df_temp.
    target = ['g', 'vol', 'ts', 'zhenfu', 'macd', 'adx', 'shibor', 'rf_10',
              'dollar', 'aa', 'xinyong', 'cpi', 'm1_m2']
    df_temp2 = df_temp.loc[target]
    df_temp2.to_excel(save_path + 'lstm筛选后因子相关性.xls')
    x = x[target]
    return x, y_new

def again_handle_data(x,y,df_backup):
    """Make the factors stationary, standardise them and one-hot encode ``y``.

    Args:
        x: Factor frame containing at least ``cpi`` and ``m1_m2``.
        y: Class label series indexed like ``x``.
        df_backup: Macro sheet with ``cpi`` and ``m1_m2`` columns, used to
            rebuild the differenced versions of those two factors.

    Returns:
        A tuple ``(x_new, y_new)``: the standardised factor frame and the
        one-hot label frame on the same index.  The PCA component count is
        only printed; ``x_new`` keeps all factor columns.
    """
    def adf_test(x, y):
        """Return the ADF p-values and the names whose p-value exceeds 0.01."""
        dic = {'y': sts.adfuller(y)[1]}
        for col in x.columns:
            dic[col] = sts.adfuller(x[col])[1]
        df = pd.DataFrame.from_dict(dic, orient='index')
        target = list(df[df > 0.01].dropna().index)
        if target:
            print(target)
        else:
            print('所有因子通过单位根检验')
        return df, target

    def adf_data(x, y, df_backup):
        """Difference the non-stationary factors and realign ``x`` and ``y``."""
        # Work on a copy so the caller's frame is not modified.
        x = x.copy()
        _, target1 = adf_test(x, y)
        for col in target1:
            # 'y' is tested too but is not a column of x; only factors are differenced.
            if col in x.columns:
                x[col] = x[col].diff(1)
        x.dropna(inplace=True)
        y = y[x.index]

        # Rebuild cpi and m1_m2 from the backup sheet: lag, difference, and
        # replace the columns in x.
        month_ = df_backup[['cpi', 'm1_m2']].dropna()
        month_ = month_.shift(1)
        month_[['cpi', 'm1_m2']] = month_[['cpi', 'm1_m2']].diff(1)
        month_ = month_.dropna()
        for col in month_.columns:
            print(sts.adfuller(month_[col])[1])

        others = x[[c for c in x.columns if c not in ('cpi', 'm1_m2')]]
        merged = pd.concat([others, month_], axis=1)
        # NOTE(review): back-filling copies each later observation onto the
        # earlier rows; confirm this is intended for a forecasting feature.
        x = merged.bfill()
        x = x[x.index.year >= 2009]
        x = x.dropna()
        s_index = x.index.intersection(y.index)
        x = x.loc[s_index]
        y = y[s_index]
        return x, y

    x, y = adf_data(x, y, df_backup)
    x_new = pd.DataFrame(preprocessing.scale(x, axis=0), index=x.index, columns=x.columns)

    # Cumulative share of the singular values of X'X, used to report how many
    # components are needed to pass 99%.
    eig = np.linalg.svd(np.dot(x_new.T, x_new), compute_uv=False)
    eig = (eig / eig.sum()).cumsum()
    n_pca = next((k for k, share in enumerate(eig, 1) if share > 0.99), len(eig))
    print('最大特征数目:' + str(n_pca))

    y_new = pd.DataFrame(to_categorical(y), index=y.index)
    return x_new, y_new

def pre_data(x,y,n_steps=2):
    """Cut the data into windows of ``n_steps`` rows and split train/test.

    Window ``s`` holds feature rows ``s .. s + n_steps - 1`` and is labelled
    with the label row of its last step.  The first 90% of the windows form
    the training set.

    Args:
        x: 2-D features, shape ``(n_rows, n_features)``.
        y: 2-D labels (one column per class) with the same number of rows.
        n_steps: Window length.

    Returns:
        ``(x_train, x_test, y_train, y_test)`` where the ``x`` arrays have
        shape ``(n, n_steps, n_features)`` and the ``y`` arrays ``(n, n_classes)``.

    Raises:
        ValueError: If there are not more than ``n_steps`` rows.
    """
    features = np.array(x)
    labels = np.array(y)
    data = np.hstack([features, labels])

    # Split by the real feature count, so any number of label columns works.
    n_feature = features.shape[1]
    train_pos = 0.9

    n_windows = len(data) - n_steps
    if n_windows <= 0:
        raise ValueError('pre_data needs more than n_steps rows of data')
    result = np.array([data[s:s + n_steps] for s in range(n_windows)])

    row = round(train_pos * result.shape[0])
    x_train = result[:row, :, :n_feature]
    x_test = result[row:, :, :n_feature]
    y_train = result[:row, -1, n_feature:]
    y_test = result[row:, -1, n_feature:]
    return x_train, x_test, y_train, y_test

def bulid_model(x_train, neurons=[128,128,6], dropout=0.2):
    """Build and compile a single-layer LSTM classifier.

    Args:
        x_train: Training windows; only ``shape[1]`` and ``shape[2]`` are
            read, to set the input shape.
        neurons: ``neurons[0]`` is the LSTM width and ``neurons[2]`` the
            softmax width; ``neurons[1]`` is not used.
        dropout: Dropout rate applied after the LSTM layer.

    Returns:
        The compiled, untrained ``Sequential`` model.
    """
    model = Sequential()
    model.add(LSTM(neurons[0], input_shape=(x_train.shape[1], x_train.shape[2]), return_sequences=False))
    model.add(Dropout(dropout))
    model.add(Dense(neurons[2], activation='softmax'))
    # The optimizer is selected by name, so no optimizer object is created.
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model

def run_model(x_train,y_train,save_path, neurons=[128,128,9], dropout=0.2):
    """Build and fit the LSTM classifier, plot its training curve, return it.

    Args:
        x_train: Training windows passed to ``bulid_model`` and ``fit``.
        y_train: One-hot training labels, one column per class.
        save_path: Directory prefix for ``LSTM_loss.jpg``.
        neurons: Layer sizes forwarded to ``bulid_model``.  The last entry is
            replaced by the number of label columns so the softmax width
            always matches ``y_train``.
        dropout: Dropout rate forwarded to ``bulid_model``.

    Returns:
        The fitted model.
    """
    layer_sizes = list(neurons)
    layer_sizes[-1] = np.asarray(y_train).shape[1]
    model = bulid_model(x_train, neurons=layer_sizes, dropout=dropout)
    history = model.fit(x_train, y_train, epochs=32, batch_size=64, verbose=2)

    # Loss on the left axis, accuracy on a twin right axis, one shared legend.
    fig = plt.figure()
    ax = fig.add_subplot(111)
    pic1 = ax.plot(history.history['loss'], label='loss', color='b')
    ax2 = ax.twinx()
    pic2 = ax2.plot(history.history['accuracy'], label='acc', color='r')
    pic = pic1 + pic2
    labs = [line.get_label() for line in pic]
    ax.legend(pic, labs, loc=2)
    plt.savefig(save_path + 'LSTM_loss.jpg', dpi=300)
    plt.show()
    plt.close()
    return model


def model_score(model, x_train, y_train, x_test, y_test):
    """Evaluate ``model`` on the training and test sets and print the scores.

    Args:
        model: Object whose ``evaluate(x, y, verbose=0)`` returns a sequence
            with the loss first and the accuracy second.
        x_train, y_train: Training inputs and labels.
        x_test, y_test: Test inputs and labels.

    Returns:
        ``(trainScore, testScore)`` exactly as returned by ``evaluate``.
    """
    trainScore = model.evaluate(x_train, y_train, verbose=0)
    # The first entry is the compiled loss, not a mean squared error, so it is
    # reported as loss next to the accuracy.
    print('Train Score: %.5f loss (%.4f accuracy)' % (trainScore[0], trainScore[1]))

    testScore = model.evaluate(x_test, y_test, verbose=0)
    print('Test Score: %.5f loss (%.4f accuracy)' % (testScore[0], testScore[1]))
    return trainScore, testScore


def predict_model(model,x_test,y_test,save_path,x,train_pos=0.9):
    """Predict the test windows, print a report and export the results.

    Four Excel files are written under ``save_path``: predicted
    probabilities, one-hot predictions, one-hot true labels and predicted
    class ids.

    Args:
        model: Fitted classifier whose ``predict`` returns class probabilities.
        x_test: Test windows.
        y_test: One-hot test labels.
        save_path: Directory prefix for the exported files.
        x: Full factor frame; its index supplies the row labels of the output.
        train_pos: Kept for compatibility; the test rows are now located from
            the number of predictions instead of this fraction.

    Returns:
        ``(y_pred_prob, y_pred)`` as DataFrames indexed by the test rows.
    """
    target = ['<-1.04', '-1.04~-0.37', '-0.37~0.06', '0.06~0.46', '0.46~1.24', '>1.24']

    y_pred_prob = model.predict(x_test)
    # Take the most probable class directly from the probabilities.
    y_class = np.argmax(y_pred_prob, axis=1)
    # Fix the width so the one-hot matrix keeps every class column even when
    # the highest class is never predicted.
    y_pred = to_categorical(y_class, num_classes=y_pred_prob.shape[1])
    print(classification_report(y_test, y_pred, target_names=target))

    # Label the outputs with the last n_test rows of x so the index length
    # always equals the number of predictions.
    # NOTE(review): pre_data labels a window with its last row and never emits
    # the window ending on the final row, so these labels may sit one row
    # later than the rows actually predicted - confirm before relying on dates.
    n_test = len(y_pred_prob)
    s_index = x.index[len(x.index) - n_test:]
    y_pred_prob = pd.DataFrame(y_pred_prob, columns=target, index=s_index)
    y_pred = pd.DataFrame(y_pred, columns=target, index=s_index)
    y_test = pd.DataFrame(y_test, columns=target, index=s_index)
    y_class = pd.DataFrame(y_class, index=s_index)
    y_pred_prob.to_excel(save_path + 'LSTM预测概率.xls')
    y_pred.to_excel(save_path + 'LSTM预测类别dummy.xls')
    y_test.to_excel(save_path + 'LSTM真实值类别.xls')
    y_class.to_excel(save_path + 'LSTM预测类别class.xls')
    return y_pred_prob, y_pred


def tran_porb(y_pred_prob):
    """Turn a 2-D probability matrix into a one-hot matrix of the row maxima.

    Args:
        y_pred_prob: Array of shape ``(n_samples, n_classes)``.

    Returns:
        A float array of the same shape with a single 1 per row, at the
        position of that row's largest probability.
    """
    y_pred = np.zeros((y_pred_prob.shape[0], y_pred_prob.shape[1]))
    y_pred[np.arange(y_pred_prob.shape[0]), y_pred_prob.argmax(axis=1)] = 1
    return y_pred


def pic_lstm(df_hs,y_class):
    """Plot true against predicted return classes and export the comparison.

    The comparison table is written to ``预测汇总/lstm画图.xls`` under the
    module-level ``save_path``.

    Args:
        df_hs: Index frame with a ``chg`` column.
        y_class: Single-column frame of predicted class ids; its index selects
            the rows that are compared.
    """
    # Same left-closed bins as the label definition: class 0 is below -1.04
    # and class 5 is 1.24 or above.
    edges = [-np.inf, -1.04, -0.37, 0.06, 0.46, 1.24, np.inf]
    y_new = pd.cut(df_hs['chg'], bins=edges, right=False, labels=False).astype(float)
    y_new = y_new[y_class.index]

    df = pd.concat([y_new, y_class], axis=1)
    df.columns = ['真实值', '预测值']
    df.to_excel(save_path + '预测汇总/lstm画图.xls')

    plt.figure(figsize=(12, 8))
    plt.plot(df['真实值'])
    plt.plot(df['预测值'])
    plt.show()
    plt.close()


#%%
if __name__ == '__main__':
    open_path = 'data1115/'
    view_path = 'view_path/'
    save_path = 'save_path/'

    # Font setup so Chinese labels and minus signs render in the plots.
    matplotlib.rcParams['axes.unicode_minus'] = False
    plt.rcParams['font.sans-serif'] = ['SimHei']

    df_hs, df_marco, df_backup = read_data(open_path)
    x, y = handle_data(df_hs, df_marco)
    x, y = again_handle_data(x, y, df_backup)

    x_train, x_test, y_train, y_test = pre_data(x, y)

    # Train first, then save and predict with that same fitted model.
    model = run_model(x_train, y_train, save_path)
    model.save(save_path + 'LSTM.H5')

    y_pred_prob, y_pred = predict_model(model, x_test, y_test, save_path, x, train_pos=0.9)

上一篇 下一篇

猜你喜欢

热点阅读