# coding=utf-8
# 天池：O2O优惠券使用预测 (Tianchi: O2O coupon usage prediction)
# 2018-08-28, scottzcw (本文已影响164人)

import os
import sys
from datetime import datetime, date
from string import Template

import numpy as np
import pandas as pd
from dateutil.parser import parse
from sklearn import ensemble, svm
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score
from sklearn.tree import DecisionTreeRegressor

# Python 2 only: force the process-wide default string encoding to UTF-8.
# Neither the ``reload`` builtin nor ``sys.setdefaultencoding`` exists on
# Python 3, where the default encoding is already UTF-8, so the hack is
# skipped there instead of raising NameError.
if sys.version_info[0] < 3:
    reload(sys)
    sys.setdefaultencoding("utf-8")

def getDiscountType(row):
    """Classify a raw ``Discount_rate`` value as a 0/1 flag.

    Args:
        row: A single discount value, e.g. ``"150:20"``, ``"0.95"`` or NaN.

    Returns:
        0 when the value is missing or contains ``':'`` (a threshold-style
        "man:jian" coupon); 1 for anything else (e.g. a direct rate).
    """
    # str() keeps the ':' test safe for non-string values such as floats,
    # matching how the sibling helpers inspect the value.
    if pd.isnull(row) or ':' in str(row):
        return 0
    return 1

def convertRate(row):
    """Convert a raw discount value to a rate.

    Args:
        row: A single discount value, e.g. ``"200:50"``, ``"0.95"`` or NaN.

    Returns:
        1.0 for a missing value; ``1 - second/first`` for a ``"first:second"``
        value; otherwise the value itself converted to ``float``.

    Raises:
        ZeroDivisionError: If the part before ``':'`` is zero.
        ValueError: If a part cannot be parsed as a number.
    """
    if pd.isnull(row):
        return 1.0
    text = str(row)
    if ':' in text:
        # Split the string form so a non-str value cannot fail on .split().
        parts = text.split(':')
        return 1.0 - float(parts[1]) / float(parts[0])
    return float(row)

def getDiscountMan(row):
    """Return the integer before ``':'`` in a ``"man:jian"`` discount value.

    Presumably "man" is the spend threshold of a "spend X, save Y" coupon.

    Args:
        row: A single discount value, e.g. ``"150:20"``, ``"0.95"`` or NaN.

    Returns:
        The first ``':'``-separated field as ``int``, or 0 when the value
        contains no ``':'`` (direct rate or missing).
    """
    text = str(row)
    if ':' not in text:
        return 0
    return int(text.split(':')[0])

def getDiscountJian(row):
    """Return the integer after ``':'`` in a ``"man:jian"`` discount value.

    Presumably "jian" is the amount taken off by a "spend X, save Y" coupon.

    Args:
        row: A single discount value, e.g. ``"150:20"``, ``"0.95"`` or NaN.

    Returns:
        The second ``':'``-separated field as ``int``, or 0 when the value
        contains no ``':'`` (direct rate or missing).
    """
    text = str(row)
    if ':' not in text:
        return 0
    return int(text.split(':')[1])

def getWeekday(row):
    """Return the weekday (1 = Monday ... 7 = Sunday) of a ``YYYYMMDD`` string.

    Args:
        row: A date written as ``"YYYYMMDD"``, the string ``'nan'`` (what
            ``astype(str)`` produces for a missing value), or a real
            NaN/None.

    Returns:
        An ``int`` in 1..7, or ``np.nan`` when the date is missing.

    Raises:
        ValueError: If the text is not a valid ``YYYYMMDD`` date.
    """
    # Accept a genuine missing value as well as its stringified form.
    if pd.isnull(row) or row == 'nan':
        return np.nan
    text = str(row)
    year, month, day = int(text[0:4]), int(text[4:6]), int(text[6:8])
    return date(year, month, day).weekday() + 1

# Show every column when a DataFrame is printed.
pd.set_option('display.max_columns', None)

# Data directory (machine-specific path; adjust to where the CSV files live).
# NOTE: the name shadows the ``dir`` builtin but is kept because later parts
# of the script read it.
dir = r"D:\zcw\tianchi"

# Load the offline training data. Only ``sep`` is passed: ``delimiter`` is an
# alias of it and pandas rejects the call when both are given. os.path.join
# replaces the "\c..." literals, which relied on an invalid escape sequence.
train_df = pd.read_csv(
    os.path.join(dir, "ccf_offline_stage1_train.csv"),
    sep=',',
    dtype={'User_id': str, 'Date': str, 'Coupon_id': str,
           'Date_received': str, 'Discount_rate': str, 'Distance': str},
)

# Load the online training data.
train_ol_df = pd.read_csv(
    os.path.join(dir, "ccf_online_stage1_train.csv"),
    sep=',',
    dtype={'User_id': str, 'Coupon_id': str, 'Action': str, 'Date': str},
)

# Per-coupon usage ratio from the online log:
#   rows with a non-missing Date / all rows, grouped by Coupon_id.
train_ol_df_all = train_ol_df.groupby('Coupon_id', as_index=False)['User_id'].count()

# The original filter was the string comparison Date > '2016', which selects
# the rows that have a Date at all; notnull() states that intent directly
# (assumes every recorded Date falls in 2016 or later -- TODO confirm).
train_ol_df_cons = (
    train_ol_df[train_ol_df['Date'].notnull()]
    .groupby('Coupon_id', as_index=False)['User_id']
    .count()
)

# Left merge keeps coupons that never had a dated row; the two count columns
# come out as User_id_x (all rows) and User_id_y (dated rows, NaN if none).
train_ol = pd.merge(train_ol_df_all, train_ol_df_cons, how='left', on='Coupon_id')

# Vectorised division instead of a row-wise apply; NaN stays NaN.
train_ol['cratio'] = train_ol['User_id_y'] / train_ol['User_id_x']

print(train_ol.head(10))

# Discount ratio second/first for "first:second" coupons (e.g. "150:20" ->
# 20/150); values without ':' (direct rates, missing) get 0.
train_df['ratio'] = train_df['Discount_rate'].map(
    lambda x: float(str(x).split(':')[1]) / float(str(x).split(':')[0])
    if ':' in str(x) else 0
)

# Days between receiving the coupon and the purchase date; NaN when either
# date is missing.
start = pd.to_datetime(train_df['Date'])
end = pd.to_datetime(train_df['Date_received'])
days = start - end
train_df['days'] = days.dt.days

# Attach the per-coupon online usage ratio; coupons without one get 0.
train_df = pd.merge(train_df, train_ol, how='left', on='Coupon_id')
train_df['cratio'] = train_df['cratio'].where(train_df['cratio'] > 0, 0)

# Label: '1' when the coupon was used within 15 days of being received.
# It is derived from the raw day count BEFORE missing values are zero-filled,
# so a same-day redemption (0 days) counts as positive instead of being
# confused with "never used". The '1'/'0' string encoding is kept as before.
used_within_window = (train_df['days'] >= 0) & (train_df['days'] <= 15)

# Missing (and non-positive) day counts become 0.
train_df['days'] = train_df['days'].where(train_df['days'] > 0, 0)

train_df['label'] = used_within_window.map({True: '1', False: '0'})

# Distance is loaded as text; convert it to numbers and fill missing with 0
# rather than comparing strings against 0.
train_df['Distance'] = pd.to_numeric(train_df['Distance'], errors='coerce').fillna(0)

# Features parsed from the raw discount string.
train_df['discount_man'] = train_df['Discount_rate'].apply(getDiscountMan)
train_df['discount_jian'] = train_df['Discount_rate'].apply(getDiscountJian)
train_df['discount_type'] = train_df['Discount_rate'].apply(getDiscountType)

# Weekday the coupon was received (1..7, NaN if missing), one-hot encoded
# into p1..p7; a missing weekday yields all zeros.
train_df['weekday'] = train_df['Date_received'].astype(str).apply(getWeekday)
for day in range(1, 8):
    train_df['p%d' % day] = (train_df['weekday'] == day).astype(np.int32)

# The raw columns are no longer needed once the features are derived.
train_df.drop(['Date', 'Date_received', 'Discount_rate'], axis=1, inplace=True)

print(train_df.head(10))

# Feature columns fed to the model (order preserved from the original script).
feature_cols = ['ratio', 'cratio', 'Distance', 'discount_man', 'discount_jian',
                'discount_type', 'p7', 'p1', 'p2', 'p3', 'p4', 'p5', 'p6']

# Split the data: the first 900000 rows train the model, the rest are held out.
# iloc is used because .loc slices include BOTH end labels, which put row
# 900000 into the training and the hold-out set at the same time.
split_at = 900000
train_features = train_df[feature_cols].astype(float)

# Standardise with statistics learned from the training split only, and reuse
# them for the hold-out and prediction data. Scaling each set independently
# (as preprocessing.scale did) gives every set a different transformation.
scaler = preprocessing.StandardScaler()
diabetes_X_train = scaler.fit_transform(train_features.iloc[:split_at])
diabetes_X_test = scaler.transform(train_features.iloc[split_at:])

# Labels as a 1-D integer target (the column may hold '1'/'0' strings).
diabetes_y_train = train_df['label'].iloc[:split_at].astype(int)
diabetes_y_test = train_df['label'].iloc[split_at:].astype(int)

regr = linear_model.LinearRegression()
# Alternatives left by the original author (recorded figure where noted):
# regr = svm.SVR(C=1000,)
# regr = DecisionTreeRegressor()  # 0.04779884855102401
# regr = linear_model.LogisticRegression()  # 0.97
# regr = ensemble.RandomForestRegressor(n_estimators=20, oob_score=True)  # 0.04778887161750578
# regr = ensemble.AdaBoostRegressor(n_estimators=50)  # 0.04079561330108894
# regr = ensemble.AdaBoostClassifier(n_estimators=50)

# Train the model using the training split.
regr.fit(diabetes_X_train, diabetes_y_train)

# Make predictions on the hold-out split.
diabetes_y_pred = regr.predict(diabetes_X_test)
# diabetes_y_pred = diabetes_y_pred / 0.2
# print('score: \n', roc_auc_score(diabetes_y_test, diabetes_y_pred))
# print('clf: \n', regr.coef_)

# Load the data to predict on. Discount_rate is forced to text so the
# discount helpers always receive strings; only ``sep`` is passed because
# pandas rejects ``sep`` together with ``delimiter``.
test_df = pd.read_csv(
    os.path.join(dir, "ccf_offline_stage1_test_revised.csv"),
    sep=',',
    dtype={'Date': str, 'Coupon_id': str, 'Date_received': str,
           'Distance': str, 'Discount_rate': str},
)

# Same feature engineering as for the training data.
test_df['ratio'] = test_df['Discount_rate'].map(
    lambda x: float(str(x).split(':')[1]) / float(str(x).split(':')[0])
    if ':' in str(x) else 0
)
test_df['Distance'] = pd.to_numeric(test_df['Distance'], errors='coerce').fillna(0)

test_df = pd.merge(test_df, train_ol, how='left', on='Coupon_id')
test_df['cratio'] = test_df['cratio'].where(test_df['cratio'] > 0, 0)

test_df['discount_man'] = test_df['Discount_rate'].apply(getDiscountMan)
test_df['discount_jian'] = test_df['Discount_rate'].apply(getDiscountJian)
test_df['discount_type'] = test_df['Discount_rate'].apply(getDiscountType)

test_df['weekday'] = test_df['Date_received'].astype(str).apply(getWeekday)
for day in range(1, 8):
    test_df['p%d' % day] = (test_df['weekday'] == day).astype(np.int32)

# Scale with the transformation fitted on the training split.
p_X = scaler.transform(test_df[feature_cols].astype(float))

# Predict.
p_Y = regr.predict(p_X)
# print(p_Y[1:10])

test_df['Probability'] = p_Y
# print(test_df.head(10))

result_df = test_df[['User_id', 'Coupon_id', 'Date_received', 'Probability']].copy()

# A linear model can output negative scores; floor them at 0.
result_df['Probability'] = result_df['Probability'].clip(lower=0)

print(result_df.head(10))

# Save the submission (no header row, no index column).
result_df.to_csv(os.path.join(dir, "result.csv"), sep=',', header=False, index=False)

# 天池：O2O优惠券使用预测
#
# 猜你喜欢
#
# 热点阅读