# 天池:O2O优惠券使用预测 (Tianchi: O2O coupon usage prediction)
#coding=utf-8
import os
import pandasas pd
import numpyas np
import sys
from datetimeimport datetime,date
from stringimport Template
from dateutil.parserimport parse
from sklearnimport linear_model
from sklearn.linear_modelimport Ridge
from sklearn.metricsimport mean_squared_error, r2_score,roc_auc_score
from sklearnimport preprocessing
from sklearn.treeimport DecisionTreeRegressor
from sklearnimport ensemble,svm
# Python 2 only: ``sys.setdefaultencoding`` is not reachable after interpreter
# start-up, so ``sys`` is reloaded to get it back before switching the implicit
# str/unicode conversion codec to UTF-8.  Python 3 has no ``reload`` builtin
# and already defaults to UTF-8, so the hack is skipped there instead of
# raising NameError.
if sys.version_info[0] < 3:
    reload(sys)
    sys.setdefaultencoding("utf-8")
def getDiscountType(row):
    """Classify a raw ``Discount_rate`` value by its format.

    Args:
        row: One cell of the discount column (string, number or missing).

    Returns:
        0 when the value is missing or written as "a:b" (contains a colon),
        1 for anything else (e.g. a plain rate such as "0.9").
    """
    if pd.isnull(row):
        return 0
    # str() keeps numeric cells (e.g. 0.95 parsed as float) from raising
    # TypeError on the membership test; the sibling helpers do the same.
    return 0 if ':' in str(row) else 1
def convertRate(row):
    """Convert a raw discount value into a numeric rate.

    Missing values map to 1.0, an "a:b" value maps to ``1 - b / a``, and any
    other value is returned as ``float(row)``.
    """
    if pd.isnull(row):
        return 1.0
    if ':' not in str(row):
        return float(row)
    parts = row.split(':')
    threshold = float(parts[0])
    reduction = float(parts[1])
    return 1.0 - reduction / threshold
def getDiscountMan(row):
    """Return the integer before the first ':' of an "a:b" discount value.

    Values without a colon (plain rates, missing values) yield 0.
    Presumably "a" is the spend threshold of the coupon -- confirm against
    the data description.
    """
    if ':' not in str(row):
        return 0
    first_part = row.split(':')[0]
    return int(first_part)
def getDiscountJian(row):
    """Return the integer after the first ':' of an "a:b" discount value.

    Values without a colon (plain rates, missing values) yield 0.
    Presumably "b" is the amount taken off by the coupon -- confirm against
    the data description.
    """
    if ':' not in str(row):
        return 0
    second_part = row.split(':')[1]
    return int(second_part)
def getWeekday(row):
    """Return the ISO-style weekday (1 = Monday ... 7 = Sunday) of a date.

    Args:
        row: A date written as ``YYYYMMDD`` (string or integer), or a missing
            value.

    Returns:
        An int in 1..7, or ``np.nan`` when the value is missing.  Missing
        means a real null (``None``/NaN) or one of the textual placeholders
        'nan', 'null', 'None', 'NaT' or the empty string.

    Raises:
        ValueError: If a non-missing value is not a valid ``YYYYMMDD`` date.
    """
    # Real nulls reach this function when the column was not stringified
    # first; slicing them would raise TypeError.
    if pd.isnull(row):
        return np.nan
    text = str(row)
    if text in ('nan', 'null', 'None', 'NaT', ''):
        return np.nan
    year, month, day = int(text[0:4]), int(text[4:6]), int(text[6:8])
    # date.weekday() is 0-based starting on Monday; shift to 1..7.
    return date(year, month, day).weekday() + 1
# Show every column when printing DataFrames.
pd.set_option('display.max_columns', None)

# Data directory.  NOTE: this name shadows the ``dir`` builtin; it is kept
# because later parts of the script refer to it.
dir = r"D:\zcw\tianchi"

# Load the offline training data; ID, date, discount and distance columns are
# kept as strings.  Only ``sep`` is passed: recent pandas versions reject
# ``sep`` and ``delimiter`` given together.
train_df = pd.read_csv(
    os.path.join(dir, "ccf_offline_stage1_train.csv"),
    sep=',',
    dtype={'User_id': str, 'Date': str, 'Coupon_id': str,
           'Date_received': str, 'Discount_rate': str, 'Distance': str},
)

# Load the online training data.
train_ol_df = pd.read_csv(
    os.path.join(dir, "ccf_online_stage1_train.csv"),
    sep=',',
    dtype={'User_id': str, 'Coupon_id': str, 'Action': str, 'Date': str},
)

# Coupon usage ratio computed from the online data, per Coupon_id:
#   User_id_x = number of rows for the coupon,
#   User_id_y = number of those rows whose Date compares greater than '2016'
#               (presumably the rows with a consumption date -- verify),
#   cratio    = User_id_y / User_id_x (NaN when the coupon has no such row).
train_ol_df_all = train_ol_df.groupby('Coupon_id', as_index=False)['User_id'].count()
train_ol_df_cons = (
    train_ol_df[train_ol_df['Date'] > '2016']
    .groupby('Coupon_id', as_index=False)['User_id']
    .count()
)
train_ol = pd.merge(train_ol_df_all, train_ol_df_cons, how='left', on='Coupon_id')
# Column-wise division is always true division (a row-wise ``apply`` on two
# integer counts would floor-divide under Python 2) and avoids a Python-level
# loop over every coupon.
train_ol['cratio'] = train_ol['User_id_y'] / train_ol['User_id_x']
print(train_ol.head(10))
# Discount ratio: for an "a:b" discount value this is b / a, otherwise 0.
train_df['ratio'] = train_df['Discount_rate'].map(
    lambda x: float(str(x).split(':')[1]) / float(str(x).split(':')[0])
    if ':' in str(x) else 0
)

# Days between receiving the coupon and the consumption date
# (NaN when either date is missing).
start = pd.to_datetime(train_df['Date'])
end = pd.to_datetime(train_df['Date_received'])
days = start - end
train_df['days'] = days.dt.days

# Attach the online coupon usage ratio; coupons without one get 0.
train_df = pd.merge(train_df, train_ol, how='left', on='Coupon_id')
train_df['cratio'] = train_df['cratio'].where(train_df['cratio'] > 0, 0)

# Label: 1 when the coupon was used within 15 days of being received.
# It is derived BEFORE missing ``days`` are replaced by 0, so that a coupon
# used on the very day it was received (days == 0) is still a positive and is
# not confused with "never used".  Stored as int so it can be used directly
# as a numeric target.
train_df['label'] = train_df['days'].between(0, 15).astype(int)

# Missing (or non-positive) day counts become 0.
train_df['days'] = train_df['days'].where(train_df['days'] > 0, 0)

# Distance is loaded as text; convert it to numbers before comparing with 0
# (comparing str with int raises TypeError on Python 3).  Missing -> 0.
distance = pd.to_numeric(train_df['Distance'], errors='coerce')
train_df['Distance'] = distance.where(distance > 0, 0)

# Features parsed from the raw discount string.
train_df['discount_man'] = train_df['Discount_rate'].apply(getDiscountMan)
train_df['discount_jian'] = train_df['Discount_rate'].apply(getDiscountJian)
train_df['discount_type'] = train_df['Discount_rate'].apply(getDiscountType)

# Weekday (1..7) on which the coupon was received, one-hot encoded as p1..p7.
train_df['weekday'] = train_df['Date_received'].astype(str).apply(getWeekday)
for weekday_no in range(1, 8):
    train_df['p%d' % weekday_no] = (train_df['weekday'] == weekday_no).astype(np.int32)

# The raw columns are no longer needed once the features are derived.
del train_df['Date']
del train_df['Date_received']
del train_df['Discount_rate']

print(train_df.head(10))
# Feature columns fed to the model, in the order used for both fitting and
# prediction.
feature_cols = ['ratio', 'cratio', 'Distance', 'discount_man', 'discount_jian',
                'discount_type', 'p7', 'p1', 'p2', 'p3', 'p4', 'p5', 'p6']

# Split the data: the first 900000 rows are used for fitting, the remainder
# is a hold-out set.  Positional slicing keeps the two parts disjoint
# (label-based ``.loc[:900000]`` / ``.loc[900000:]`` both include row 900000).
split_at = 900000
train_part = train_df.iloc[:split_at]
holdout_part = train_df.iloc[split_at:]

# Standardise features.  The scaler is fitted on the training part only and
# the SAME mean/variance is then applied to the hold-out and prediction data;
# scaling each set with its own statistics would give the model inputs on a
# different scale from the one it was fitted on.
scaler = preprocessing.StandardScaler()
diabetes_X_train = scaler.fit_transform(train_part[feature_cols].astype(float))
diabetes_X_test = scaler.transform(holdout_part[feature_cols].astype(float))
# 1-D integer targets, so that ``predict`` returns a 1-D array.
diabetes_y_train = train_part['label'].astype(int)
diabetes_y_test = holdout_part['label'].astype(int)

regr = linear_model.LinearRegression()
# Alternatives tried by the original author (the trailing number is the value
# noted at the time; the metric was not recorded):
#   svm.SVR(C=1000)
#   DecisionTreeRegressor()                                         0.04779884855102401
#   linear_model.LogisticRegression()                               0.97
#   ensemble.RandomForestRegressor(n_estimators=20, oob_score=True) 0.04778887161750578
#   ensemble.AdaBoostRegressor(n_estimators=50)                     0.04079561330108894
#   ensemble.AdaBoostClassifier(n_estimators=50)

# Train the model using the training sets
regr.fit(diabetes_X_train, diabetes_y_train)
# Make predictions using the hold-out set; these can be scored with
# roc_auc_score(diabetes_y_test, diabetes_y_pred).
diabetes_y_pred = regr.predict(diabetes_X_test)

# Load the data to predict.  Only ``sep`` is passed: recent pandas versions
# reject ``sep`` and ``delimiter`` given together.  IDs and the discount
# column are read as strings, matching how the training data is loaded.
test_df = pd.read_csv(
    os.path.join(dir, "ccf_offline_stage1_test_revised.csv"),
    sep=',',
    dtype={'User_id': str, 'Date': str, 'Coupon_id': str,
           'Date_received': str, 'Discount_rate': str, 'Distance': str},
)

# Build the same features as for the training data.
test_df['ratio'] = test_df['Discount_rate'].map(
    lambda x: float(str(x).split(':')[1]) / float(str(x).split(':')[0])
    if ':' in str(x) else 0
)
# Distance is loaded as text; convert to numbers before comparing with 0.
test_distance = pd.to_numeric(test_df['Distance'], errors='coerce')
test_df['Distance'] = test_distance.where(test_distance > 0, 0)
test_df = pd.merge(test_df, train_ol, how='left', on='Coupon_id')
test_df['cratio'] = test_df['cratio'].where(test_df['cratio'] > 0, 0)
test_df['discount_man'] = test_df['Discount_rate'].apply(getDiscountMan)
test_df['discount_jian'] = test_df['Discount_rate'].apply(getDiscountJian)
test_df['discount_type'] = test_df['Discount_rate'].apply(getDiscountType)
test_df['weekday'] = test_df['Date_received'].astype(str).apply(getWeekday)
for weekday_no in range(1, 8):
    test_df['p%d' % weekday_no] = (test_df['weekday'] == weekday_no).astype(np.int32)
p_X = scaler.transform(test_df[feature_cols].astype(float))

# Predict
p_Y = regr.predict(p_X)
test_df['Probability'] = p_Y

result_df = test_df[['User_id', 'Coupon_id', 'Date_received', 'Probability']].copy()
# Negative (or missing) regression outputs are written as 0.
result_df['Probability'] = result_df['Probability'].where(result_df['Probability'] > 0, 0)
print(result_df.head(10))

# Save the result without header or index.
result_df.to_csv(os.path.join(dir, "result.csv"), sep=',', header=False, index=False)