Expedia数据挖掘(Kaggle比赛)
2018-09-08 本文已影响0人
SJTU_JORY
1.使用了SGDLR和Random Forest两种方法
2.结合data leakage
3.最终得分49.999,在Kaggle排行榜中能排到104位(共1700多队)
1.leakage solution
# -*- coding: utf-8 -*-
from heapq import nlargest
from operator import itemgetter
def _add_weight(table, key, cluster, weight):
    """Add ``weight`` to ``table[key][cluster]``, creating missing entries."""
    scores = table.setdefault(key, {})
    scores[cluster] = scores.get(cluster, 0) + weight


def leakage_deal(path="train.csv"):
    """Build the four weighted lookup tables used by the leakage solution.

    The training CSV is scanned once. Every row adds a weight for its
    ``hotel_cluster`` (column 23) to up to three of the tables, depending
    on which identifying fields are non-empty.

    Args:
        path: Training CSV to read. The first line is skipped as a header.

    Returns:
        A tuple ``(best_h00, best_h01, best_hotels_odd_ulc,
        best_hotels_miss_odd)``. Each table maps ``hash(key string)`` to a
        dict ``{hotel_cluster: accumulated weight}``. Keys are built with
        the builtin ``hash``, so the tables are only meaningful inside the
        process that created them.
    """
    best_hotels_odd_ulc = {}
    best_hotels_miss_odd = {}
    best_h00 = {}
    best_h01 = {}

    # The context manager closes the file even when a malformed row raises.
    with open(path, "r") as f:
        f.readline()  # skip the header row
        for raw_line in f:
            line = raw_line.strip()
            if line == '':
                # A blank line is treated as the end of the data.
                break
            arr = line.split(",")
            book_year = int(arr[0][:4])
            book_month = int(arr[0][5:7])
            user_location_city = arr[5]
            orig_destination_distance = arr[6]
            user_id = arr[7]
            srch_destination_id = arr[16]
            hotel_country = arr[21]
            hotel_market = arr[22]
            is_booking = float(arr[18])
            hotel_cluster = arr[23]

            # Months elapsed since December 2012; later rows weigh more.
            relative_ref_month = ((book_year-2012)*12+(book_month-12))
            # Squared recency, with bookings boosted over plain clicks.
            append_weight = relative_ref_month*relative_ref_month*(3+17.60*is_booking)

            has_distance = orig_destination_distance != ''
            has_user_search = (user_location_city != '' and user_id != ''
                               and srch_destination_id != '')

            if has_user_search and hotel_country != '':
                full_key = hash(':'.join((user_id, user_location_city,
                                          srch_destination_id, hotel_country,
                                          hotel_market)))
                # Same key string, but rows with and without a distance go
                # to separate tables.
                target = best_h00 if has_distance else best_hotels_miss_odd
                _add_weight(target, full_key, hotel_cluster, append_weight)

            if has_user_search and has_distance:
                s01 = hash(':'.join((user_id, srch_destination_id,
                                     hotel_country, hotel_market)))
                _add_weight(best_h01, s01, hotel_cluster, append_weight)

            if user_location_city != '' and has_distance:
                s1 = hash(user_location_city + ':' + orig_destination_distance)
                # This table is weighted by recency only, not by booking.
                _add_weight(best_hotels_odd_ulc, s1, hotel_cluster,
                            relative_ref_month)

    return best_h00, best_h01, best_hotels_odd_ulc, best_hotels_miss_odd
def _write_top_clusters(out, scores, limit, filled):
    """Append the best-scoring clusters of ``scores`` to the current row.

    Clusters already in ``filled`` are skipped and writing stops once
    ``filled`` holds five entries. Returns the number of clusters written.
    """
    written = 0
    # sorted() first, so equal scores are ranked in a reproducible order.
    for cluster, _ in nlargest(limit, sorted(scores.items()), key=itemgetter(1)):
        if cluster in filled:
            continue
        if len(filled) == 5:
            break
        out.write(' ' + cluster)
        filled.append(cluster)
        written += 1
    return written


def submit(best_h00, best_h01, best_hotels_odd_ulc, best_hotels_miss_odd,
           test_path="test.csv", out_path="leakage_deal.csv"):
    """Write the leakage-based submission file for every test row.

    For each test row up to five clusters are written, drawn in order from
    ``best_hotels_odd_ulc``, then ``best_hotels_miss_odd`` (only when the
    distance field is empty), then ``best_h01`` (only when the row's full
    key is absent from ``best_h00``). Rows without any match get an empty
    cluster list.

    Args:
        best_h00, best_h01, best_hotels_odd_ulc, best_hotels_miss_odd:
            Lookup tables as returned by ``leakage_deal``.
        test_path: Test CSV to read; the first line is skipped as a header.
        out_path: Destination of the ``id,hotel_cluster`` file.

    Returns:
        None. The number of clusters taken from each table is printed.
    """
    count0 = 0
    count00 = 0
    count1 = 0
    # Both files are closed on exit, including when a malformed row raises.
    with open(test_path, "r") as f, open(out_path, "w") as out:
        f.readline()  # skip the header row
        out.write("id,hotel_cluster\n")
        for count, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if count % 100000 == 0:
                print('Write {} lines...'.format(count))
            if line == '':
                # A blank line is treated as the end of the data.
                break
            arr = line.split(",")
            row_id = arr[0]
            user_location_city = arr[6]
            orig_destination_distance = arr[7]
            user_id = arr[8]
            srch_destination_id = arr[17]
            hotel_country = arr[20]
            hotel_market = arr[21]

            out.write(str(row_id) + ',')
            filled = []

            s1 = hash(user_location_city + ':' + orig_destination_distance)
            if s1 in best_hotels_odd_ulc:
                count1 += _write_top_clusters(out, best_hotels_odd_ulc[s1], 5, filled)

            # One key string serves both best_hotels_miss_odd and best_h00.
            full_key = hash(':'.join((user_id, user_location_city,
                                      srch_destination_id, hotel_country,
                                      hotel_market)))
            if orig_destination_distance == '' and full_key in best_hotels_miss_odd:
                count0 += _write_top_clusters(out, best_hotels_miss_odd[full_key], 4, filled)

            s01 = hash(':'.join((user_id, srch_destination_id,
                                 hotel_country, hotel_market)))
            if s01 in best_h01 and full_key not in best_h00:
                count00 += _write_top_clusters(out, best_h01[s01], 4, filled)

            out.write("\n")
    print('count 1=',count1)
    print('count 0=',count0)
    print('count 00=',count00)
# Build the lookup tables from the training data, then write the
# leakage-based submission from them.
leak_tables = leakage_deal()
best_h00, best_h01, best_hotels_odd_ulc, best_hotels_miss_odd = leak_tables
submit(*leak_tables)
2.Random Forest
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import h5py
from sklearn.ensemble import RandomForestClassifier
def pre_deal(data):
    """Add date-derived features to ``data`` in place.

    String-typed ``srch_ci`` / ``date_time`` columns are repaired (values
    ending in ``'00'`` become ``'2015-12-31'``) and converted to datetimes;
    the two columns are handled independently of each other. Missing values
    are then filled with 0 and these columns are added:
    ``live_in_days``, ``date_to_live_days``, ``ci_month``, ``ci_day``,
    ``date_month``, ``date_day`` and ``date_hour``. Finally ``date_time``,
    ``user_id``, ``srch_ci`` and ``srch_co`` are dropped.

    Args:
        data: DataFrame holding at least ``date_time``, ``user_id``,
            ``srch_ci`` and ``srch_co``. It is modified in place.

    Returns:
        None.
    """
    for column in ('srch_ci', 'date_time'):
        try:
            # na=False keeps missing entries out of the mask; a mask that
            # contains NaN cannot be used for the .loc assignment below.
            malformed = data[column].str.endswith('00', na=False)
        except AttributeError:
            # .str only exists on string columns, so a column that was
            # already parsed as datetime needs no repair.
            continue
        data.loc[malformed, column] = '2015-12-31'
        data[column] = pd.to_datetime(data[column])

    # NOTE(review): this also fills missing values in the datetime columns;
    # what a 0 becomes there depends on the pandas version - confirm.
    data.fillna(0, inplace=True)

    one_day = np.timedelta64(1, 'D')
    # calculate the duration in hotel
    data['live_in_days'] = (data.srch_co - data.srch_ci) / one_day
    # calculate the time from book to live in the hotel
    data['date_to_live_days'] = (data.srch_ci - data.date_time) / one_day

    data['ci_month'] = data['srch_ci'].dt.month
    data['ci_day'] = data['srch_ci'].dt.day
    data['date_month'] = data['date_time'].dt.month
    data['date_day'] = data['date_time'].dt.day
    data['date_hour'] = data['date_time'].dt.hour
    data.drop(['date_time', 'user_id', 'srch_ci', 'srch_co'], axis=1, inplace=True)
import os

# agg1 holds, for every (srch_destination_id, hotel_country, hotel_market),
# one column per hotel_cluster with that cluster's share of the blended
# booking/row score. It is cached on disk because building it needs a full
# pass over train.csv.
if os.path.exists('srch_dest_hc_hm_agg.csv'):
    agg1 = pd.read_csv('srch_dest_hc_hm_agg.csv')
else:
    reader = pd.read_csv('train.csv', parse_dates=['date_time', 'srch_ci', 'srch_co'], chunksize=200000)
    # Per-chunk booking sums and row counts, combined across chunks below.
    pieces = [chunk.groupby(['srch_destination_id','hotel_country','hotel_market','hotel_cluster'])['is_booking'].agg(['sum','count']) for chunk in reader]
    agg = pd.concat(pieces).groupby(level=[0,1,2,3]).sum()
    agg.dropna(inplace=True)
    agg['sum_and_cnt'] = 0.85*agg['sum'] + 0.15*agg['count']
    # Normalize each column within its (destination, country, market) group.
    # transform('sum') returns group totals aligned to the original index,
    # so the index levels stay exactly as reset_index() below expects.
    agg = agg.astype(float) / agg.groupby(level=[0,1,2]).transform('sum')
    agg.reset_index(inplace=True)
    agg1 = agg.pivot_table(index=['srch_destination_id','hotel_country','hotel_market'], columns='hotel_cluster', values='sum_and_cnt').reset_index()
    agg1.to_csv('srch_dest_hc_hm_agg.csv', index=False)
    #clean memory (these names only exist when the table was rebuilt)
    del pieces,agg
destinations = pd.read_csv('destinations.csv')
submission = pd.read_csv('sample_submission.csv')
# warm_start keeps the trees already grown, so each later fit only adds
# the newly requested estimator.
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, warm_start=True)
chunksize = 200000
count = 0
reader = pd.read_csv('train.csv', parse_dates=['date_time', 'srch_ci', 'srch_co'], chunksize=chunksize)
for chunk in reader:
    try:
        # Train on bookings only, enriched with the destination features
        # and the per-destination cluster shares.
        chunk = chunk[chunk.is_booking == 1]
        chunk = chunk.merge(destinations, how='left', on='srch_destination_id')
        chunk = chunk.merge(agg1, how='left', on=['srch_destination_id', 'hotel_country', 'hotel_market'])
        pre_deal(chunk)
        y = chunk.hotel_cluster
        chunk.drop(['cnt', 'hotel_cluster', 'is_booking'], axis=1, inplace=True)
        # Only chunks whose bookings cover 100 distinct clusters are fitted
        # (presumably so every warm-start fit sees the same classes - confirm).
        if len(y.unique()) == 100:
            clf.set_params(n_estimators=clf.n_estimators + 1)
            clf.fit(chunk, y)
        count += chunksize
        print(count,' have done')
        # Stop after 300 successfully processed chunks.
        if count == 300 * chunksize:
            break
    except Exception as e:
        # Best effort: report the failing chunk and move on to the next.
        print(str(e))
count = 0
chunksize = 10000
# One row of class probabilities per submission row, filled chunk by chunk.
preds = np.empty((submission.shape[0],clf.n_classes_))
reader = pd.read_csv('test.csv', parse_dates=['date_time', 'srch_ci', 'srch_co'], chunksize=chunksize)
for chunk in reader:
    # Same feature pipeline as in training (test.csv carries an extra id).
    chunk = pd.merge(chunk, destinations, how='left', on='srch_destination_id')
    chunk = pd.merge(chunk, agg1, how='left', on=['srch_destination_id','hotel_country','hotel_market'])
    chunk.drop(['id'], axis=1, inplace=True)
    pre_deal(chunk)
    pred = clf.predict_proba(chunk)
    preds[count:(count + chunk.shape[0]),:] = pred
    count = count + chunksize
    print(count,' have done')
del clf,agg1
# rf.h5 accumulates the probabilities of successive runs: an existing
# dataset is added to this run's predictions and then overwritten.
if os.path.exists('rf.h5'):
    with h5py.File('rf.h5', 'r+') as hf:
        predslatesthf = hf['preds_latest']
        # Read with [...]: Dataset.value no longer exists in h5py 3.
        preds += predslatesthf[...]
        predslatesthf[...] = preds
else:
    with h5py.File('rf.h5', 'w') as hf:
        hf.create_dataset('preds_latest', data=preds)
# The five most probable column indices per row, best first.
fea_ind = np.argsort(-preds, axis=1)[:,:5]
happend = [' '.join(row.astype(str)) for row in fea_ind]
submit = pd.DataFrame(data=happend, index=submission.id)
submit.reset_index(inplace=True)
submit.columns = submission.columns
submit.to_csv('rf_deal.csv', index=False)
3.SGDLR
# -*- coding: utf-8 -*-
import pandas as pd
from scipy.sparse import csr_matrix, hstack
import numpy as np
import h5py
import pickle
from sklearn.linear_model import SGDClassifier
import os
# Columns that pre_process prefixes with their own name and hashes.
cat_col = [
    'user_id',
    'user_location_city',
    'srch_destination_id',
    'srch_destination_type_id',
    'hotel_continent',
    'hotel_country',
    'hotel_market',
]
# Columns that are fed to the model with their values unchanged.
num_col = [
    'is_mobile',
    'is_package',
]
def bin_time(t):
    """Map a number ``t`` to one of five ordinal buckets.

    Returns 0 for ``t < 0``, 1 for ``t < 2``, 2 for ``t < 7``, 3 for
    ``t < 30`` and 4 otherwise. A value that compares false against every
    bound (such as NaN) therefore also lands in bucket 4.
    """
    # The bucket number is the position of the first upper bound above t.
    for bucket, upper_bound in enumerate((0, 2, 7, 30)):
        if t < upper_bound:
            return bucket
    return 4
def pre_process(data):
    """Turn the raw columns of ``data`` into hashed categorical features.

    String-typed ``srch_ci`` / ``date_time`` columns are repaired (values
    ending in ``'00'`` become ``'2015-12-31'``) and converted to datetimes,
    missing values are filled with 0, and then:

    * ``ci_month`` is the month of ``srch_ci``;
    * ``date_to_live_days`` is the ``srch_ci - date_time`` gap in days,
      bucketed by ``bin_time``;
    * ``season_dest`` and ``time_dest`` cross the destination id with
      ``ci_month`` and with the bucketed gap;
    * both crosses and every column in ``cat_col`` are replaced by an
      integer hash of ``<column name><value>``.

    Hashing uses CRC32 rather than the builtin ``hash``: string hashes from
    the builtin are salted per interpreter process, so a model reloaded
    from disk would otherwise see different feature indices.

    Args:
        data: DataFrame to modify in place.

    Returns:
        None.
    """
    # Imported here so the block does not depend on a file-level import.
    from zlib import crc32

    def stable_hash(text):
        # Same input string, same integer, in every process.
        return crc32(text.encode('utf-8'))

    for column in ('srch_ci', 'date_time'):
        try:
            # na=False keeps missing entries out of the mask; a mask that
            # contains NaN cannot be used for the .loc assignment below.
            malformed = data[column].str.endswith('00', na=False)
        except AttributeError:
            # .str only exists on string columns, so a column that was
            # already parsed as datetime needs no repair.
            continue
        data.loc[malformed, column] = '2015-12-31'
        data[column] = pd.to_datetime(data[column])

    # NOTE(review): this also fills missing values in the datetime columns;
    # what a 0 becomes there depends on the pandas version - confirm.
    data.fillna(0, inplace=True)

    data['ci_month'] = data['srch_ci'].dt.month
    season_dest = 'season_dest' + data.ci_month.map(str) + '*' + data.srch_destination_id.map(str)
    data['season_dest'] = season_dest.map(stable_hash)

    days_ahead = (data.srch_ci - data.date_time) / np.timedelta64(1, 'D')
    data['date_to_live_days'] = days_ahead.map(bin_time)
    time_dest = 'time_dest' + data.date_to_live_days.map(str) + '*' + data.srch_destination_id.map(str)
    data['time_dest'] = time_dest.map(stable_hash)

    for col in cat_col:
        # The column name is part of the hashed text, so equal raw values
        # in different columns get different hashes.
        data[col] = (col + data[col].map(str)).map(stable_hash)
# The sample submission supplies the ids and column names of the output.
submission = pd.read_csv('sample_submission.csv')
# All hashed feature columns: the raw categoricals plus the two crosses
# that pre_process adds.
cat_col_all = [*cat_col, 'season_dest', 'time_dest']
def map5eval(preds, actual):
    """Return the mean average precision at 5 of ``preds``.

    Args:
        preds: 2-D array of scores, one row per sample and one column per
            class; the column index is used as the class label.
        actual: True class label per sample (array-like or Series), in the
            same row order as ``preds``.

    Returns:
        The mean over samples of ``1 / rank`` of the true class within the
        five highest-scoring columns, counting 0 when it is not among
        them. Returns 0.0 when there are no samples.
    """
    # Compare by position, not by a pandas index.
    actual = np.asarray(actual)
    if actual.shape[0] == 0:
        return 0.0
    # Columns ordered best first, cut to the top five.
    top5 = np.argsort(-preds, axis=1)[:, :5]
    hits = top5 == actual[:, None]
    rank_weights = 1.0 / np.arange(1, top5.shape[1] + 1)
    return (hits * rank_weights).sum() / actual.shape[0]
# Continue from a saved model when one exists, otherwise start fresh.
if os.path.exists('sgd.pkl'):
    # NOTE: unpickling executes code stored in the file; only load an
    # sgd.pkl that this script wrote.
    with open('sgd.pkl', 'rb') as f:
        clf = pickle.load(f)
else:
    # NOTE(review): newer scikit-learn releases call this loss 'log_loss';
    # confirm the name against the installed version.
    clf = SGDClassifier(loss='log', alpha=0.0000025, verbose=0)
#clf.sparsify()
chunksize = 200000
# Width of the hashed one-hot block; every hashed column is folded into
# it with a modulo. The prediction code below the loop reads it as well.
n_features = 3000000
for epoch in range(5):
    count = 0
    print('Epoch: ', epoch)
    reader = pd.read_csv('train.csv', parse_dates=['date_time', 'srch_ci', 'srch_co'], chunksize=chunksize)
    for chunk in reader:
        # Counted once per chunk, whether or not the chunk fails below.
        count = count + chunksize
        try:
            pre_process(chunk)
            y = chunk.hotel_cluster
            # Bookings weigh five times as much as clicks.
            sw = 1 + 4*chunk.is_booking
            chunk.drop(['cnt', 'hotel_cluster', 'is_booking'], axis=1, inplace=True)
            XN = csr_matrix(chunk[num_col].values)
            X = csr_matrix((chunk.shape[0], n_features))
            rows = np.arange(chunk.shape[0])
            for col in cat_col_all:
                dat = np.ones(chunk.shape[0])
                cols = chunk[col] % n_features
                X += csr_matrix((dat, (rows, cols)), shape=(chunk.shape[0], n_features))
            X = hstack((XN, X)).tocsr()
            # Row positions of the bookings, taken straight from the weights
            # so they do not depend on the running row counter.
            booked = np.flatnonzero((sw > 1).values)
            X_test = X[booked]
            y_test = y.iloc[booked]
            clf.partial_fit(X, y, classes=np.arange(100), sample_weight=sw)
            # Progress metric on this chunk's bookings (rows just trained on).
            map5 = map5eval(clf.predict_proba(X_test), y_test)
            print((count, map5),' have done')
        except Exception as e:
            # Best effort: report the failing chunk and move on to the next.
            print(str(e))
        # Cap each epoch at 200 chunks, including chunks that failed.
        if count // chunksize >= 200:
            break
    # Checkpoint after every epoch; the last write is the final model.
    with open('sgd.pkl', 'wb') as f:
        pickle.dump(clf, f)
count = 0
chunksize = 10000
# Per-chunk probabilities are collected and stacked once after the loop;
# stacking inside the loop would re-copy all earlier rows on every chunk.
pred_chunks = []
reader = pd.read_csv('test.csv', parse_dates=['date_time', 'srch_ci', 'srch_co'], chunksize=chunksize)
for chunk in reader:
    chunk.drop(['id'], axis=1, inplace=True)
    pre_process(chunk)
    # Same feature layout as in training: numeric columns first, then the
    # hashed one-hot block of width n_features.
    XN = csr_matrix(chunk[num_col].values)
    X = csr_matrix((chunk.shape[0], n_features))
    rows = np.arange(chunk.shape[0])
    for col in cat_col_all:
        dat = np.ones(chunk.shape[0])
        cols = chunk[col] % n_features
        X += csr_matrix((dat, (rows, cols)), shape=(chunk.shape[0], n_features))
    X = hstack((XN, X))
    pred_chunks.append(clf.predict_proba(X))
    count = count + chunksize
    print(count,' have done')
preds = np.vstack(pred_chunks) if pred_chunks else np.empty((0,100))
del clf
# Store the raw probabilities so the blend step can read them back.
if os.path.exists('sgd.h5'):
    with h5py.File('sgd.h5', 'r+') as hf:
        predshf = hf['preds']
        predshf[...] = preds
else:
    with h5py.File('sgd.h5', 'w') as hf:
        hf.create_dataset('preds', data=preds)
# The five most probable column indices per row, best first.
col_ind = np.argsort(-preds, axis=1)[:,:5]
hc = [' '.join(row.astype(str)) for row in col_ind]
submit = pd.DataFrame(data=hc, index=submission.id)
submit.reset_index(inplace=True)
submit.columns = submission.columns
submit.to_csv('sgdlr_deal.csv', index=False)
4.blend
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from sklearn.preprocessing import normalize
import h5py
submission = pd.read_csv('sample_submission.csv')
# read in RF results
# Each row is L1-normalized first, so the two models are mixed on the same
# scale: 0.54 for the random forest, 0.46 for SGD.
with h5py.File('rf.h5', 'r') as hf:
    predshf = hf['preds_latest']
    # Read with [...]: Dataset.value no longer exists in h5py 3.
    preds = 0.54*normalize(predshf[...], norm='l1', axis=1)
# read in SGD results
# 'sgd.h5' is the file the SGD script writes its 'preds' dataset to.
with h5py.File('sgd.h5', 'r') as hf:
    predshf = hf['preds']
    preds += 0.46*normalize(predshf[...], norm='l1', axis=1)
# The five most probable column indices per row, best first.
col_ind = np.argsort(-preds, axis=1)[:,:5]
hc = [' '.join(row.astype(str)) for row in col_ind]
sub = pd.DataFrame(data=hc, index=submission.id)
sub.reset_index(inplace=True)
sub.columns = submission.columns
sub.to_csv('blend_deal.csv', index=False)
5.stack
# -*- coding: utf-8 -*-
import pandas as pd
# Leakage predictions: one list of cluster tokens per test row. Rows with
# no prediction are read as missing and become [''].
leak_column = pd.read_csv('leakage_deal.csv').fillna('')['hotel_cluster']
match_pred = [cell.split(' ') for cell in leak_column.tolist()]

# Blended model predictions, tokenized the same way; ids come from here.
blend_frame = pd.read_csv('blend_deal.csv')
ids = blend_frame.id
pred_sub = [cell.split(' ') for cell in blend_frame['hotel_cluster'].tolist()]
def f0(seq, idfun=None):
    """Return the items of ``seq`` without duplicates, in first-seen order.

    Args:
        seq: Iterable of items.
        idfun: Optional function mapping an item to the marker used to
            detect duplicates; defaults to the item itself. Markers must
            be hashable.

    Returns:
        A new list with the first item for every distinct marker. Items
        whose marker equals ``''`` are left out.
    """
    if idfun is None:
        def idfun(x): return x
    seen = set()
    result = []
    for item in seq:
        marker = idfun(item)
        if marker in seen or marker == '':
            continue
        seen.add(marker)
        result.append(item)
    return result
# Leakage picks go first and the blended picks fill the remaining slots;
# f0 removes blanks and repeats before the list is cut to five.
full_preds = [f0(match_pred[i] + blended)[:5] for i, blended in enumerate(pred_sub)]
write_p = [" ".join(map(str, picks)) for picks in full_preds]
write_frame = ["id,hotel_cluster"]
write_frame.extend("{0},{1}".format(ids[i], joined) for i, joined in enumerate(write_p))
with open("final_predictions.csv", "w+") as f:
    f.write("\n".join(write_frame))
结果:
image.png
image.png