天池新人实战赛之[离线赛]_baseline_lgb
2020-10-07 本文已影响0人
AI信仰者
主要思路如下:
1、将实际情况简化,这里只通过预测日前两天的用户商品交互情况来预测。
为了预测19号的购买情况,我们就用17、18号的用户交互情况来预测。
所以,我们可以去根据16、17号的用户交互数据,以及18号用户的购买数据来学习出这个模型。
2、对用户进行打分(例如对某个商品浏览一次加1分,收藏加2分,加购物车加3分等等)、算一下浏览占总交互次数的比例。
浏览、收藏、加购物车、购买,对应取值分别是1、2、3、4。
因为UI表格里只含有浏览、收藏、加购物车以及购买这四个数据,想办法扩充特征
3、在18号不会购买的项的数据量比18号购买的项的数据量多得多,也就是正负样本比例相差得太大!
这对于模型学习是灾难性的,学习出的模型完全不能用。
所以,我们需要选出训练集里所有在18号购买了的,再选出同等数量或者相差不多的数量的不购买样本作为新的训练集
特征选择代码,按照顺序运行下去即可:
import os
from datetime import datetime
import pandas as pd
from utils.read_write import pdReadCsv, writeOneCsv
from utils.time_change import str_hour
# Work inside the competition data directory (Windows path from the original article).
os.chdir(r'E:\项目文件\天池新人实战赛之[离线赛]')
'''
将实际情况简化,这里只通过预测日前两天的用户商品交互情况来预测。
为了预测19号的购买情况,我们就用17、18号的用户交互情况来预测。
所以,我们可以去根据16、17号的用户交互数据,以及18号用户的购买数据来学习出这个模型。
在18号不会购买的项的数据量比18号购买的项的数据量多得多,也就是正负样本比例相差得太大!
这对于模型学习是灾难性的,学习出的模型完全不能用。
所以,我们需要选出训练集里所有在18号购买了的,再选出同等数量或者相差不多的数量的不购买样本作为新的训练集
'''
# 只留下含有P表商品的交互记录,只去考虑P表内含有的那些类别的数据去进行学习
def merge_item_user():
    """Inner-join the item (P) table with the user interaction log on item_id
    and dump the result to merge_item.csv, so later steps only consider
    interactions on items that appear in the P table."""
    items = pdReadCsv('tianchi_fresh_comp_train_item.csv', ',')
    users = pdReadCsv('tianchi_fresh_comp_train_user.csv', ',')
    joined = pd.merge(items, users, on='item_id')
    joined.to_csv('merge_item.csv', index=False)
# 统计各类别总交互次数
def type_count1():
    """Count interactions per (user_id, item_id, behavior_type, time) tuple
    and write them, with a 'type_count' column, to type_count.csv."""
    merged = pdReadCsv('merge_item.csv', ',')
    counts = (
        merged
        .groupby(['user_id', 'item_id', 'behavior_type', 'time'])
        .size()
        .rename('type_count')
        .reset_index()
    )
    counts.to_csv('type_count.csv')
# type_count1()
# 对用户进行打分(例如对某个商品浏览一次加1分,收藏加2分,加购物车加3分等等)、算一下浏览占总交互次数的比例。
# 浏览、收藏、加购物车、购买,对应取值分别是1、2、3、4。
def build_features_train():
    """Build per-(user, item) training rows from type_count.csv.

    For every (user, item) pair with interactions after 2014-12-16 00:00,
    append its rows to features_train.csv with three extra columns:
      label   - 1 if the pair has ANY purchase (behavior_type == 4), else 0
      fen     - interest score: sum of behavior_type * type_count
                (view=1, collect=2, cart=3, buy=4 per the weighting scheme)
      percent - each row's share of the pair's total interaction count
    """
    data = pdReadCsv('type_count.csv', ',')
    data['datetime'] = data['time'].map(lambda x: str_hour(x))
    # Feature window: interactions after 2014-12-16 00:00 (the days before the
    # 18th, which is the label day for the training set).
    count = data[data['datetime'] > datetime(2014, 12, 16, 0)]
    groups = count.groupby(['user_id', 'item_id'])
    for user_item, group in groups:
        group = group.copy()  # write on a copy, not on the groupby slice
        # Weighted interest score, vectorized instead of the per-row loop.
        fen = (group['behavior_type'] * group['type_count']).sum()
        # FIX: the original set 'label' inside the row loop, so only the LAST
        # row's behavior_type decided the label.  The pair is a positive
        # sample if ANY of its interactions was a purchase (type 4).
        group['label'] = int((group['behavior_type'] == 4).any())
        group['fen'] = fen
        sum_count = group['type_count'].sum()
        group['percent'] = group['type_count'].map(lambda x: round(x / sum_count, 4))
        group.to_csv('features_train.csv', mode='a', header=False)
# build_features_train()
# 因为UI表格里只含有浏览、收藏、加购物车以及购买这四个数据,想办法扩充特征
def select_train():
    """Aggregate features_train.csv into one row per (user, item) pair.

    Only pairs with activity both on/after the 18th (label window) and before
    the 18th (feature window) are kept.  Row layout written to
    features_train_row.csv:
      user_id, item_id,
      type_count sums for behavior_type 1..4 (before the 18th),
      weighted interest score,
      percent sums for behavior_type 1..4,
      label (1 if the pair had a purchase on/after the 18th, else 0).
    """
    data = pdReadCsv('features_train.csv', ',')
    data['datetime'] = data['time'].map(lambda x: str_hour(x))
    groups = data.groupby(['user_id', 'item_id'])
    for name, group in groups:
        # Label window: the 18th and later.
        count18 = group[group['datetime'] > datetime(2014, 12, 18, 0)]
        if count18.shape[0] == 0:
            continue
        # Feature window: strictly before the 18th.
        count = group[group['datetime'] < datetime(2014, 12, 18, 0)]
        if count.shape[0] == 0:
            continue
        features = [name[0], name[1]]
        type_g = count.groupby('behavior_type')
        item1 = type_g.agg({'type_count': sum}).reset_index()
        item2 = type_g.agg({'percent': sum}).reset_index()
        # Interaction counts per behavior type (view, collect, cart, buy).
        for t in (1, 2, 3, 4):
            features.append(item1[item1['behavior_type'] == t]['type_count'].sum())
        # FIX: the original computed f[2] + f[3]*2 + f[4]*3 - f[4]*6 — the cart
        # count appears twice and purchases are ignored, almost certainly a
        # typo.  Use the documented weights view=1, collect=2, cart=3, buy=4.
        features.append(features[2] + features[3] * 2 + features[4] * 3 + features[5] * 4)
        # Interaction-share (percent) sums per behavior type.
        for t in (1, 2, 3, 4):
            features.append(item2[item2['behavior_type'] == t]['percent'].sum())
        # Label: was there a purchase in the label window?
        features.append(1 if count18[count18['behavior_type'] == 4].shape[0] > 0 else 0)
        writeOneCsv(features, 'features_train_row.csv')
# select_train()
# 对用户进行打分(例如对某个商品浏览一次加1分,收藏加2分,加购物车加3分等等)、算一下浏览占总交互次数的比例。
# 浏览、收藏、加购物车、购买,对应取值分别是1、2、3、4。
def build_features_test():
    """Build per-(user, item) test rows from type_count.csv.

    Same as build_features_train but the window starts after 2014-12-17 00:00,
    and only pairs with no purchase (label == 0) are kept, since the task is
    to predict which not-yet-bought pairs will be bought on the 19th.
    Rows are appended to features_test.csv.
    """
    data = pdReadCsv('type_count.csv', ',')
    data['datetime'] = data['time'].map(lambda x: str_hour(x))
    # Feature window for prediction: interactions after 2014-12-17 00:00.
    count = data[data['datetime'] > datetime(2014, 12, 17, 0)]
    groups = count.groupby(['user_id', 'item_id'])
    for user_item, group in groups:
        group = group.copy()  # write on a copy, not on the groupby slice
        # Weighted interest score, vectorized instead of the per-row loop.
        fen = (group['behavior_type'] * group['type_count']).sum()
        # FIX: the original set 'label' inside the row loop, so only the LAST
        # row's behavior_type decided the label.  The pair counts as bought
        # if ANY of its interactions was a purchase (type 4).
        group['label'] = int((group['behavior_type'] == 4).any())
        group['fen'] = fen
        sum_count = group['type_count'].sum()
        group['percent'] = group['type_count'].map(lambda x: round(x / sum_count, 4))
        # Keep only pairs not yet purchased — these are the prediction targets.
        group = group[group['label'] == 0]
        group.to_csv('features_test.csv', mode='a', header=False)
# build_features_test()
# 因为UI表格里只含有浏览、收藏、加购物车以及购买这四个数据,想办法扩充特征
def select_test():
    """Aggregate features_test.csv into one row per (user, item) pair.

    Mirrors select_train but with no date split and no label column.
    Row layout written to features_test_row.csv:
      user_id, item_id,
      type_count sums for behavior_type 1..4,
      weighted interest score,
      percent sums for behavior_type 1..4.
    """
    data = pdReadCsv('features_test.csv', ',')
    data['datetime'] = data['time'].map(lambda x: str_hour(x))
    groups = data.groupby(['user_id', 'item_id'])
    for name, group in groups:
        features = [name[0], name[1]]
        type_g = group.groupby('behavior_type')
        item1 = type_g.agg({'type_count': sum}).reset_index()
        item2 = type_g.agg({'percent': sum}).reset_index()
        # Interaction counts per behavior type (view, collect, cart, buy).
        for t in (1, 2, 3, 4):
            features.append(item1[item1['behavior_type'] == t]['type_count'].sum())
        # FIX: the original computed f[2] + f[3]*2 + f[4]*3 - f[4]*6 — the cart
        # count appears twice and purchases are ignored, almost certainly a
        # typo.  Use the documented weights view=1, collect=2, cart=3, buy=4,
        # matching the corrected select_train layout.
        features.append(features[2] + features[3] * 2 + features[4] * 3 + features[5] * 4)
        # Interaction-share (percent) sums per behavior type.
        for t in (1, 2, 3, 4):
            features.append(item2[item2['behavior_type'] == t]['percent'].sum())
        writeOneCsv(features, 'features_test_row.csv')
# select_test()
lgb模型训练和预测代码如下:
import os
import numpy as np
from sklearn.metrics import roc_auc_score
from utils.read_write import reduce_mem_usage, pdReadCsv
# Work inside the competition data directory.
os.chdir(r'E:\项目文件\天池新人实战赛之[离线赛]')
# Load the aggregated one-row-per-(user, item) feature files produced by the
# feature-selection script above.
data = pdReadCsv('features_train_row.csv', ',')
testA = pdReadCsv('features_test_row.csv', ',')
data = reduce_mem_usage(data)
from sklearn.model_selection import KFold
# Split features/label for cross-validation.  Reported accuracy: 0.8570669971167746
X_train = data.drop(['user_id', 'label'], axis=1)
# NOTE(review): item_id remains a feature in both train and test — confirm intended.
X_test = testA.drop(['user_id'], axis=1)
y_train = data['label']
# 5-fold cross-validation
folds = 5
seed = 2020
kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
import lightgbm as lgb
"""使用lightgbm 5折交叉验证进行建模预测"""
cv_scores = []
for i, (train_index, valid_index) in enumerate(kf.split(X_train, y_train)):
    print('************************************ {} ************************************'.format(str(i + 1)))
    X_train_split, y_train_split, X_val, y_val = X_train.iloc[train_index], y_train[train_index], X_train.iloc[
        valid_index], y_train[valid_index]
    train_matrix = lgb.Dataset(X_train_split, label=y_train_split)
    valid_matrix = lgb.Dataset(X_val, label=y_val)
    # Hand-tuned LightGBM hyper-parameters (binary objective, AUC metric).
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'learning_rate': 0.01,
        'num_leaves': 114,
        'max_depth': 29,
        'min_data_in_leaf': 37,
        'min_child_weight': 1.6,
        'bagging_fraction': 0.98,
        'feature_fraction': 0.69,
        'bagging_freq': 96,
        'reg_lambda': 9,
        'reg_alpha': 7,
        'min_split_gain': 0.4,
        'boost_from_average': True,
        'nthread': 8,
        'seed': 2020,
    }
    # NOTE(review): verbose_eval / early_stopping_rounds as lgb.train kwargs were
    # removed in LightGBM 4.x — this code requires an older LightGBM (< 4.0).
    model = lgb.train(params, train_set=train_matrix, num_boost_round=2469, valid_sets=valid_matrix, verbose_eval=1000,
                      early_stopping_rounds=400)
    val_pred = model.predict(X_val, num_iteration=model.best_iteration)
    cv_scores.append(roc_auc_score(y_val, val_pred))
    print(cv_scores)
print("lgb_scotrainre_list:{}".format(cv_scores))
print("lgb_score_mean:{}".format(np.mean(cv_scores)))
print("lgb_score_std:{}".format(np.std(cv_scores)))
# NOTE(review): `model` here is only the model trained on the LAST fold; the
# other four folds' models are discarded.  Averaging predictions across folds
# would use all of them.
predict_test = model.predict(X_test)
print(predict_test)
testA['pre'] = predict_test
# Threshold the predicted purchase probability to pick the submitted pairs.
# NOTE(review): the filter threshold is 0.37588 but the filename records
# 0.3758 — the two constants are inconsistent; confirm which was intended.
testA = testA[testA['pre'] > 0.37588]
testA = testA[['user_id', 'item_id']]
testA.to_csv('mobile_recommendation_' + str(0.3758) + '.csv', index=False)