三大树模型
2020-08-17 本文已影响0人
一只当归
lgb1
# Imports for the LightGBM GroupKFold training script below.
# FIX: `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and
# removed in 0.23 — import the standalone `joblib` package instead.
import gc

import joblib
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score
# --- LightGBM (native API), GroupKFold grouped by transaction month --------
# Builds out-of-fold predictions for X_train and fold-averaged predictions
# for X_test, then reports the OOF AUC.
# NOTE(review): assumes X_train / X_test / Y_train / cols and the 'DT_M'
# (month) column are defined earlier in the notebook — confirm upstream.
oof = np.zeros(len(X_train))
preds = np.zeros(len(X_test))

params = {
    'objective': 'binary',
    'learning_rate': 0.007,
    'boosting_type': 'gbdt',
    'num_leaves': 256,
    'tree_learner': 'serial',
    'subsample': 0.7,
    'subsample_freq': 1,
    'metric': 'auc',
    'max_bin': 255,
    'colsample_bytree': 0.5,  # 0.4
    # 'categorical_feature': cat_cols
}

skf = GroupKFold(n_splits=6)
for i, (idxT, idxV) in enumerate(skf.split(X_train, Y_train, groups=X_train['DT_M'])):
    # Each fold withholds one whole month, so CV mimics predicting a
    # future month never seen in training.
    month = X_train.iloc[idxV]['DT_M'].iloc[0]
    print('Fold', i, 'withholding month', month)
    print(' rows of train =', len(idxT), 'rows of holdout =', len(idxV))
    train_data = lgb.Dataset(X_train[cols].iloc[idxT], label=Y_train.iloc[idxT])
    val_data = lgb.Dataset(X_train[cols].iloc[idxV], label=Y_train.iloc[idxV])
    # NOTE(review): the `early_stopping_rounds` / `verbose_eval` kwargs were
    # removed in LightGBM 4.x (replaced by callbacks) — this form only works
    # on LightGBM < 4; confirm the pinned version.
    clf = lgb.train(params, train_set=train_data, num_boost_round=50000,
                    valid_sets=[train_data, val_data],
                    valid_names=['train', 'valid'],
                    early_stopping_rounds=100, feval=None, verbose_eval=200)
    oof[idxV] += clf.predict(X_train[cols].iloc[idxV])
    preds += clf.predict(X_test[cols]) / skf.n_splits

# FIX: original printed roc_auc_score(Y_train, oof/5) under the label
# 'XGB95' — the /5 was a leftover constant from a 5-fold variant (harmless
# for AUC, which is scale-invariant, but misleading with 6 folds) and the
# label named the wrong model.
print('LGB OOF CV=', roc_auc_score(Y_train, oof))
lgb2
# --- LightGBM (sklearn API), 5-fold StratifiedKFold ------------------------
# Trains one model per fold, keeps out-of-fold probabilities for the custom
# TPR metric and averages per-fold test probabilities into a submission.
# NOTE(review): relies on StratifiedKFold, train / label / test / test_id
# and tpr_weight_funtion being defined elsewhere in the notebook — confirm.
lgb_model = lgb.LGBMClassifier(
    boosting_type='gbdt', num_leaves=100, reg_alpha=3, reg_lambda=5,
    max_depth=-1, n_estimators=5000, objective='binary', subsample=0.9,
    colsample_bytree=0.77, subsample_freq=1, learning_rate=0.05,
    random_state=1000, n_jobs=16, min_child_weight=4, min_child_samples=5,
    min_split_gain=0, class_weight={0: 1, 1: 2.5})

skf = StratifiedKFold(n_splits=5, random_state=2018, shuffle=True)
best_score = []
oof_preds = np.zeros(train.shape[0])
sub_preds = np.zeros(test_id.shape[0])
for index, (train_index, test_index) in enumerate(skf.split(train, label)):
    lgb_model.fit(train.iloc[train_index], label.iloc[train_index], verbose=50,
                  eval_set=[(train.iloc[train_index], label.iloc[train_index]),
                            (train.iloc[test_index], label.iloc[test_index])],
                  early_stopping_rounds=30)
    # 'valid_1' is the second eval_set entry, i.e. the holdout fold.
    best_score.append(lgb_model.best_score_['valid_1']['binary_logloss'])
    print(best_score)
    oof_preds[test_index] = lgb_model.predict_proba(
        train.iloc[test_index], num_iteration=lgb_model.best_iteration_)[:, 1]
    test_pred = lgb_model.predict_proba(
        test, num_iteration=lgb_model.best_iteration_)[:, 1]
    # FIX: average by skf.n_splits instead of a hard-coded 5 so changing
    # the fold count cannot silently mis-scale the submission predictions.
    sub_preds += test_pred / skf.n_splits

# Custom project metric; printed value m[1] suggests it returns a sequence —
# TODO confirm against tpr_weight_funtion's definition.
m = tpr_weight_funtion(y_predict=oof_preds, y_true=label)
print(m[1])
sub = pd.read_csv('submit.csv')
sub['Tag'] = sub_preds
sub.to_csv('sub/baseline_%s.csv' % str(m), index=False)
随机森林
# Imports for the random-forest GroupKFold script below.
# FIX: `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and
# removed in 0.23 — import the standalone `joblib` package instead.
import gc

import joblib
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score
# --- Random-forest baseline under the same month-grouped CV scheme ---------
# Refits the same estimator on each fold; accumulates out-of-fold class-1
# probabilities and fold-averaged test probabilities.
clf = RandomForestClassifier(n_estimators=10, criterion='gini', max_depth=8,
                             class_weight={0: 1, 1: 10})
oof = np.zeros(len(X_train))
preds = np.zeros(len(X_test))
skf = GroupKFold(n_splits=6)
fold_splits = skf.split(X_train, Y_train, groups=X_train['DT_M'])
for fold_no, (trn_idx, val_idx) in enumerate(fold_splits):
    # The held-out month identifying this fold.
    month = X_train['DT_M'].iloc[val_idx].iloc[0]
    print('Fold', fold_no, 'withholding month', month)
    print(' rows of train =', len(trn_idx), 'rows of holdout =', len(val_idx))
    clf.fit(X_train[cols].iloc[trn_idx], Y_train.iloc[trn_idx])
    oof[val_idx] += clf.predict_proba(X_train[cols].iloc[val_idx])[:, 1]
    preds += clf.predict_proba(X_test[cols])[:, 1] / skf.n_splits
xgboost
# --- XGBoost with month-grouped GroupKFold (gated behind BUILD96 flag) -----
# Same CV scheme as above: each fold withholds one month; OOF AUC reported.
if BUILD96:
    oof = np.zeros(len(X_train))
    preds = np.zeros(len(X_test))
    skf = GroupKFold(n_splits=6)
    for fold_no, (trn_idx, val_idx) in enumerate(skf.split(X_train, y_train, groups=X_train['DT_M'])):
        month = X_train['DT_M'].iloc[val_idx].iloc[0]
        print('Fold', fold_no, 'withholding month', month)
        print(' rows of train =', len(trn_idx), 'rows of holdout =', len(val_idx))
        # Fresh model each fold (unlike the RF above, which refits one object).
        clf = xgb.XGBClassifier(
            n_estimators=5000,
            max_depth=12,
            learning_rate=0.02,
            subsample=0.8,
            colsample_bytree=0.4,
            missing=-1,
            eval_metric='auc',
            # USE CPU
            # nthread=4,
            # tree_method='hist'
            # USE GPU
            tree_method='gpu_hist',
        )
        clf.fit(X_train[cols].iloc[trn_idx], y_train.iloc[trn_idx],
                eval_set=[(X_train[cols].iloc[val_idx], y_train.iloc[val_idx])],
                verbose=100, early_stopping_rounds=200)
        oof[val_idx] += clf.predict_proba(X_train[cols].iloc[val_idx])[:, 1]
        preds += clf.predict_proba(X_test[cols])[:, 1] / skf.n_splits
        # Free the fold's model before the next 5000-tree fit.
        del clf
        gc.collect()
        print('#' * 20)
    print('XGB96 OOF CV=', roc_auc_score(y_train, oof))