
Feature importance analysis with XGBoost in Python

2016-04-11 · trieyouth

Code

import pandas as pd
import xgboost as xgb
import operator

def get_data():
    train = pd.read_csv("first_result2.csv")
    # this dataset only has a handful of columns; the first 11 are used as features
    features = list(train.columns[:11])
    y_train = train['target']
    # target mean encoding: map each categorical (object) value to the mean target of its category
    for feat in train.select_dtypes(include=['object']).columns:
        m = train.groupby([feat])['target'].mean()
        train[feat] = train[feat].replace(m)
    x_train = train[features]
    return x_train, y_train

x_train, y_train = get_data()

# adjust these parameters for your own data
xgb_params = {'booster': 'gbtree', 'objective': 'binary:logistic', 'eta': 0.01, 'max_depth': 5, 'silent': 0, 'colsample_bytree': 0.7}
num_rounds = 1000

dtrain = xgb.DMatrix(x_train, label=y_train)
gbdt = xgb.train(xgb_params, dtrain, num_rounds)
# get_fscore() returns {feature: number of times the feature is used in a split}
importance = gbdt.get_fscore()
importance = sorted(importance.items(), key=operator.itemgetter(1))

print(importance)
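
The categorical handling inside get_data is a simple form of target mean encoding: every value of an object-typed column is replaced by the average target of the rows carrying that value. A minimal sketch on a made-up toy frame (the column names here are purely illustrative, not from the dataset above) shows what the groupby/replace pair produces:

import pandas as pd

# toy data: 'gender' is a categorical (object) column, 'target' is the 0/1 label
toy = pd.DataFrame({'gender': ['m', 'f', 'm', 'f', 'f'],
                    'target': [1, 0, 1, 0, 1]})

# mean target per category: 'm' -> 1.0, 'f' -> 0.333...
m = toy.groupby(['gender'])['target'].mean()

# replace each category with its mean target, turning the column numeric
toy['gender'] = toy['gender'].replace(m)
print(toy['gender'].tolist())  # [1.0, 0.333..., 1.0, 0.333..., 0.333...]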

Result

[('gender', 578), ('is_sys', 1202), ('is_font_cem', 1448), ('is_sup_cem', 1507), ('ite_phone_num', 1669), ('is_dou_kard', 1729), ('is_auto', 1796), ('age', 2235), ('now_month', 2596), (' avg_flow', 2914), ('avr_cost', 4149)]
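
get_fscore() reports the 'weight' importance, i.e. how many times each feature was chosen as a split over the 1000 boosting rounds, which is why the numbers above are large counts. If matplotlib is installed, the same scores can be plotted directly, and on reasonably recent xgboost versions the booster can also be queried by average gain, which often ranks features differently. A rough sketch, assuming the gbdt booster from the listing above is still in scope:

import matplotlib.pyplot as plt

# bar chart of the split counts ('weight'), same numbers as the printed list above
xgb.plot_importance(gbdt)
plt.tight_layout()
plt.savefig("importance_weight.png")

# average gain per split usually gives a different ranking than raw split counts
gain = gbdt.get_score(importance_type='gain')
for feat, score in sorted(gain.items(), key=operator.itemgetter(1), reverse=True):
    print(feat, score)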