python-pandas-常用代码

2020-05-11  本文已影响0人  logi

1. 读取数据

 # Load the user-feature CSV, which has no header row, assigning the four
 # column names explicitly; empty strings are parsed as missing values.
 user_feature = pd.read_csv(self.feature_path + '/underexpose_user_feat.csv',
                                            header=None,
                                            names=['user_id', 'user_age_level', 'user_gender', 'user_city_level'],
                                            na_values=['']
                                            )

  # Load the item-feature file: one item_id column followed by two groups of
  # 128 embedding columns (emb_1_0..emb_1_127, emb_2_0..emb_2_127).
  # The separator is a regular expression matching ',[' or '],' or ',', so it
  # is written as a raw string (the backslashes are regex escapes, not string
  # escapes); regex separators require the Python parsing engine.
  # NOTE(review): a ']' at the very end of a line is not matched by this
  # pattern -- confirm the last embedding column still parses as numeric.
  item_feature_columns = (['item_id']
                          + ['emb_1_' + str(i) for i in range(128)]
                          + ['emb_2_' + str(i) for i in range(128)])
  item_feature = pd.read_csv(path, sep=r',\[|\],|,', engine='python',
                             header=None, names=item_feature_columns)



2. 存储数据

# Write train_data as CSV to the path self.sample_save_path + "_train",
# including the header row and omitting the index column.
train_data.to_csv(self.sample_save_path + "_train", index=False, header=True)

3. group by

# Group pos_data by user_id and collect each user's item_id values into a set,
# then turn the two resulting columns into a {user_id: set_of_item_ids} dict.
user_item_ = pos_data.groupby('user_id')['item_id'].agg(set).reset_index()
user_item_dict = dict(zip(user_item_['user_id'], user_item_['item_id']))


4. merge

  # Left-join d_a with the item features on item_id: every row of d_a is kept,
  # and rows whose item_id has no match get NaN in the feature columns.
  pre_sample = pd.merge(d_a, self.item_feature, on='item_id', how='left')

5. fill na

def _fill_NA(df):
    """Return a copy of ``df`` with missing values replaced by per-column defaults.

    Defaults applied (only to columns that are present in ``df``):

    * ``user_id`` and ``item_id``: ``-1``
    * ``user_city_level`` and ``user_age_level``: the string ``'-1'``
    * ``user_gender``: ``'O'``
    * ``emb_1_0`` .. ``emb_1_127`` and ``emb_2_0`` .. ``emb_2_127``: ``0``

    Columns without a default keep their missing values. ``df`` itself is
    not modified; the filled frame is returned.
    """
    fill_values = {
        'user_id': -1,
        'item_id': -1,
        'user_city_level': '-1',
        'user_age_level': '-1',
        'user_gender': 'O',
    }

    # TODO: decide whether missing embeddings should use the column mean or
    # random values instead of 0.
    for prefix in ('emb_1_', 'emb_2_'):
        fill_values.update({prefix + str(i): 0 for i in range(128)})

    return df.fillna(value=fill_values)
上一篇 下一篇

猜你喜欢

热点阅读