python基础-24-数据分析python——pandas——

2019-04-02 · 本文已影响0人 · 比特跃动

本章内容包括：用户消费行为RFM分层模型

//input1
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
# Column names for the order log, supplied explicitly to the reader below:
#   user_id        - user ID
#   order_dt       - purchase date
#   order_products - number of products purchased
#   order_amount   - purchase amount
columns = ['user_id', 'order_dt', 'order_products', 'order_amount']
# Fields are separated by runs of whitespace. The pattern is a raw string so
# that '\s' reaches the regex engine instead of being read as an (invalid)
# string escape sequence.
df = pd.read_table('CDNOW_master.txt', names=columns, sep=r'\s+')
# Parse the YYYYMMDD purchase date into a real datetime column.
df['order_dt'] = pd.to_datetime(df.order_dt, format='%Y%m%d')
# Truncate each purchase date to month precision for month-level grouping.
df['month'] = df.order_dt.values.astype('datetime64[M]')
# Print column dtypes, non-null counts and memory usage.
df.info()



//output1
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69659 entries, 0 to 69658
Data columns (total 5 columns):
user_id           69659 non-null int64
order_dt          69659 non-null datetime64[ns]
order_products    69659 non-null int64
order_amount      69659 non-null float64
month             69659 non-null datetime64[ns]
dtypes: datetime64[ns](2), float64(1), int64(2)
memory usage: 2.7 MB








//input2
# Preview the first rows of the order table.
df.head()



//output2
user_id order_dt    order_products  order_amount    month
0   1   1997-01-01  1   11.77   1997-01-01
1   2   1997-01-12  1   12.00   1997-01-01
2   2   1997-01-12  5   77.00   1997-01-01
3   3   1997-01-02  2   20.76   1997-01-01
4   3   1997-03-30  2   20.76   1997-03-01








//input3
# Group the order log by user so per-user totals can be computed.
grouped_user = df.groupby('user_id')
# Sum only the two numeric columns before describing them: the frame also
# holds the datetime columns 'order_dt' and 'month', which cannot be summed
# (newer pandas raises instead of silently dropping such columns).
grouped_user[['order_products', 'order_amount']].sum().describe()


//output3
    order_products  order_amount
count   23570.000000    23570.000000
mean    7.122656    106.080426
std 16.983531   240.925195
min 1.000000    0.000000
25% 1.000000    19.970000
50% 3.000000    43.395000
75% 7.000000    106.475000
max 1033.000000 13990.930000








//input4
# Collapse the order log to one row per user: latest purchase date,
# total amount spent and total number of products bought.
rfm = pd.pivot_table(
    df,
    index='user_id',
    values=['order_products', 'order_amount', 'order_dt'],
    aggfunc={
        'order_dt': 'max',
        'order_amount': 'sum',
        'order_products': 'sum',
    },
)
rfm.head()



//output4
order_amount    order_dt    order_products
user_id         
1   11.77   1997-01-01  1
2   89.00   1997-01-12  6
3   156.46  1998-05-28  16
4   100.50  1997-12-12  7
5   385.61  1998-01-03  29








//input5
# Preview each user's deviation from the column mean for R, F and M.
# NOTE: the 'R', 'F' and 'M' columns are only created by the first two
# statements of the following cell (input6: the R computation and the
# rename); run those first, otherwise this column lookup fails.
rfm [['R','F','M']].apply(lambda x:x-x.mean()).head()



//output5
R   F   M
user_id         
1   177.778362  -6.122656   -94.310426
2   166.778362  -1.122656   -17.080426
3   -334.221638 8.877344    50.379574
4   -167.221638 -0.122656   -5.580426
5   -189.221638 21.877344   279.529574








//input6
# R (recency): days between each user's last purchase and the most recent
# purchase date in the whole table, expressed as a float number of days.
rfm['R'] = (rfm['order_dt'].max() - rfm['order_dt']) / np.timedelta64(1, 'D')
# F (frequency) is the total number of products, M (monetary) the total amount.
rfm.rename(
    columns={'order_products': 'F', 'order_amount': 'M'},
    inplace=True,
)
def rfm_func(x, threshold=1):
    """Map one user's R/F/M values to a customer-segment label.

    Each of the three values is turned into a flag: '1' when the value is
    greater than or equal to ``threshold``, otherwise '0'. The flags are
    concatenated in R, F, M order and looked up in the segment table.

    Parameters
    ----------
    x : pandas.Series
        A row indexable by the keys 'R', 'F' and 'M'. Any other entries
        are ignored.
    threshold : float, optional
        Cut-off used for all three values. Defaults to 1, which is the
        value the original code hard-coded.
        NOTE(review): the caller in this file passes mean-centred values,
        so ``threshold=0`` would split exactly at the mean; the default is
        kept at 1 so existing results are unchanged.

    Returns
    -------
    str
        The segment label for the row.

    Raises
    ------
    KeyError
        If 'R', 'F' or 'M' is missing from ``x``.
    """
    segments = {
        '111': '重要价值客户',
        '011': '重要保持客户',
        '101': '重要发展客户',
        '001': '重要挽留客户',
        '110': '一般价值客户',
        '010': '一般保持客户',
        '100': '一般发展客户',
        '000': '一般挽留客户',
    }
    # Missing values (NaN) compare as False and therefore get flag '0'.
    code = ''.join(
        '1' if x[key] >= threshold else '0' for key in ('R', 'F', 'M')
    )
    return segments[code]

# Centre R, F and M on their column means, then label every user row by row.
rfm['label'] = (
    rfm[['R', 'F', 'M']]
    .apply(lambda col: col - col.mean())
    .apply(rfm_func, axis=1)
)
# Colour the '重要价值客户' segment green and every other segment red.
rfm.loc[rfm['label'] == '重要价值客户', 'color'] = 'g'
rfm.loc[rfm['label'] != '重要价值客户', 'color'] = 'r'
# Scatter plot of F (x axis) against R (y axis), coloured by segment.
rfm.plot.scatter(x='F', y='R', c=rfm['color'])









//output6

RFM

//input7
# Total M, F and R per customer segment. The numeric columns are selected
# explicitly because the frame also holds a datetime column ('order_dt') and
# a string column ('color'), which must not take part in the sum.
rfm.groupby('label')[['M', 'F', 'R']].sum()



//output7
M   F   R
label           
一般价值客户  1767.11 182 8512.0
一般保持客户  5100.77 492 7782.0
一般发展客户  445233.28   29915   6983699.0
一般挽留客户  215075.77   15428   621894.0
重要价值客户  147180.09   9849    286676.0
重要保持客户  1555586.51  105509  476502.0
重要发展客户  49905.80    2322    174340.0
重要挽留客户  80466.30    4184    96009.0





//input8
# Count the entries of every column within each customer segment.
rfm.groupby('label').count()



//output8
M   order_dt    F   R   color
label                   
一般价值客户  18  18  18  18  18
一般保持客户  53  53  53  53  53
一般发展客户  14138   14138   14138   14138   14138
一般挽留客户  3493    3493    3493    3493    3493
重要价值客户  631 631 631 631 631
重要保持客户  4267    4267    4267    4267    4267
重要发展客户  371 371 371 371 371
重要挽留客户  599 599 599 599 599

python基础-24-数据分析python——pandas——

猜你喜欢

热点阅读