R学习

流式处理 dplyr 在R中的使用

2017-11-02  本文已影响0人  飘舞的鼻涕

注,任何疑问都返回 github blog留言,或者加QQ群..[174225475].. 共同探讨进步

基本函数解析

1. 分组和统计 (Grouping and counting)

summarise, tally, count, group_size, n_groups, ungroup, do

> a1 <- data.frame(a1=c(1,1,2,2,2,4),a2=letters[1:6],b3=c(0,1,5,7,NA,9))
> a1
  a1 a2 b3
1  1  a  0
2  1  b  1
3  2  c  5
4  2  d  7
5  2  e NA
6  4  f  9
> a1 %>% group_by(a1) %>% summarise(cnt1=n())
# A tibble: 3 x 2
     a1  cnt1
  <dbl> <int>
1     1     2
2     2     3
3     4     1
> a1 %>% group_by(a1) %>% tally()
# A tibble: 3 x 2
     a1     n
  <dbl> <int>
1     1     2
2     2     3
3     4     1
> a1 %>% count(a1)
# A tibble: 3 x 2
     a1     n
  <dbl> <int>
1     1     2
2     2     3
3     4     1
> a1 %>% group_by(a1) %>% summarise(cnt1=n()) %>% arrange(desc(cnt1))
# A tibble: 3 x 2
     a1  cnt1
  <dbl> <int>
1     2     3
2     1     2
3     4     1
> a1 %>% group_by(a1) %>% tally(sort=TRUE)
# A tibble: 3 x 2
     a1     n
  <dbl> <int>
1     2     3
2     1     2
3     4     1
> a1 %>% count(a1,sort=TRUE)
# A tibble: 3 x 2
     a1     n
  <dbl> <int>
1     2     3
2     1     2
3     4     1
> a1 %>% group_by(a1) %>% summarise(b31=sum(b3,na.rm=TRUE))
# A tibble: 3 x 2
     a1   b31
  <dbl> <dbl>
1     1     1
2     2    12
3     4     9
> a1 %>% group_by(a1) %>% tally(wt= b3 )
# A tibble: 3 x 2
     a1     n
  <dbl> <dbl>
1     1     1
2     2    12
3     4     9
> a1 %>% count(a1,wt= b3)
# A tibble: 3 x 2
     a1     n
  <dbl> <dbl>
1     1     1
2     2    12
3     4     9
> a1 %>% group_by(a1) %>% group_size()
[1] 2 3 1
> a1 %>% group_by(a1) %>% n_groups()
[1] 3
> 
> a1 %>% group_by(a1) %>% summarise(cnt1=n()) %>% ungroup() %>% arrange(desc(cnt1))
# A tibble: 3 x 2
     a1  cnt1
  <dbl> <int>
1     2     3
2     1     2
3     4     1
> a1 %>% group_by(a1) %>% summarise(cnt1=n()) %>% arrange(desc(cnt1))
# A tibble: 3 x 2
     a1  cnt1
  <dbl> <int>
1     2     3
2     1     2
3     4     1
> a1 %>% group_by(a1) %>% do(tail(.,2)) # 提取每组的后2条数据
# A tibble: 5 x 3
# Groups:   a1 [3]
     a1     a2    b3
  <dbl> <fctr> <dbl>
1     1      a     0
2     1      b     1
3     2      d     7
4     2      e    NA
5     4      f     9
> t2 <- a1 %>% group_by(a1) %>% do(t1=summary(.$b3)) 
> t2
Source: local data frame [3 x 2]
Groups: <by row>

# A tibble: 3 x 2
     a1                   t1
* <dbl>               <list>
1     1 <S3: summaryDefault>
2     2 <S3: summaryDefault>
3     4 <S3: summaryDefault>
> t2[[2,'t1']]
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
    5.0     5.5     6.0     6.0     6.5     7.0       1 
> t2$t1
[[1]]
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   0.00    0.25    0.50    0.50    0.75    1.00 

[[2]]
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
    5.0     5.5     6.0     6.0     6.5     7.0       1 

[[3]]
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
      9       9       9       9       9       9 

> t2 <- iris %>% group_by(Species) %>% do(mod=lm(Sepal.Length~Sepal.Width,data=.))
> t2
Source: local data frame [3 x 2]
Groups: <by row>

# A tibble: 3 x 2
     Species       t1
*     <fctr>   <list>
1     setosa <S3: lm>
2 versicolor <S3: lm>
3  virginica <S3: lm>
> t2$mod
[[1]]

Call:
lm(formula = Sepal.Length ~ Sepal.Width, data = .)

Coefficients:
(Intercept)  Sepal.Width  
     2.6390       0.6905  

[[2]]

Call:
lm(formula = Sepal.Length ~ Sepal.Width, data = .)

Coefficients:
(Intercept)  Sepal.Width  
     3.5397       0.8651  

[[3]]

Call:
lm(formula = Sepal.Length ~ Sepal.Width, data = .)

Coefficients:
(Intercept)  Sepal.Width  
     3.9068       0.9015  

2. 列选取及更改列名 (Choosing columns)

select, rename

a1 <- data.frame(a1=c(1,1,2,4),a2=c('a','b','c','d'),b3=c(0,1,5,7))
a1
  a1 a2 b3
1  1  a  0
2  1  b  1
3  2  c  5
4  4  d  7

a1 %>% select(a1,b3) # 提取 a1,b3列
  a1 b3
1  1  0
2  1  1
3  2  5
4  4  7
a1 %>% select(-a1,-b3) # 排除 a1,b3列
  a2
1  a
2  b
3  c
4  d
a1 %>% select(a1:b3) # 提取a1到b3之间的所有列
  a1 a2 b3
1  1  a  0
2  1  b  1
3  2  c  5
4  4  d  7
a1 %>% select(-contains('a')) # 排除列名称包含'a'的所有列
  b3
1  0
2  1
3  5
4  7
a1 %>% select(one_of(c('a1','b3'))) # 提取a1,b3列;等同于select(a1,b3)
  a1 b3
1  1  0
2  1  1
3  2  5
4  4  7
a1 %>% rename(a3=b3) # 修改列名并 保留 其他列
  a1 a2 a3
1  1  a  0
2  1  b  1
3  2  c  5
4  4  d  7
a1 %>% select(a3=b3) # 修改列名但 排除 其他列
  a3
1  0
2  1
3  5
4  7

3. 行选取及样本量统计 (Choosing rows)

filter,between,slice,top_n,sample_n,distinct,n_distinct

a1 <- data.frame(a1=c(1,1,1,2,2,2,2,4),a2=letters[1:8],b3=c(NA,3,5,7,9,NA,6,8))
> a1
  a1 a2 b3
1  1  a NA
2  1  b  3
3  1  c  5
4  2  d  7
5  2  e  9
6  2  f NA
7  2  g  6
8  4  h  8
> a1 %>% filter(a1>1) # 筛选 a1列值 >1 的样本数据
  a1 a2 b3
1  2  d  7
2  2  e  9
3  2  f NA
4  2  g  6
5  4  h  8
> a1 %>% filter(a1>1,a1<4) # 筛选 a1列值 >1 且 <4 的样本数据
  a1 a2 b3
1  2  d  7
2  2  e  9
3  2  f NA
4  2  g  6
> a1 %>% filter(between(a1,1,4)) # 筛选 a1列值 >=1 且 =<4 的样本数据
  a1 a2 b3
1  1  a NA
2  1  b  3
3  1  c  5
4  2  d  7
5  2  e  9
6  2  f NA
7  2  g  6
8  4  h  8
> a1 %>% filter(between(a1,2,3)) # 筛选 a1列值 >=2 且 =<3 的样本数据
  a1 a2 b3
1  2  d  7
2  2  e  9
3  2  f NA
4  2  g  6
> a1 %>% filter(!is.na(b3)) # 筛选 b3列值 不等于NA 的样本数据
  a1 a2 b3
1  1  b  3
2  1  c  5
3  2  d  7
4  2  e  9
5  2  g  6
6  4  h  8
> a1 %>% slice(1:3) # 筛选 默认排序下的 1至3 行样本
# A tibble: 3 x 3
     a1     a2    b3
  <dbl> <fctr> <dbl>
1     1      a    NA
2     1      b     3
3     1      c     5
> a1 %>% top_n(2,b3) # 筛选 按照 b3列倒序排列后 值最大的前 2 个样本
  a1 a2 b3
1  2  e  9
2  4  h  8
> a1 %>% group_by(a1)%>% slice(1:2) # 筛选 按照啊a1分组后每组默认排序下的 1至3 行样本
# A tibble: 5 x 3
# Groups:   a1 [3]
     a1     a2    b3
  <dbl> <fctr> <dbl>
1     1      a    NA
2     1      b     3
3     2      d     7
4     2      e     9
5     4      h     8
> a1 %>% group_by(a1)%>% top_n(2,b3) # 筛选 按照啊a1分组后每组b3列倒序下的 1至2 行样本
# A tibble: 5 x 3
# Groups:   a1 [3]
     a1     a2    b3
  <dbl> <fctr> <dbl>
1     1      b     3
2     1      c     5
3     2      d     7
4     2      e     9
5     4      h     8
> a1 %>% group_by(a1)%>% arrange(desc(b3))%>% slice(1:2) # 筛选 按照啊a1分组后每组b3列倒序下的 1至2 行样本
# A tibble: 5 x 3
# Groups:   a1 [3]
     a1     a2    b3
  <dbl> <fctr> <dbl>
1     1      c     5
2     1      b     3
3     2      e     9
4     2      d     7
5     4      h     8
> a1 %>% group_by(a1)%>% sample_n(2,replace=TRUE) # 筛选 按照啊a1分组后每组默认排序下的 1至2 行样本; replace=true 实现样本数不够2自动重复抽样
# A tibble: 6 x 3
# Groups:   a1 [3]
     a1     a2    b3
  <dbl> <fctr> <dbl>
1     1      a    NA
2     1      b     3
3     2      e     9
4     2      g     6
5     4      h     8
6     4      h     8
> a1 %>% select(a1)%>% unique() # 筛选 a1 列的唯一项
  a1
1  1
4  2
8  4
> a1 %>% select(a1)%>% distinct() # 筛选 a1 列的唯一项
  a1
1  1
2  2
3  4
> a1 %>% distinct(a1) # 筛选 a1 列的唯一项
  a1
1  1
2  2
3  4
> 
> a1 %>% select(a1)%>% n_distinct() # 统计 a1 列的唯一项个数
[1] 3
> a1 %>% group_by(a1) %>% summarise(cnt1=n_distinct(a1))  # 按照a1分组 统计 a1 列的唯一项个数
# A tibble: 3 x 2
     a1  cnt1
  <dbl> <int>
1     1     1
2     2     1
3     4     1
> a1 %>% group_by(a1) %>% summarise(cnt1=n())  # 按照a1分组 统计 各组样本个数
# A tibble: 3 x 2
     a1  cnt1
  <dbl> <int>
1     1     3
2     2     4
3     4     1

4. 新变量生成 (Adding new variables)

mutate,transmute,add_rownames

> a1 %>% mutate(a201=toupper(a2),b301=b3/a1) # 在保留原数据基础上 生成新数据列
  a1 a2 b3 a201 b301
1  1  a NA    A   NA
2  1  b  3    B  3.0
3  1  c  5    C  5.0
4  2  d  7    D  3.5
5  2  e  9    E  4.5
6  2  f NA    F   NA
7  2  g  6    G  3.0
8  4  h  8    H  2.0
> a1 %>% transmute(a201=toupper(a2),b301=b3/a1) # 在丢弃原数据基础上 只生成新数据列
  a201 b301
1    A   NA
2    B  3.0
3    C  5.0
4    D  3.5
5    E  4.5
6    F   NA
7    G  3.0
8    H  2.0
> a1 %>% head()
  a1 a2 b3
1  1  a NA
2  1  b  3
3  1  c  5
4  2  d  7
5  2  e  9
6  2  f NA
> a1 %>% add_rownames('model') %>% head() # 在原有数据基础上 将rownames(dt)添加至'model'列
# A tibble: 6 x 4
  model    a1     a2    b3
  <chr> <dbl> <fctr> <dbl>
1     1     1      a    NA
2     2     1      b     3
3     3     1      c     5
4     4     2      d     7
5     5     2      e     9
6     6     2      f    NA
Warning message:
Deprecated, use tibble::rownames_to_column() instead. 
> a1 %>% tbl_df()
# A tibble: 8 x 3
     a1     a2    b3
  <dbl> <fctr> <dbl>
1     1      a    NA
2     1      b     3
3     1      c     5
4     2      d     7
5     2      e     9
6     2      f    NA
7     2      g     6
8     4      h     8

5. 数据集合并 (merge (join) tables)

left/right_join,inner/full_join,semi/anti_join

> a1 <- data.frame(a1=c(1,1,2,4),a2=letters[1:4],a3=c(5,7,9,NA))
> b1 <- data.frame(a1=c(1,2,2,3),a2=letters[2:5])
> a1;b1
  a1 a2 a3
1  1  a  5
2  1  b  7
3  2  c  9
4  4  d NA
# ----------
  a1 a2
1  1  b
2  2  c
3  2  d
4  3  e

> a1 %>% left_join(b1,by='a1') # 左连接,以左侧数据集为基础将 右侧 数据补充进 左侧 数据集
  a1 a2.x a3 a2.y
1  1    a  5    b
2  1    b  7    b
3  2    c  9    c
4  2    c  9    d
5  4    d NA <NA>
> a1 %>% right_join(b1,by='a1') # 右连接,以 右侧 数据集为基础将 左侧 数据补充进 右侧 数据集
  a1 a2.x a3 a2.y
1  1    a  5    b
2  1    b  7    b
3  2    c  9    c
4  2    c  9    d
5  3 <NA> NA    e
> a1 %>% inner_join(b1,by='a1') # 内连接,以两数据集公共部分为基础进行数据集合并
  a1 a2.x a3 a2.y
1  1    a  5    b
2  1    b  7    b
3  2    c  9    c
4  2    c  9    d
> a1 %>% full_join(b1,by='a1') # 全连接,以两数据集并集为基础进行数据集合并
  a1 a2.x a3 a2.y
1  1    a  5    b
2  1    b  7    b
3  2    c  9    c
4  2    c  9    d
5  4    d NA <NA>
6  3 <NA> NA    e
> a1 %>% semi_join(b1,by=c('a1','a2')) # 保留与 b1 数据集a1,a2字段 重合 的 a1 数据集部分
  a1 a2 a3
1  1  b  7
2  2  c  9
Warning message:
Column `a2` joining factors with different levels, coercing to character vector 
> a1 %>% anti_join(b1,by=c('a1','a2')) # 保留与 b1 数据集a1,a2字段 不重合 的 a1 数据集部分
  a1 a2 a3
1  4  d NA
2  1  a  5
Warning message:
Column `a2` joining factors with different levels, coercing to character vector 

实例解析

1. 利用 mutate 更新字段信息

a1=data.frame(b1=c(1,2,3),b2=c(NA,NA,1))
a1
  b1 b2
1  1 NA
2  2 NA
3  3  1
a1 <- a1 %>% mutate(b2=ifelse(is.na(b2),'tmp3',b2))
a1
  b1   b2
1  1 tmp3
2  2 tmp3
3  3    1

2. 查看组内billtime平均差值

dif1 <- bill_all %>% select(userid,bill_time) %>% # 提取保留字段
  mutate(bill_time=as.numeric(bill_time)) %>% #转化数据类型
  filter(!bill_time==0) %>%  # 筛选条件数据
  group_by(userid) %>%  # 分组
  summarise(dif1=mean(diff(bill_time))) # 计算平均差值

3. 两表拼接并生成新列

bill_all3 <- bill_all2 %>% 
  left_join(loantime_all,by='userid') %>% # 通过左连接将loantime补备入bill2表中
  mutate(be2af=ifelse(bill_time>loan_time,1,0)) # 通过ifelse判断并将结果生成新字段be2af

4. by客户,计算其历次借贷交易值是否有异常

last_debt_rang <- bill_all3 %>% 
  group_by(userid) %>% # 按照客户id分组
  filter(last_debt > 0) %>% # 筛选正向欠款
  summarise(q25=quantile(last_debt,0.25), # 下四分位
            q75=quantile(last_debt,0.75), # 上四分位
            q50=q75-q25, # 四分位距
            ul=q75+1.5*q50, # 上限
            ll=q25-1.5*q50) # 下限
last_debt_lof <- bill_all3 %>% add_rownames('rowids1') %>% # 添加ids列
  select(rowids1,userid,last_debt) %>% # 缩小数据集
  left_join(last_debt_rang%>% select(userid,ll,ul),by='userid') %>% 补备上下限信息
  mutate(debt_lof=ifelse(last_debt>ul,1,0)) # 判断交易值是否异常

注意dplyr的神奇,新生成的变量立马就能被后面的计算无缝引用

5. 待补备中...

上一篇下一篇

猜你喜欢

热点阅读