流式处理 dplyr 在R中的使用
2017-11-02 本文已影响0人
飘舞的鼻涕
注,任何疑问都返回 github blog留言,或者加QQ群..[174225475].. 共同探讨进步
基本函数解析
1. 分组和统计 (Grouping and counting)
summarise, tally, count, group_size, n_groups, ungroup, do
> a1 <- data.frame(a1=c(1,1,2,2,2,4),a2=letters[1:6],b3=c(0,1,5,7,NA,9))
> a1
a1 a2 b3
1 1 a 0
2 1 b 1
3 2 c 5
4 2 d 7
5 2 e NA
6 4 f 9
> a1 %>% group_by(a1) %>% summarise(cnt1=n())
# A tibble: 3 x 2
a1 cnt1
<dbl> <int>
1 1 2
2 2 3
3 4 1
> a1 %>% group_by(a1) %>% tally()
# A tibble: 3 x 2
a1 n
<dbl> <int>
1 1 2
2 2 3
3 4 1
> a1 %>% count(a1)
# A tibble: 3 x 2
a1 n
<dbl> <int>
1 1 2
2 2 3
3 4 1
> a1 %>% group_by(a1) %>% summarise(cnt1=n()) %>% arrange(desc(cnt1))
# A tibble: 3 x 2
a1 cnt1
<dbl> <int>
1 2 3
2 1 2
3 4 1
> a1 %>% group_by(a1) %>% tally(sort=TRUE)
# A tibble: 3 x 2
a1 n
<dbl> <int>
1 2 3
2 1 2
3 4 1
> a1 %>% count(a1,sort=TRUE)
# A tibble: 3 x 2
a1 n
<dbl> <int>
1 2 3
2 1 2
3 4 1
> a1 %>% group_by(a1) %>% summarise(b31=sum(b3,na.rm=TRUE))
# A tibble: 3 x 2
a1 b31
<dbl> <dbl>
1 1 1
2 2 12
3 4 9
> a1 %>% group_by(a1) %>% tally(wt= b3 )
# A tibble: 3 x 2
a1 n
<dbl> <dbl>
1 1 1
2 2 12
3 4 9
> a1 %>% count(a1,wt= b3)
# A tibble: 3 x 2
a1 n
<dbl> <dbl>
1 1 1
2 2 12
3 4 9
> a1 %>% group_by(a1) %>% group_size()
[1] 2 3 1
> a1 %>% group_by(a1) %>% n_groups()
[1] 3
>
> a1 %>% group_by(a1) %>% summarise(cnt1=n()) %>% ungroup() %>% arrange(desc(cnt1))
# A tibble: 3 x 2
a1 cnt1
<dbl> <int>
1 2 3
2 1 2
3 4 1
> a1 %>% group_by(a1) %>% summarise(cnt1=n()) %>% arrange(desc(cnt1))
# A tibble: 3 x 2
a1 cnt1
<dbl> <int>
1 2 3
2 1 2
3 4 1
> a1 %>% group_by(a1) %>% do(tail(.,2)) # 提取每组的后2条数据
# A tibble: 5 x 3
# Groups: a1 [3]
a1 a2 b3
<dbl> <fctr> <dbl>
1 1 a 0
2 1 b 1
3 2 d 7
4 2 e NA
5 4 f 9
> t2 <- a1 %>% group_by(a1) %>% do(t1=summary(.$b3))
> t2
Source: local data frame [3 x 2]
Groups: <by row>
# A tibble: 3 x 2
a1 t1
* <dbl> <list>
1 1 <S3: summaryDefault>
2 2 <S3: summaryDefault>
3 4 <S3: summaryDefault>
> t2[[2,'t1']]
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
5.0 5.5 6.0 6.0 6.5 7.0 1
> t2$t1
[[1]]
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00 0.25 0.50 0.50 0.75 1.00
[[2]]
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
5.0 5.5 6.0 6.0 6.5 7.0 1
[[3]]
Min. 1st Qu. Median Mean 3rd Qu. Max.
9 9 9 9 9 9
> t2 <- iris %>% group_by(Species) %>% do(mod=lm(Sepal.Length~Sepal.Width,data=.))
> t2
Source: local data frame [3 x 2]
Groups: <by row>
# A tibble: 3 x 2
Species t1
* <fctr> <list>
1 setosa <S3: lm>
2 versicolor <S3: lm>
3 virginica <S3: lm>
> t2$mod
[[1]]
Call:
lm(formula = Sepal.Length ~ Sepal.Width, data = .)
Coefficients:
(Intercept) Sepal.Width
2.6390 0.6905
[[2]]
Call:
lm(formula = Sepal.Length ~ Sepal.Width, data = .)
Coefficients:
(Intercept) Sepal.Width
3.5397 0.8651
[[3]]
Call:
lm(formula = Sepal.Length ~ Sepal.Width, data = .)
Coefficients:
(Intercept) Sepal.Width
3.9068 0.9015
2. 列选取及更改列名 (Choosing columns)
select, rename
a1 <- data.frame(a1=c(1,1,2,4),a2=c('a','b','c','d'),b3=c(0,1,5,7))
a1
a1 a2 b3
1 1 a 0
2 1 b 1
3 2 c 5
4 4 d 7
a1 %>% select(a1,b3) # 提取 a1,b3列
a1 b3
1 1 0
2 1 1
3 2 5
4 4 7
a1 %>% select(-a1,-b3) # 排除 a1,b3列
a2
1 a
2 b
3 c
4 d
a1 %>% select(a1:b3) # 提取a1到b3之间的所有列
a1 a2 b3
1 1 a 0
2 1 b 1
3 2 c 5
4 4 d 7
a1 %>% select(-contains('a')) # 排除列名称包含'a'的所有列
b3
1 0
2 1
3 5
4 7
a1 %>% select(one_of(c('a1','b3'))) # 提取a1,b3列;等同于select(a1,b3)
a1 b3
1 1 0
2 1 1
3 2 5
4 4 7
a1 %>% rename(a3=b3) # 修改列名并 保留 其他列
a1 a2 a3
1 1 a 0
2 1 b 1
3 2 c 5
4 4 d 7
a1 %>% select(a3=b3) # 修改列名但 排除 其他列
a3
1 0
2 1
3 5
4 7
3. 行选取及样本量统计 (Choosing rows)
filter,between,slice,top_n,sample_n,distinct,n_distinct
a1 <- data.frame(a1=c(1,1,1,2,2,2,2,4),a2=letters[1:8],b3=c(NA,3,5,7,9,NA,6,8))
> a1
a1 a2 b3
1 1 a NA
2 1 b 3
3 1 c 5
4 2 d 7
5 2 e 9
6 2 f NA
7 2 g 6
8 4 h 8
> a1 %>% filter(a1>1) # 筛选 a1列值 >1 的样本数据
a1 a2 b3
1 2 d 7
2 2 e 9
3 2 f NA
4 2 g 6
5 4 h 8
> a1 %>% filter(a1>1,a1<4) # 筛选 a1列值 >1 且 <4 的样本数据
a1 a2 b3
1 2 d 7
2 2 e 9
3 2 f NA
4 2 g 6
> a1 %>% filter(between(a1,1,4)) # 筛选 a1列值 >=1 且 =<4 的样本数据
a1 a2 b3
1 1 a NA
2 1 b 3
3 1 c 5
4 2 d 7
5 2 e 9
6 2 f NA
7 2 g 6
8 4 h 8
> a1 %>% filter(between(a1,2,3)) # 筛选 a1列值 >=2 且 =<3 的样本数据
a1 a2 b3
1 2 d 7
2 2 e 9
3 2 f NA
4 2 g 6
> a1 %>% filter(!is.na(b3)) # 筛选 b3列值 不等于NA 的样本数据
a1 a2 b3
1 1 b 3
2 1 c 5
3 2 d 7
4 2 e 9
5 2 g 6
6 4 h 8
> a1 %>% slice(1:3) # 筛选 默认排序下的 1至3 行样本
# A tibble: 3 x 3
a1 a2 b3
<dbl> <fctr> <dbl>
1 1 a NA
2 1 b 3
3 1 c 5
> a1 %>% top_n(2,b3) # 筛选 按照 b3列倒序排列后 值最大的前 2 个样本
a1 a2 b3
1 2 e 9
2 4 h 8
> a1 %>% group_by(a1)%>% slice(1:2) # 筛选 按照啊a1分组后每组默认排序下的 1至3 行样本
# A tibble: 5 x 3
# Groups: a1 [3]
a1 a2 b3
<dbl> <fctr> <dbl>
1 1 a NA
2 1 b 3
3 2 d 7
4 2 e 9
5 4 h 8
> a1 %>% group_by(a1)%>% top_n(2,b3) # 筛选 按照啊a1分组后每组b3列倒序下的 1至2 行样本
# A tibble: 5 x 3
# Groups: a1 [3]
a1 a2 b3
<dbl> <fctr> <dbl>
1 1 b 3
2 1 c 5
3 2 d 7
4 2 e 9
5 4 h 8
> a1 %>% group_by(a1)%>% arrange(desc(b3))%>% slice(1:2) # 筛选 按照啊a1分组后每组b3列倒序下的 1至2 行样本
# A tibble: 5 x 3
# Groups: a1 [3]
a1 a2 b3
<dbl> <fctr> <dbl>
1 1 c 5
2 1 b 3
3 2 e 9
4 2 d 7
5 4 h 8
> a1 %>% group_by(a1)%>% sample_n(2,replace=TRUE) # 筛选 按照啊a1分组后每组默认排序下的 1至2 行样本; replace=true 实现样本数不够2自动重复抽样
# A tibble: 6 x 3
# Groups: a1 [3]
a1 a2 b3
<dbl> <fctr> <dbl>
1 1 a NA
2 1 b 3
3 2 e 9
4 2 g 6
5 4 h 8
6 4 h 8
> a1 %>% select(a1)%>% unique() # 筛选 a1 列的唯一项
a1
1 1
4 2
8 4
> a1 %>% select(a1)%>% distinct() # 筛选 a1 列的唯一项
a1
1 1
2 2
3 4
> a1 %>% distinct(a1) # 筛选 a1 列的唯一项
a1
1 1
2 2
3 4
>
> a1 %>% select(a1)%>% n_distinct() # 统计 a1 列的唯一项个数
[1] 3
> a1 %>% group_by(a1) %>% summarise(cnt1=n_distinct(a1)) # 按照a1分组 统计 a1 列的唯一项个数
# A tibble: 3 x 2
a1 cnt1
<dbl> <int>
1 1 1
2 2 1
3 4 1
> a1 %>% group_by(a1) %>% summarise(cnt1=n()) # 按照a1分组 统计 各组样本个数
# A tibble: 3 x 2
a1 cnt1
<dbl> <int>
1 1 3
2 2 4
3 4 1
4. 新变量生成 (Adding new variables)
mutate,transmute,add_rownames
> a1 %>% mutate(a201=toupper(a2),b301=b3/a1) # 在保留原数据基础上 生成新数据列
a1 a2 b3 a201 b301
1 1 a NA A NA
2 1 b 3 B 3.0
3 1 c 5 C 5.0
4 2 d 7 D 3.5
5 2 e 9 E 4.5
6 2 f NA F NA
7 2 g 6 G 3.0
8 4 h 8 H 2.0
> a1 %>% transmute(a201=toupper(a2),b301=b3/a1) # 在丢弃原数据基础上 只生成新数据列
a201 b301
1 A NA
2 B 3.0
3 C 5.0
4 D 3.5
5 E 4.5
6 F NA
7 G 3.0
8 H 2.0
> a1 %>% head()
a1 a2 b3
1 1 a NA
2 1 b 3
3 1 c 5
4 2 d 7
5 2 e 9
6 2 f NA
> a1 %>% add_rownames('model') %>% head() # 在原有数据基础上 将rownames(dt)添加至'model'列
# A tibble: 6 x 4
model a1 a2 b3
<chr> <dbl> <fctr> <dbl>
1 1 1 a NA
2 2 1 b 3
3 3 1 c 5
4 4 2 d 7
5 5 2 e 9
6 6 2 f NA
Warning message:
Deprecated, use tibble::rownames_to_column() instead.
> a1 %>% tbl_df()
# A tibble: 8 x 3
a1 a2 b3
<dbl> <fctr> <dbl>
1 1 a NA
2 1 b 3
3 1 c 5
4 2 d 7
5 2 e 9
6 2 f NA
7 2 g 6
8 4 h 8
5. 数据集合并 (merge (join) tables)
left/right_join,inner/full_join,semi/anti_join
> a1 <- data.frame(a1=c(1,1,2,4),a2=letters[1:4],a3=c(5,7,9,NA))
> b1 <- data.frame(a1=c(1,2,2,3),a2=letters[2:5])
> a1;b1
a1 a2 a3
1 1 a 5
2 1 b 7
3 2 c 9
4 4 d NA
# ----------
a1 a2
1 1 b
2 2 c
3 2 d
4 3 e
> a1 %>% left_join(b1,by='a1') # 左连接,以左侧数据集为基础将 右侧 数据补充进 左侧 数据集
a1 a2.x a3 a2.y
1 1 a 5 b
2 1 b 7 b
3 2 c 9 c
4 2 c 9 d
5 4 d NA <NA>
> a1 %>% right_join(b1,by='a1') # 右连接,以 右侧 数据集为基础将 左侧 数据补充进 右侧 数据集
a1 a2.x a3 a2.y
1 1 a 5 b
2 1 b 7 b
3 2 c 9 c
4 2 c 9 d
5 3 <NA> NA e
> a1 %>% inner_join(b1,by='a1') # 内连接,以两数据集公共部分为基础进行数据集合并
a1 a2.x a3 a2.y
1 1 a 5 b
2 1 b 7 b
3 2 c 9 c
4 2 c 9 d
> a1 %>% full_join(b1,by='a1') # 全连接,以两数据集并集为基础进行数据集合并
a1 a2.x a3 a2.y
1 1 a 5 b
2 1 b 7 b
3 2 c 9 c
4 2 c 9 d
5 4 d NA <NA>
6 3 <NA> NA e
> a1 %>% semi_join(b1,by=c('a1','a2')) # 保留与 b1 数据集a1,a2字段 重合 的 a1 数据集部分
a1 a2 a3
1 1 b 7
2 2 c 9
Warning message:
Column `a2` joining factors with different levels, coercing to character vector
> a1 %>% anti_join(b1,by=c('a1','a2')) # 保留与 b1 数据集a1,a2字段 不重合 的 a1 数据集部分
a1 a2 a3
1 4 d NA
2 1 a 5
Warning message:
Column `a2` joining factors with different levels, coercing to character vector
实例解析
1. 利用 mutate 更新字段信息
a1=data.frame(b1=c(1,2,3),b2=c(NA,NA,1))
a1
b1 b2
1 1 NA
2 2 NA
3 3 1
a1 <- a1 %>% mutate(b2=ifelse(is.na(b2),'tmp3',b2))
a1
b1 b2
1 1 tmp3
2 2 tmp3
3 3 1
2. 查看组内billtime平均差值
dif1 <- bill_all %>% select(userid,bill_time) %>% # 提取保留字段
mutate(bill_time=as.numeric(bill_time)) %>% #转化数据类型
filter(!bill_time==0) %>% # 筛选条件数据
group_by(userid) %>% # 分组
summarise(dif1=mean(diff(bill_time))) # 计算平均差值
3. 两表拼接并生成新列
bill_all3 <- bill_all2 %>%
left_join(loantime_all,by='userid') %>% # 通过左连接将loantime补备入bill2表中
mutate(be2af=ifelse(bill_time>loan_time,1,0)) # 通过ifelse判断并将结果生成新字段be2af
4. by客户,计算其历次借贷交易值是否有异常
last_debt_rang <- bill_all3 %>%
group_by(userid) %>% # 按照客户id分组
filter(last_debt > 0) %>% # 筛选正向欠款
summarise(q25=quantile(last_debt,0.25), # 下四分位
q75=quantile(last_debt,0.75), # 上四分位
q50=q75-q25, # 四分位距
ul=q75+1.5*q50, # 上限
ll=q25-1.5*q50) # 下限
last_debt_lof <- bill_all3 %>% add_rownames('rowids1') %>% # 添加ids列
select(rowids1,userid,last_debt) %>% # 缩小数据集
left_join(last_debt_rang%>% select(userid,ll,ul),by='userid') %>% 补备上下限信息
mutate(debt_lof=ifelse(last_debt>ul,1,0)) # 判断交易值是否异常
注意dplyr的神奇,新生成的变量立马就能被后面的计算无缝引用