生信学习R语言学习

Rdata002 使用dplyr包进行数据转换

2020-09-16  本文已影响0人  caoqiansheng

使用nycflights13数据包来进行说明,这个数据框包含了2013年从纽约市出发的所有336776次航班的信息,该数据来自美国交通统计局,可以使用?flights查看说明文档

1.数据准备

library(tidyverse)
library(nycflights13)
head(flights)
# A tibble: 6 x 19
#    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest  air_time distance  hour minute
#   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>     <dbl> <chr>    <int> <chr>   <chr>  <chr>    <dbl>    <dbl> <dbl>  <dbl>
# 1  2013     1     1      517            515         2      830            819        11 UA        1545 N14228  EWR    IAH        227     1400     5     15
# 2  2013     1     1      533            529         4      850            830        20 UA        1714 N24211  LGA    IAH        227     1416     5     29
# 3  2013     1     1      542            540         2      923            850        33 AA        1141 N619AA  JFK    MIA        160     1089     5     40
# 4  2013     1     1      544            545        -1     1004           1022       -18 B6         725 N804JB  JFK    BQN        183     1576     5     45
# 5  2013     1     1      554            600        -6      812            837       -25 DL         461 N668DN  LGA    ATL        116      762     6      0
# 6  2013     1     1      554            558        -4      740            728        12 UA        1696 N39463  EWR    ORD        150      719     5     58
# ... with 1 more variable: time_hour <dttm>

# 查看整个数据
View(flights)

其中

2. dplyr基础函数

2.1 filter() 按行筛选

filter()可以基于观测的值筛选出一个观测的子集

filter(flights,month==1,day==1)
# A tibble: 842 x 19
# year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest  air_time distance  hour minute
# <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>     <dbl> <chr>    <int> <chr>   <chr>  <chr>    <dbl>    <dbl> <dbl>  <dbl>
#   1  2013     1     1      517            515         2      830            819        11 UA        1545 N14228  EWR    IAH        227     1400     5     15
# 2  2013     1     1      533            529         4      850            830        20 UA        1714 N24211  LGA    IAH        227     1416     5     29
# 3  2013     1     1      542            540         2      923            850        33 AA        1141 N619AA  JFK    MIA        160     1089     5     40
# 4  2013     1     1      544            545        -1     1004           1022       -18 B6         725 N804JB  JFK    BQN        183     1576     5     45
# 5  2013     1     1      554            600        -6      812            837       -25 DL         461 N668DN  LGA    ATL        116      762     6      0
# 6  2013     1     1      554            558        -4      740            728        12 UA        1696 N39463  EWR    ORD        150      719     5     58
# 7  2013     1     1      555            600        -5      913            854        19 B6         507 N516JB  EWR    FLL        158     1065     6      0
# 8  2013     1     1      557            600        -3      709            723       -14 EV        5708 N829AS  LGA    IAD         53      229     6      0
# 9  2013     1     1      557            600        -3      838            846        -8 B6          79 N593JB  JFK    MCO        140      944     6      0
# 10  2013     1     1      558            600        -2      753            745         8 AA         301 N3ALAA  LGA    ORD        138      733     6      0
# # ... with 832 more rows, and 1 more variable: time_hour <dttm>

# dplyr函数不会修改输入,因此需要保存函数结果,需要使用赋值操作符 <-
jan1 <- filter(flights,month==1,day==1)

# R要么输出结果,要么将结果保存在一个变量中,如果想要同时完成这两种操作,那么需要用括号()将赋值语句括起来
(dec25 <- filter(flights,month==12,day==25))
# # A tibble: 719 x 19
# year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest 
# <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>     <dbl> <chr>    <int> <chr>   <chr>  <chr>
#   1  2013    12    25      456            500        -4      649            651        -2 US        1895 N156UW  EWR    CLT  
# 2  2013    12    25      524            515         9      805            814        -9 UA        1016 N32404  EWR    IAH  
# 3  2013    12    25      542            540         2      832            850       -18 AA        2243 N5EBAA  JFK    MIA  
# 4  2013    12    25      546            550        -4     1022           1027        -5 B6         939 N665JB  JFK    BQN  
# 5  2013    12    25      556            600        -4      730            745       -15 AA         301 N3JLAA  LGA    ORD  
# 6  2013    12    25      557            600        -3      743            752        -9 DL         731 N369NB  LGA    DTW  
# 7  2013    12    25      557            600        -3      818            831       -13 DL         904 N397DA  LGA    ATL  
# 8  2013    12    25      559            600        -1      855            856        -1 B6         371 N608JB  LGA    FLL  
# 9  2013    12    25      559            600        -1      849            855        -6 B6         605 N536JB  EWR    FLL  
# 10  2013    12    25      600            600         0      850            846         4 B6         583 N746JB  JFK    MCO  
# # ... with 709 more rows, and 5 more variables: air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
2.1.1 比较运算符
# 返回值令人目瞪口呆
1/49*49==1
# [1] FALSE

# 正确的打开方式
near(1/49*49,1)
# [1] TRUE
2.1.2 逻辑运算符

& 与,相交
| 或,相合
!

# 如果想要过滤得到11月与12月出发的所有航班
filter(flights,month == 11 | month == 12)
# # A tibble: 55,403 x 19
# year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest 
# <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>     <dbl> <chr>    <int> <chr>   <chr>  <chr>
#   1  2013    11     1        5           2359         6      352            345         7 B6         745 N568JB  JFK    PSE  
# 2  2013    11     1       35           2250       105      123           2356        87 B6        1816 N353JB  JFK    SYR  
# 3  2013    11     1      455            500        -5      641            651       -10 US        1895 N192UW  EWR    CLT  
# 4  2013    11     1      539            545        -6      856            827        29 UA        1714 N38727  LGA    IAH  
# 5  2013    11     1      542            545        -3      831            855       -24 AA        2243 N5CLAA  JFK    MIA  
# 6  2013    11     1      549            600       -11      912            923       -11 UA         303 N595UA  JFK    SFO  
# 7  2013    11     1      550            600       -10      705            659         6 US        2167 N748UW  LGA    DCA  
# 8  2013    11     1      554            600        -6      659            701        -2 US        2134 N742PS  LGA    BOS  
# 9  2013    11     1      554            600        -6      826            827        -1 DL         563 N912DE  LGA    ATL  
# 10  2013    11     1      554            600        -6      749            751        -2 DL         731 N315NB  LGA    DTW  
# # ... with 55,393 more rows, and 5 more variables: air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

filter(flights,month == 11 | 12)
# # A tibble: 336,776 x 19
# year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest 
# <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>     <dbl> <chr>    <int> <chr>   <chr>  <chr>
#   1  2013     1     1      517            515         2      830            819        11 UA        1545 N14228  EWR    IAH  
# 2  2013     1     1      533            529         4      850            830        20 UA        1714 N24211  LGA    IAH  
# 3  2013     1     1      542            540         2      923            850        33 AA        1141 N619AA  JFK    MIA  
# 4  2013     1     1      544            545        -1     1004           1022       -18 B6         725 N804JB  JFK    BQN  
# 5  2013     1     1      554            600        -6      812            837       -25 DL         461 N668DN  LGA    ATL  
# 6  2013     1     1      554            558        -4      740            728        12 UA        1696 N39463  EWR    ORD  
# 7  2013     1     1      555            600        -5      913            854        19 B6         507 N516JB  EWR    FLL  
# 8  2013     1     1      557            600        -3      709            723       -14 EV        5708 N829AS  LGA    IAD  
# 9  2013     1     1      557            600        -3      838            846        -8 B6          79 N593JB  JFK    MCO  
# 10  2013     1     1      558            600        -2      753            745         8 AA         301 N3ALAA  LGA    ORD  
# # ... with 336,766 more rows, and 5 more variables: air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
# #   time_hour <dttm>

# 11 | 12返回的是逻辑值
11 | 12
# [1] TRUE

# 查看一下flights$month == 11 | 12的统计情况
table(flights$month == 11 | 12)
# TRUE 
# 336776

# 查看一下flights$month == 11 | flights$month == 12的统计情况
table(flights$month == 11 | flights$month == 12)
# FALSE   TRUE 
# 281373  55403 

# x %in% y,选出x是y中一个值的所有行
# 所以month %in% c(11,12)与month == 11 | month == 12是等价的
filter(flights,month %in% c(11,12))
# # A tibble: 55,403 x 19
# year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest 
# <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>     <dbl> <chr>    <int> <chr>   <chr>  <chr>
#   1  2013    11     1        5           2359         6      352            345         7 B6         745 N568JB  JFK    PSE  
# 2  2013    11     1       35           2250       105      123           2356        87 B6        1816 N353JB  JFK    SYR  
# 3  2013    11     1      455            500        -5      641            651       -10 US        1895 N192UW  EWR    CLT  
# 4  2013    11     1      539            545        -6      856            827        29 UA        1714 N38727  LGA    IAH  
# 5  2013    11     1      542            545        -3      831            855       -24 AA        2243 N5CLAA  JFK    MIA  
# 6  2013    11     1      549            600       -11      912            923       -11 UA         303 N595UA  JFK    SFO  
# 7  2013    11     1      550            600       -10      705            659         6 US        2167 N748UW  LGA    DCA  
# 8  2013    11     1      554            600        -6      659            701        -2 US        2134 N742PS  LGA    BOS  
# 9  2013    11     1      554            600        -6      826            827        -1 DL         563 N912DE  LGA    ATL  
# 10  2013    11     1      554            600        -6      749            751        -2 DL         731 N315NB  LGA    DTW  
# # ... with 55,393 more rows, and 5 more variables: air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>



# 使用德摩根定律可以将复杂的筛选条件进行简化
# !(x & y) 等价于 !x | !y
# !(x | y) 等价于 !x & !y


# 如果想要找出延误时间(到达活出发)不多于两小时的航班,可以使用以下两种筛选方式
filter(flights,!(arr_delay > 120 | dep_delay > 120))

filter(flights,arr_delay <= 120,dep_delay <= 120)

布尔运算全集
2.1.3 缺失值 NA

NA在R中表示未知的值,NA是可传染的,如果运算中包含了NA,那么计算的结果也是NA

NA > 5
# [1] NA

10 == NA
# [1] NA

NA + 10
# [1] NA

NA /2
# [1] NA

is.na()可以用来判断是否为NA

y=c(1,NA,2)
is.na(y)
# [1] FALSE TRUE FALSE  

# tibble包是一个轻量级的包,它实现的data.frame的重新塑造,保留了data.frame中经过实践证明有效的部分,吸取了专注于数据操作的dplyr包的基本思想。tibble包提供了更优于data.frame的性能
(df <- tibble(y=c(1,NA,2)))
# # A tibble: 3 x 1
#       y
#     <dbl>
# 1     1
# 2    NA
# 3     2

# filter()只能筛选出条件为TRUE的行,它会排除那些条件为FALSE和NA的行
filter(df,y>=1)
# # A tibble: 2 x 1
#       y
#     <dbl>
# 1     1
# 2     2

# 如果需要保留NA值,需要在筛选条件中指明
filter(df,is.na(y) | y>=1)
# A tibble: 3 x 1
#       y
#     <dbl>
# 1     1
# 2    NA
# 3     2

2.2 arrange() 重排

arrange()与filter() 非常相似

# 如果列名不止一个,会将后面的列在前面列的基础上进行排序
arrange(flights,year,month,day)
# # A tibble: 336,776 x 19
# year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest  air_time distance  hour minute
# <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>     <dbl> <chr>    <int> <chr>   <chr>  <chr>    <dbl>    <dbl> <dbl>  <dbl>
#   1  2013     1     1      517            515         2      830            819        11 UA        1545 N14228  EWR    IAH        227     1400     5     15
# 2  2013     1     1      533            529         4      850            830        20 UA        1714 N24211  LGA    IAH        227     1416     5     29
# 3  2013     1     1      542            540         2      923            850        33 AA        1141 N619AA  JFK    MIA        160     1089     5     40
# 4  2013     1     1      544            545        -1     1004           1022       -18 B6         725 N804JB  JFK    BQN        183     1576     5     45
# 5  2013     1     1      554            600        -6      812            837       -25 DL         461 N668DN  LGA    ATL        116      762     6      0
# 6  2013     1     1      554            558        -4      740            728        12 UA        1696 N39463  EWR    ORD        150      719     5     58
# 7  2013     1     1      555            600        -5      913            854        19 B6         507 N516JB  EWR    FLL        158     1065     6      0
# 8  2013     1     1      557            600        -3      709            723       -14 EV        5708 N829AS  LGA    IAD         53      229     6      0
# 9  2013     1     1      557            600        -3      838            846        -8 B6          79 N593JB  JFK    MCO        140      944     6      0
# 10  2013     1     1      558            600        -2      753            745         8 AA         301 N3ALAA  LGA    ORD        138      733     6      0
# # ... with 336,766 more rows, and 1 more variable: time_hour <dttm>

# 使用desc函数进行降序排列
arrange(flights,desc(month))
# # A tibble: 336,776 x 19
# year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest  air_time distance  hour minute
# <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>     <dbl> <chr>    <int> <chr>   <chr>  <chr>    <dbl>    <dbl> <dbl>  <dbl>
#   1  2013    12     1       13           2359        14      446            445         1 B6         745 N715JB  JFK    PSE        195     1617    23     59
# 2  2013    12     1       17           2359        18      443            437         6 B6         839 N593JB  JFK    BQN        186     1576    23     59
# 3  2013    12     1      453            500        -7      636            651       -15 US        1895 N197UW  EWR    CLT         86      529     5      0
# 4  2013    12     1      520            515         5      749            808       -19 UA        1487 N69804  EWR    IAH        193     1400     5     15
# 5  2013    12     1      536            540        -4      845            850        -5 AA        2243 N634AA  JFK    MIA        144     1089     5     40
# 6  2013    12     1      540            550       -10     1005           1027       -22 B6         939 N821JB  JFK    BQN        189     1576     5     50
# 7  2013    12     1      541            545        -4      734            755       -21 EV        3819 N13968  EWR    CVG         95      569     5     45
# 8  2013    12     1      546            545         1      826            835        -9 UA        1441 N23708  LGA    IAH        204     1416     5     45
# 9  2013    12     1      549            600       -11      648            659       -11 US        2167 N945UW  LGA    DCA         42      214     6      0
# 10  2013    12     1      550            600       -10      825            854       -29 B6         605 N706JB  EWR    FLL        140     1065     6      0
# # ... with 336,766 more rows, and 1 more variable: time_hour <dttm>

# df <- tibble(y=c(1,NA,2))
# 重新排序时,NA总是排在最后
arrange(df,y)
# # A tibble: 3 x 1
#       y
#     <dbl>
# 1     1
# 2     2
# 3    NA

# 对y进行降序
arrange(df,desc(y))
# # A tibble: 3 x 1
#       y
#     <dbl>
# 1     2
# 2     1
# 3    NA

2.3 select() 选择特定的列
# 按名称选择列year,month,day
select(flights,year,month,day)
# # A tibble: 336,776 x 3
# year month   day
# <int> <int> <int>
#   1  2013     1     1
# 2  2013     1     1
# 3  2013     1     1
# 4  2013     1     1
# 5  2013     1     1
# 6  2013     1     1
# 7  2013     1     1
# 8  2013     1     1
# 9  2013     1     1
# 10  2013     1     1
# # ... with 336,766 more rows

# 选择在year:day之间的所有列
select(flights,year:day)
# # A tibble: 336,776 x 3
# year month   day
# <int> <int> <int>
#   1  2013     1     1
# 2  2013     1     1
# 3  2013     1     1
# 4  2013     1     1
# 5  2013     1     1
# 6  2013     1     1
# 7  2013     1     1
# 8  2013     1     1
# 9  2013     1     1
# 10  2013     1     1
# # ... with 336,766 more rows

# 选择不在year:day之间的所有列
select(flights,-(year:day))
# # A tibble: 336,776 x 16
# dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest  air_time distance  hour minute time_hour          
# <int>          <int>     <dbl>    <int>          <int>     <dbl> <chr>    <int> <chr>   <chr>  <chr>    <dbl>    <dbl> <dbl>  <dbl> <dttm>             
#   1      517            515         2      830            819        11 UA        1545 N14228  EWR    IAH        227     1400     5     15 2013-01-01 05:00:00
# 2      533            529         4      850            830        20 UA        1714 N24211  LGA    IAH        227     1416     5     29 2013-01-01 05:00:00
# 3      542            540         2      923            850        33 AA        1141 N619AA  JFK    MIA        160     1089     5     40 2013-01-01 05:00:00
# 4      544            545        -1     1004           1022       -18 B6         725 N804JB  JFK    BQN        183     1576     5     45 2013-01-01 05:00:00
# 5      554            600        -6      812            837       -25 DL         461 N668DN  LGA    ATL        116      762     6      0 2013-01-01 06:00:00
# 6      554            558        -4      740            728        12 UA        1696 N39463  EWR    ORD        150      719     5     58 2013-01-01 05:00:00
# 7      555            600        -5      913            854        19 B6         507 N516JB  EWR    FLL        158     1065     6      0 2013-01-01 06:00:00
# 8      557            600        -3      709            723       -14 EV        5708 N829AS  LGA    IAD         53      229     6      0 2013-01-01 06:00:00
# 9      557            600        -3      838            846        -8 B6          79 N593JB  JFK    MCO        140      944     6      0 2013-01-01 06:00:00
# 10      558            600        -2      753            745         8 AA         301 N3ALAA  LGA    ORD        138      733     6      0 2013-01-01 06:00:00
# # ... with 336,766 more rows

select()函数可以搭配使用一些辅助函数:

  • starts_with(“abc”):匹配以abc开头的名称
  • end_with(abc"):匹配以abc结尾的名称
  • contains(“xyz”):匹配含xyz的名称
  • matches(“”)
  • num_range(“x”,1:3):匹配x1,x2,x3

# 使用select()函数的变种rename()进行重命名
rename(flights,YEAR=year)
# # A tibble: 336,776 x 19
# YEAR month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest  air_time distance  hour minute
#   n<int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>     <dbl> <chr>    <int> <chr>   <chr>  <chr>    <dbl>    <dbl> <dbl>  <dbl>
# 1  2013     1     1      517            515         2      830            819        11 UA        1545 N14228  EWR    IAH        227     1400     5     15
# 2  2013     1     1      533            529         4      850            830        20 UA        1714 N24211  LGA    IAH        227     1416     5     29
# 3  2013     1     1      542            540         2      923            850        33 AA        1141 N619AA  JFK    MIA        160     1089     5     40
# 4  2013     1     1      544            545        -1     1004           1022       -18 B6         725 N804JB  JFK    BQN        183     1576     5     45
# 5  2013     1     1      554            600        -6      812            837       -25 DL         461 N668DN  LGA    ATL        116      762     6      0
# 6  2013     1     1      554            558        -4      740            728        12 UA        1696 N39463  EWR    ORD        150      719     5     58
# 7  2013     1     1      555            600        -5      913            854        19 B6         507 N516JB  EWR    FLL        158     1065     6      0
# 8  2013     1     1      557            600        -3      709            723       -14 EV        5708 N829AS  LGA    IAD         53      229     6      0
# 9  2013     1     1      557            600        -3      838            846        -8 B6          79 N593JB  JFK    MCO        140      944     6      0
# 10 2013     1     1      558            600        -2      753            745         8 AA         301 N3ALAA  LGA    ORD        138      733     6      0
# # ... with 336,766 more rows, and 1 more variable: time_hour <dttm>


# select()与everything()连用可以将某几列移到数据库的开头
select(flights,day,dep_time,everything())
# # A tibble: 336,776 x 19
# day dep_time  year month sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest  air_time distance  hour minute
#    <int>    <int> <int> <int>          <int>     <dbl>    <int>          <int>     <dbl> <chr>    <int> <chr>   <chr>  <chr>    <dbl>    <dbl> <dbl>  <dbl>
# 1     1      517  2013     1            515         2      830            819        11 UA        1545 N14228  EWR    IAH        227     1400     5     15
# 2     1      533  2013     1            529         4      850            830        20 UA        1714 N24211  LGA    IAH        227     1416     5     29
# 3     1      542  2013     1            540         2      923            850        33 AA        1141 N619AA  JFK    MIA        160     1089     5     40
# 4     1      544  2013     1            545        -1     1004           1022       -18 B6         725 N804JB  JFK    BQN        183     1576     5     45
# 5     1      554  2013     1            600        -6      812            837       -25 DL         461 N668DN  LGA    ATL        116      762     6      0
# 6     1      554  2013     1            558        -4      740            728        12 UA        1696 N39463  EWR    ORD        150      719     5     58
# 7     1      555  2013     1            600        -5      913            854        19 B6         507 N516JB  EWR    FLL        158     1065     6      0
# 8     1      557  2013     1            600        -3      709            723       -14 EV        5708 N829AS  LGA    IAD         53      229     6      0
# 9     1      557  2013     1            600        -3      838            846        -8 B6          79 N593JB  JFK    MCO        140      944     6      0
# 10    1      558  2013     1            600        -2      753            745         8 AA         301 N3ALAA  LGA    ORD        138      733     6      0
# # ... with 336,766 more rows, and 1 more variable: time_hour <dttm>


2.4 mutate() 添加新变量

除了筛选现有的列,我们还需要添加新列
mutate()总是将新列添加在数据框最后

(flights_sml <- select(flights,year:day,ends_with("delay"),distance,air_time))
# # A tibble: 336,776 x 7
# year month   day dep_delay arr_delay distance air_time
#   <int>   <int>  <int>    <dbl>    <dbl>    <dbl>    <dbl>
# 1  2013     1     1         2        11     1400      227
# 2  2013     1     1         4        20     1416      227
# 3  2013     1     1         2        33     1089      160
# 4  2013     1     1        -1       -18     1576      183
# 5  2013     1     1        -6       -25      762      116
# 6  2013     1     1        -4        12      719      150
# 7  2013     1     1        -5        19     1065      158
# 8  2013     1     1        -3       -14      229       53
# 9  2013     1     1        -3        -8      944      140
# 10 2013     1     1        -2         8      733      138
# # ... with 336,766 more rows

mutate(flights_sml,gain=arr_delay-dep_delay,speed=distance/air_time*60)
# # A tibble: 336,776 x 9
# year month   day dep_delay arr_delay distance air_time  gain speed
#   <int>  <int>  <int>     <dbl>    <dbl>    <dbl>    <dbl> <dbl> <dbl>
# 1  2013     1     1         2        11     1400      227     9  370.
# 2  2013     1     1         4        20     1416      227    16  374.
# 3  2013     1     1         2        33     1089      160    31  408.
# 4  2013     1     1        -1       -18     1576      183   -17  517.
# 5  2013     1     1        -6       -25      762      116   -19  394.
# 6  2013     1     1        -4        12      719      150    16  288.
# 7  2013     1     1        -5        19     1065      158    24  404.
# 8  2013     1     1        -3       -14      229       53   -11  259.
# 9  2013     1     1        -3        -8      944      140    -5  405.
# 10 2013     1     1        -2         8      733      138    10  319.
# # ... with 336,766 more rows

# 如果只想保留新变量,可以使用transmute()
transmute(flights,gain=arr_delay - dep_delay,hours=air_time/60,gain_per_hour=gain/hours)
# # A tibble: 336,776 x 3
# gain hours gain_per_hour
#   <dbl> <dbl>         <dbl>
# 1     9 3.78           2.38
# 2    16 3.78           4.23
# 3    31 2.67          11.6 
# 4   -17 3.05          -5.57
# 5   -19 1.93          -9.83
# 6    16 2.5            6.4 
# 7    24 2.63           9.11
# 8   -11 0.883        -12.5 
# 9    -5 2.33          -2.14
# 10   10 2.3            4.35
# # ... with 336,766 more rows

2.4.1 常用创建函数

创建新变量的多种函数可以同mutate()一同使用,需要注意的是,这种函数必须是向量化的,也即必须要接收一个向量的输入,并返回一个向量作为输出,而且输入与输出具有同样数目的分量

10%/%3
# [1] 3
10%%3
# [1] 1
lead(seq(1:20))
 [1]  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 NA
lead(seq(1:20),3)
 [1]  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 NA NA NA

lag(seq(1:20))
 [1] NA  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19
lag(seq(1:20),3)
 [1] NA NA NA  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17

cumsum(seq(1:10))
# [1]  1  3  6 10 15 21 28 36 45 55
cumprod(seq(1:5))
# [1]   1   2   6  24 120
cummean(seq(1:5))
# [1] 1.0 1.5 2.0 2.5 3.0
z <- c(1,2,2,NA,3,4)
min_rank(z)
# [1]  1  2  2 NA  4  5
min_rank(desc(z))
# [1]  5  3  3 NA  2  1
2.5 group_by(),summarize() 分组

summarize()必须要与group_by()函数一起使用

summarise(flights,delay=mean(dep_delay, na.rm = T))
# A tibble: 1 x 1
#   delay
#   <dbl>
# 1  12.6

by.day <- group_by(flights,year,month,day)
summarise(by.day,delay=mean(dep_delay,na.rm = T))
`summarise()` regrouping output by 'year', 'month' (override with `.groups` argument)
# # A tibble: 365 x 4
# # Groups:   year, month [12]
#   year   month   day delay
#   <int>   <int> <int> <dbl>
# 1  2013     1     1 11.5 
# 2  2013     1     2 13.9 
# 3  2013     1     3 11.0 
# 4  2013     1     4  8.95
# 5  2013     1     5  5.73
# 6  2013     1     6  7.15
# 7  2013     1     7  5.42
# 8  2013     1     8  2.55
# 9  2013     1     9  2.28
# 10 2013     1    10  2.84
# # ... with 355 more rows

2.6 %>% 管道符
2.6.1 用法:

x %>% f(y) == f(x,y)
x %>% f(y) %>% g(z) == g(f(x,y),z)

使用管道符,可以更清晰的看到数据的转换过程,使得代码具有更好的可读性,可以将%>%读作“然后”

by.dest <- group_by(flights,dest)
delay <- summarise(by.dest,count=n(),dist=mean(distance,na.rm = T),delay=mean(arr_delay,na.rm = T))
(delay <- filter(delay,count>20,dest != "HNL"))
# # A tibble: 96 x 4
#   dest  count  dist delay
#   <chr> <int> <dbl> <dbl>
# 1 ABQ     254 1826   4.38
# 2 ACK     265  199   4.85
# 3 ALB     439  143  14.4 
# 4 ATL   17215  757. 11.3 
# 5 AUS    2439 1514.  6.02
# 6 AVL     275  584.  8.00
# 7 BDL     443  116   7.05
# 8 BGR     375  378   8.03
# 9 BHM     297  866. 16.9 
# 10 BNA    6333  758. 11.8 
# # ... with 86 more rows

# 下述两种方式,可以与以上输出得到相同的结果
by.dest <- group_by(flights,dest) %>%
  summarise(count=n(),dist=mean(distance,na.rm = T),delay=mean(arr_delay,na.rm = T)) %>%
  filter(count>20,dest != "HNL"))

# 将代码()扩出来,可以同时完成赋值及输出
(# 赋值
by.dest <- flights %>%
# 然后分组
group_by(dest) %>%
#然后统计,输出 
summarise(count=n(),dist=mean(distance,na.rm = T),delay=mean(arr_delay,na.rm = T)) %>%
  filter(count>20,dest != "HNL") )
2.6.2 缺失值
(
flights %>%
  group_by(year,month,day) %>%
  summarise(mean=mean(dep_delay))
  )            
# # A tibble: 365 x 4
# # Groups:   year, month [12]
# year month   day  mean
# <int> <int> <int> <dbl>
#   1  2013     1     1    NA
# 2  2013     1     2    NA
# 3  2013     1     3    NA
# 4  2013     1     4    NA
# 5  2013     1     5    NA
# 6  2013     1     6    NA
# 7  2013     1     7    NA
# 8  2013     1     8    NA
# 9  2013     1     9    NA
# 10  2013     1    10    NA
# # ... with 355 more rows

# 聚合函数在使用的时候:如果输入值为NA,那么输出值也是NA
# 通过设置na.rm = T,可以在计算前出去缺失值NA
(
  flights %>%
    group_by(year,month,day) %>%
    summarise(mean=mean(dep_delay,na.rm = T))
  )  
# # A tibble: 365 x 4
# # Groups:   year, month [12]
# year month   day  mean
# <int> <int> <int> <dbl>
#   1  2013     1     1 11.5 
# 2  2013     1     2 13.9 
# 3  2013     1     3 11.0 
# 4  2013     1     4  8.95
# 5  2013     1     5  5.73
# 6  2013     1     6  7.15
# 7  2013     1     7  5.42
# 8  2013     1     8  2.55
# 9  2013     1     9  2.28
# 10  2013     1    10  2.84
# # ... with 355 more rows

# 本例中,缺失值NA表示取消的航班,所以我们可以通过先去除取消的航班来解决缺失值问题

not_cancelled <- flights %>%
  filter(!is.na(dep_delay),!is.na(arr_delay)) 
not_cancelled %>%
    group_by(year,month,day) %>%
    summarise(mean=mean(dep_delay))

# # A tibble: 365 x 4
# # Groups:   year, month [12]
#   year month   day  mean
#   <int> <int> <int> <dbl>
# 1  2013     1     1 11.4 
# 2  2013     1     2 13.7 
# 3  2013     1     3 10.9 
# 4  2013     1     4  8.97
# 5  2013     1     5  5.73
# 6  2013     1     6  7.15
# 7  2013     1     7  5.42
# 8  2013     1     8  2.56
# 9  2013     1     9  2.30
# 10  2013     1    10  2.84
# # ... with 355 more rows


2.6.3 计数n()

聚合函数操作中包括一个计数n() 或者非缺失值的计数sum(!is.na()),可以帮助我们检查是否基于非常少量的数据做出结论

 delays <- not_cancelled %>%
      group_by(tailnum) %>%
      summarise(delay=mean(arr_delay))
    ggplot(delays,aes(x=delay)) + geom_freqpoly(binwidth=10)

image.png
    delays <- not_cancelled %>%
      group_by(tailnum) %>%
      summarise(delay=mean(arr_delay,na.rm = T),n=n())
    ggplot(delays,aes(x=n,y=delay)) + geom_point(alpha=1/10)

image.png
delays %>%
  filter(n<600 & n>20) %>%
  ggplot(aes(x=n,y=delay)) + geom_point(alpha=1/10)

delays %>%
  filter(!(n>600|n<20)) %>%
  ggplot(aes(x=n,y=delay)) + geom_point(alpha=1/10)

delays %>%
filter(n<=600,n>=20) %>%
  ggplot(aes(x=n,y=delay)) + geom_point(alpha=1/10)
image.png
ctrl+shift+P可以将上一次发送至控制台的代码段重新发送一次,在本实验中,可以用于多次修改n值后,查看输出的图形
2.7 常用的摘要函数

只是使用均值,计数和求和是远远不够的

2.7.1 位置度量 median(x)

mean(x):均值
median(x):中位数,指的是50%的数会大于x,50%的数会小于x

not_cancelled  %>%
  group_by(year,month,day) %>%
  summarise(
    ave_delay1=mean(arr_delay),
    ave_delay2=mean(arr_delay[arr_delay>0])
  )
  # # A tibble: 365 x 5
  # # Groups:   year, month [12]
  # year month   day ave_delay1 ave_delay2
  # <int> <int> <int>      <dbl>      <dbl>
  #   1  2013     1     1     12.7         32.5
  # 2  2013     1     2     12.7         32.0
  # 3  2013     1     3      5.73        27.7
  # 4  2013     1     4     -1.93        28.3
  # 5  2013     1     5     -1.53        22.6
  # 6  2013     1     6      4.24        24.4
  # 7  2013     1     7     -4.95        27.8
  # 8  2013     1     8     -3.23        20.8
  # 9  2013     1     9     -0.264       25.6
  # 10  2013     1    10     -5.90        27.3
  # # ... with 355 more rows

2.7.2 分散程度度量 sd(x), IQR(x), mad(x)

sd(x) 标准误差
IQR(x) 四分位距
mad(x) 绝对中位差

not_cancelled  %>%
  group_by(dest) %>%
  summarise(
    sd=sd(distance),
    IQR=IQR(distance),
    mad=mad(distance)
  ) %>%
arrange(desc(sd))  
   
# # A tibble: 104 x 4
#   dest     sd   IQR   mad
#   <chr> <dbl> <dbl> <dbl>
# 1 EGE   10.5     21  1.48
# 2 SAN   10.4     21  0   
# 3 SFO   10.2     21  0   
# 4 HNL   10.0     20  0   
# 5 SEA    9.98    20  0   
# 6 LAS    9.91    21  0   
# 7 PDX    9.87    20  0   
# 8 PHX    9.86    20  0   
# 9 LAX    9.66    21  0   
# 10 IND    9.46    20  0   
2.7.3 秩的度量 min(x), quantitle(x,0.25), max(x)
not_cancelled  %>%
  group_by(year,month,day) %>%
  summarise(
    first=min(dep_time),
    last=max(dep_time)
  ) 
# # A tibble: 365 x 5
# # Groups:   year, month [12]
#   year month   day first  last
#   <int> <int> <int> <int> <int>
# 1  2013     1     1   517  2356
# 2  2013     1     2    42  2354
# 3  2013     1     3    32  2349
# 4  2013     1     4    25  2358
# 5  2013     1     5    14  2357
# 6  2013     1     6    16  2355
# 7  2013     1     7    49  2359
# 8  2013     1     8   454  2351
# 9  2013     1     9     2  2252
# 10  2013     1    10     3  2320
# # ... with 355 more rows  
  
2.7.4 定位度量 first(x), nth(x), last(x)
not_cancelled  %>%
  group_by(year,month,day) %>%
  summarise(
    first=first(dep_time),
    nth=nth(dep_time,1),
    last=last(dep_time)
  )

# # A tibble: 365 x 6
# # Groups:   year, month [12]
#   year month   day first   nth  last
#   <int> <int> <int> <int> <int> <int>
# 1  2013     1     1   517   517  2356
# 2  2013     1     2    42    42  2354
# 3  2013     1     3    32    32  2349
# 4  2013     1     4    25    25  2358
# 5  2013     1     5    14    14  2357
# 6  2013     1     6    16    16  2355
# 7  2013     1     7    49    49  2359
# 8  2013     1     8   454   454  2351
# 9  2013     1     9     2     2  2252
# 10  2013     1    10     3     3  2320
# # ... with 355 more rows
2.7.5 计数 n()
not_cancelled  %>%
  group_by(dest) %>%
  summarise(
    carrier.na = sum(!is.na(carrier)),
    carriers = n_distinct(carrier),
    )
# # A tibble: 104 x 3
#   dest  carrier.na carriers
#   <chr>      <int>    <int>
# 1 ABQ          254        1
# 2 ACK          264        1
# 3 ALB          418        1
# 4 ANC            8        1
# 5 ATL        16837        7
# 6 AUS         2411        6
# 7 AVL          261        2
# 8 BDL          412        2
# 9 BGR          358        1
# 10 BHM          269        1
# # ... with 94 more rows  

not_cancelled  %>%
  count(dest) 
# # A tibble: 104 x 2
#   dest      n
#   <chr> <int>
# 1 ABQ     254
# 2 ACK     264
# 3 ALB     418
# 4 ANC       8
# 5 ATL   16837
# 6 AUS    2411
# 7 AVL     261
# 8 BDL     412
# 9 BGR     358
# 10 BHM     269
# # ... with 94 more rows

not_cancelled  %>%
  count(tailnum,wt=distance)
# # A tibble: 4,037 x 2
#   tailnum      n
#   <chr>    <dbl>
# 1 D942DN    3418
# 2 N0EGMQ  239143
# 3 N10156  109664
# 4 N102UW   25722
# 5 N103US   24619
# 6 N104UW   24616
# 7 N10575  139903
# 8 N105UW   23618
# 9 N107US   21677
# 10 N108UW   32070

逻辑值的计数及比例

not_cancelled  %>%
  group_by(year,month,day) %>%
  summarise(n_early = sum(dep_time<500),
            hour_perc=mean(arr_delay>60))
# # A tibble: 365 x 5
# # Groups:   year, month [12]
#   year  month   day n_early hour_perc
#   <int> <int> <int>   <int>     <dbl>
# 1  2013     1     1       0    0.0722
# 2  2013     1     2       3    0.0851
# 3  2013     1     3       4    0.0567
# 4  2013     1     4       3    0.0396
# 5  2013     1     5       3    0.0349
# 6  2013     1     6       2    0.0470
# 7  2013     1     7       2    0.0333
# 8  2013     1     8       1    0.0213
# 9  2013     1     9       3    0.0202
# 10  2013     1    10       3    0.0183
# # ... with 355 more rows
2.8 按多个变量分组

当使用多个变量进行分组时,每次的摘要统计会用掉一个分组变量,后面的变量会在前面变量的基础上循序渐进

(daily <- group_by(flights,year,month,day))
# # A tibble: 336,776 x 19
# # Groups:   year, month, day [365]
# year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest 
# <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>     <dbl> <chr>    <int> <chr>   <chr>  <chr>
#   1  2013     1     1      517            515         2      830            819        11 UA        1545 N14228  EWR    IAH  
# 2  2013     1     1      533            529         4      850            830        20 UA        1714 N24211  LGA    IAH  
# 3  2013     1     1      542            540         2      923            850        33 AA        1141 N619AA  JFK    MIA  
# 4  2013     1     1      544            545        -1     1004           1022       -18 B6         725 N804JB  JFK    BQN  
# 5  2013     1     1      554            600        -6      812            837       -25 DL         461 N668DN  LGA    ATL  
# 6  2013     1     1      554            558        -4      740            728        12 UA        1696 N39463  EWR    ORD  
# 7  2013     1     1      555            600        -5      913            854        19 B6         507 N516JB  EWR    FLL  
# 8  2013     1     1      557            600        -3      709            723       -14 EV        5708 N829AS  LGA    IAD  
# 9  2013     1     1      557            600        -3      838            846        -8 B6          79 N593JB  JFK    MCO  
# 10  2013     1     1      558            600        -2      753            745         8 AA         301 N3ALAA  LGA    ORD  
# # ... with 336,766 more rows, and 5 more variables: air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
# #   time_hour <dttm>



# 取消分组
daily %>%  ungroup()
# # A tibble: 336,776 x 19
# year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest 
# <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>     <dbl> <chr>    <int> <chr>   <chr>  <chr>
#   1  2013     1     1      517            515         2      830            819        11 UA        1545 N14228  EWR    IAH  
# 2  2013     1     1      533            529         4      850            830        20 UA        1714 N24211  LGA    IAH  
# 3  2013     1     1      542            540         2      923            850        33 AA        1141 N619AA  JFK    MIA  
# 4  2013     1     1      544            545        -1     1004           1022       -18 B6         725 N804JB  JFK    BQN  
# 5  2013     1     1      554            600        -6      812            837       -25 DL         461 N668DN  LGA    ATL  
# 6  2013     1     1      554            558        -4      740            728        12 UA        1696 N39463  EWR    ORD  
# 7  2013     1     1      555            600        -5      913            854        19 B6         507 N516JB  EWR    FLL  
# 8  2013     1     1      557            600        -3      709            723       -14 EV        5708 N829AS  LGA    IAD  
# 9  2013     1     1      557            600        -3      838            846        -8 B6          79 N593JB  JFK    MCO  
# 10  2013     1     1      558            600        -2      753            745         8 AA         301 N3ALAA  LGA    ORD  
# # ... with 336,766 more rows, and 5 more variables: air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
# #   time_hour <dttm>

2.9 分组新变量

group()可以与mutate(),filter(),select(),arrange()等函数一起使用

flights %>%
  group_by(year,month,day) %>%
  filter(rank(desc(arr_delay))<=10)
# # A tibble: 3,609 x 19
# # Groups:   year, month, day [365]
#   year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest 
#   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>     <dbl> <chr>    <int> <chr>   <chr>  <chr>
# 1  2013     1     1      848           1835       853     1001           1950       851 MQ        3944 N942MQ  JFK    BWI  
# 2  2013     1     1     1815           1325       290     2120           1542       338 EV        4417 N17185  EWR    OMA  
# 3  2013     1     1     1842           1422       260     1958           1535       263 EV        4633 N18120  EWR    BTV  
# 4  2013     1     1     1938           1703       155     2109           1823       166 EV        4300 N18557  EWR    RIC  
# 5  2013     1     1     1942           1705       157     2124           1830       174 MQ        4410 N835MQ  JFK    DCA  
# 6  2013     1     1     2006           1630       216     2230           1848       222 EV        4644 N14972  EWR    SAV  
# 7  2013     1     1     2115           1700       255     2330           1920       250 9E        3347 N924XJ  JFK    CVG  
# 8  2013     1     1     2205           1720       285       46           2040       246 AA        1999 N5DNAA  EWR    MIA  
# 9  2013     1     1     2312           2000       192       21           2110       191 EV        4312 N13958  EWR    DCA  
# 10  2013     1     1     2343           1724       379      314           1938       456 EV        4321 N21197  EWR    MCI  
# # ... with 3,599 more rows, and 5 more variables: air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>




flights %>%
  group_by(dest) %>%
  filter(n()>365)
# # A tibble: 332,577 x 19
# # Groups:   dest [77]
#   year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest 
#   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>     <dbl> <chr>    <int> <chr>   <chr>  <chr>
# 1  2013     1     1      517            515         2      830            819        11 UA        1545 N14228  EWR    IAH  
# 2  2013     1     1      533            529         4      850            830        20 UA        1714 N24211  LGA    IAH  
# 3  2013     1     1      542            540         2      923            850        33 AA        1141 N619AA  JFK    MIA  
# 4  2013     1     1      544            545        -1     1004           1022       -18 B6         725 N804JB  JFK    BQN  
# 5  2013     1     1      554            600        -6      812            837       -25 DL         461 N668DN  LGA    ATL  
# 6  2013     1     1      554            558        -4      740            728        12 UA        1696 N39463  EWR    ORD  
# 7  2013     1     1      555            600        -5      913            854        19 B6         507 N516JB  EWR    FLL  
# 8  2013     1     1      557            600        -3      709            723       -14 EV        5708 N829AS  LGA    IAD  
# 9  2013     1     1      557            600        -3      838            846        -8 B6          79 N593JB  JFK    MCO  
# 10  2013     1     1      558            600        -2      753            745         8 AA         301 N3ALAA  LGA    ORD  
# # ... with 332,567 more rows, and 5 more variables: air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
# #   time_hour <dttm>

Reference

R数据科学,哈德利·威克姆,加勒特·格罗勒芒德等 著

上一篇下一篇

猜你喜欢

热点阅读