R语言与统计分析数据科学与R语言生物信息学与算法

[R语言] dpylr

2020-04-02  本文已影响0人  半为花间酒

《R for Data Science》第五章 Data transformation 啃书知识点积累

参考书籍

  1. 《R for data science》
  2. 《R数据科学》
library(nycflights13)
library(tidyverse)

dim(flights %>% 
  filter(!is.na(dep_time)))
# [1] 328521     19
dim(flights %>% 
  filter(!is.na(arr_time)))
# [1] 328063     19
dim(flights %>% 
  filter(!is.na(arr_delay)))
# [1] 327346     19

结论:
成功出发的航班 328521个
有到达延误数据的航班 327346个
成功到达的航班 328063个

分析:
有 458个 航班出发但未到达
有 717个 航班的到达延误数据丢失

操作:
重新修复717个数据,形成新数据框

# 根据多个条件获取arr_time和sched_arr_time相对于出发当天0点的绝对分钟数
new_flights <- flights %>% 
  mutate(
    arr_time_real = case_when(
      (sched_dep_time %/% 100 * 60 + sched_dep_time %% 100 + dep_delay > 1439) | 
        (arr_time < dep_time) ~ arr_time %/% 100 * 60 + arr_time %% 100 + 1440,
      !((sched_dep_time %/% 100 * 60 + sched_dep_time %% 100 + dep_delay > 1439) | 
          (arr_time < dep_time)) ~ arr_time %/% 100 * 60 + arr_time %% 100),
    arr_time_sched = case_when(
      sched_arr_time < sched_dep_time ~ 
        sched_arr_time %/% 100 * 60 + sched_arr_time %% 100 + 1440,
      sched_arr_time >= sched_dep_time ~
        sched_arr_time %/% 100 * 60 + sched_arr_time %% 100),
    arr_delay_real = arr_time_real - arr_time_sched)

# 用两个代码检验新的数据集
# 检查修复后存在到达延误数据的航班数
new_flights %>% 
  filter(!is.na(arr_delay_real))
# A tibble: 328,063 x 22

# 检验新求出的差值和原本存在的数据是否有差异,如果有差异说明该计算方法有错
new_flights %>% 
  filter(arr_delay_real == arr_delay)
# A tibble: 327,346 x 22

dplyr五个重要方法:

  • Pick observations by their values (filter()).
  • Reorder the rows (arrange()).
  • Pick variables by their names (select()).
  • Create new variables with functions of existing variables (mutate()).
  • Collapse many values down to a single summary (summarise()).

1. filter()

1 / 49 * 49 == 1
#> [1] FALSE

# 可以用near()解决
near(1 / 49 * 49, 1)
#> [1] TRUE
异或 xor(a, b) 相异为真
flights %>%
  filter(month %in% c(7:9))
# A tibble: 86,326 x 19

!(x & y) is the same as !x | !y, and !(x | y) is the same as !x & !y

filter(flights, !(arr_delay > 120 | dep_delay > 120))
filter(flights, arr_delay <= 120, dep_delay <= 120)

filter() only includes rows where the condition is TRUE; it excludes both FALSE and NA values

df <- tibble(x = c(1, NA, 3))
filter(df, x > 1)
#> # A tibble: 1 x 1
#>       x
#>   <dbl>
#> 1     3

其他缺失值相关的补充

NA ^ 0
#> 1
NA | TRUE
#> TRUE
FALSE & NA
#> FALSE

if the the number is Inf, then the result of the multiplication will be NaN

NA * 0
#> NA

缺失值NA的默认类型是逻辑型

# between()表示一个变量落在两数之间,两边闭合区间

dplyr::filter(flights, dep_time >= 0 & dep_time <= 600)
# 等价于
dplyr::filter(flights, between(dep_time, 0, 600))

2. arrange()

Missing values are always sorted at the end

df <- tibble(x = c(5, 2, NA))
arrange(df, x)
#> # A tibble: 3 x 1
#>       x
#>   <dbl>
#> 1     2
#> 2     5
#> 3    NA
arrange(df, desc(x))
#> # A tibble: 3 x 1
#>       x
#>   <dbl>
#> 1     5
#> 2     2
#> 3    NA
dplyr::arrange(flights, desc(is.na(x), x))

# 如下
dplyr::arrange(flights, desc(is.na(dep_time), dep_time))
dplyr::arrange(iris,iris$Sepal.Length) # 结果行索引重置
iris[order(iris$Sepal.Length),] # 结果行索引不重置

3. select()

  1. starts_with("abc"): matches names that begin with “abc”.
  2. ends_with("xyz"): matches names that end with “xyz”.
  3. contains("ijk"): matches names that contain “ijk”.
  4. num_range("x", 1:3): matches x1, x2 and x3.
  5. matches("(.)\1"): selects variables that match a regular expression.
rename(flights, tail_num = tailnum)
#> # A tibble: 336,776 x 19
#>    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
#>   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
#> 1  2013     1     1      517            515         2      830            819
#> 2  2013     1     1      533            529         4      850            830
#> 3  2013     1     1      542            540         2      923            850
#> 4  2013     1     1      544            545        -1     1004           1022
#> 5  2013     1     1      554            600        -6      812            837
#> 6  2013     1     1      554            558        -4      740            728
#> # … with 3.368e+05 more rows, and 11 more variables: arr_delay <dbl>,
#> #   carrier <chr>, flight <int>, tail_num <chr>, origin <chr>, dest <chr>,
#> #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
select(flights, time_hour, air_time, everything())
#> # A tibble: 336,776 x 19
#>   time_hour           air_time  year month   day dep_time sched_dep_time
#>   <dttm>                 <dbl> <int> <int> <int>    <int>          <int>
#> 1 2013-01-01 05:00:00      227  2013     1     1      517            515
#> 2 2013-01-01 05:00:00      227  2013     1     1      533            529
#> 3 2013-01-01 05:00:00      160  2013     1     1      542            540
#> 4 2013-01-01 05:00:00      183  2013     1     1      544            545
#> 5 2013-01-01 06:00:00      116  2013     1     1      554            600
#> 6 2013-01-01 05:00:00      150  2013     1     1      554            558
#> # … with 3.368e+05 more rows, and 12 more variables: dep_delay <dbl>,
#> #   arr_time <int>, sched_arr_time <int>, arr_delay <dbl>, carrier <chr>,
#> #   flight <int>, tailnum <chr>, origin <chr>, dest <chr>, distance <dbl>,
#> #   hour <dbl>, minute <dbl>

很实用,如果要把某一列移动到中间位置:

new_flights <-select(new_flights,year:arr_delay,
                     arr_delay_real,
                     everything())
vars <- c(
    "year", "month", "day", "dep_delay", "arr_delay",'no_exist')
dplyr::select(flights, one_of(vars))

# one_of()可以筛选向量中存在的列,如果全部存在则等价于
dplyr::select(flights, vars)

# 但如果有一列不存在则上述写法会报错,one_of()不会,只会出现警告

contains()本身不区分大小写

# 如果需要区分大小写可以在辅助函数内部加上ignore.case = FALSE
dplyr::select(flights, contains("TIME", ignore.case = FALSE))

4. mutate()

mutate(flights_sml,
  gain = dep_delay - arr_delay,
  hours = air_time / 60,
  gain_per_hour = gain / hours
)
#> # A tibble: 336,776 x 10
#>    year month   day dep_delay arr_delay distance air_time  gain hours
#>   <int> <int> <int>     <dbl>     <dbl>    <dbl>    <dbl> <dbl> <dbl>
#> 1  2013     1     1         2        11     1400      227    -9  3.78
#> 2  2013     1     1         4        20     1416      227   -16  3.78
#> 3  2013     1     1         2        33     1089      160   -31  2.67
#> 4  2013     1     1        -1       -18     1576      183    17  3.05
#> 5  2013     1     1        -6       -25      762      116    19  1.93
#> 6  2013     1     1        -4        12      719      150   -16  2.5 
#> # … with 3.368e+05 more rows, and 1 more variable: gain_per_hour <dbl>
transmute(flights,
  gain = dep_delay - arr_delay,
  hours = air_time / 60,
  gain_per_hour = gain / hours
)
#> # A tibble: 336,776 x 3
#>    gain hours gain_per_hour
#>   <dbl> <dbl>         <dbl>
#> 1    -9  3.78         -2.38
#> 2   -16  3.78         -4.23
#> 3   -31  2.67        -11.6 
#> 4    17  3.05          5.57
#> 5    19  1.93          9.83
#> 6   -16  2.5          -6.4 
#> # … with 3.368e+05 more rows

%/% (integer division) and %% (remainder)
where x == y * (x %/% y) + (x %% y)

  1. compute running differences (e.g. x - lag(x))
  2. find when values change (x != lag(x))
(x <- 1:10)
#>  [1]  1  2  3  4  5  6  7  8  9 10
lag(x)
#>  [1] NA  1  2  3  4  5  6  7  8  9
lead(x)
#>  [1]  2  3  4  5  6  7  8  9 10 NA

If you need rolling aggregates (i.e. a sum computed over a rolling window), try the RcppRoll package.

y <- c(1, 2, 2, NA, 3, 4)
min_rank(y)
#> [1]  1  2  2 NA  4  5
min_rank(desc(y))
#> [1]  5  3  3 NA  2  1
  1. row_number 通用排名,并列的名次结果按先后顺序不一样,靠前出现的元素排名在前
  2. min_rank 通用排名,并列的名次结果一样,占用下一名次。
  3. dense_rank 中国式排名,并列排名不占用名次
  4. percent_rank 按百分比的排名
  5. cume_dist 累计分布区间的排名
  6. ntile 粗略地把向量按堆排名,n即是堆的数量
row_number(y)
#> [1]  1  2  3 NA  4  5
dense_rank(y)
#> [1]  1  2  2 NA  3  4
percent_rank(y)
#> [1] 0.00 0.25 0.25   NA 0.75 1.00
cume_dist(y)
#> [1] 0.2 0.6 0.6  NA 0.8 1.0

5. summarize() + group_by()

flights %>% 
  group_by( year, month, day) %>% 
  summarise(delay = mean(dep_delay, na.rm = TRUE))

Fortunately, all aggregation functions have an na.rm argument which removes the missing values prior to computation

Measures of spread:sd(x), IQR(x), mad(x)
The root mean squared deviation, or standard deviation sd(x), is the standard measure of spread.
The interquartile range IQR(x) and median absolute deviation mad(x) are robust equivalents that may be more useful if you have outliers.

Measures of rank: min(x), quantile(x, 0.25), max(x)
Quantiles are a generalisation of the median. For example, quantile(x, 0.25) will find a value of x that is greater than 25% of the values, and less than the remaining 75%.

y <- c(1, 3, 4, 3, 77, 100)
quantile(y, 0.25)
#> 25% 
#> 3 
quantile(y, 0.5)
#> 50% 
#> 3.5 

Measures of position: first(x), nth(x, 2), last(x)
These work similarly to x[1], x[2], and x[length(x)] but let you set a default value if that position does not exist.

not_cancelled %>% 
  group_by(year, month, day) %>% 
  summarise(
    first_dep = first(dep_time), 
    last_dep = last(dep_time)
  )
#> # A tibble: 365 x 5
#> # Groups:   year, month [12]
#>    year month   day first_dep last_dep
#>   <int> <int> <int>     <int>    <int>
#> 1  2013     1     1       517     2356
#> 2  2013     1     2        42     2354
#> 3  2013     1     3        32     2349
#> 4  2013     1     4        25     2358
#> 5  2013     1     5        14     2357
#> 6  2013     1     6        16     2355
#> # … with 359 more rows

nth可以查找,也可以设置默认值

y <- c(1, 3, 4, 3, 77, 100)
nth(y, 3)
#> 4
nth(y, 8)
#> NA
nth(y, 8, default=10)
#> 10
> y <- c(33, 65, 22, 34, 44, 77, 14)
> range(y)
#> [1] 14 77

排秩后取极值跟直接取极值的方法比较

not_cancelled %>% 
  group_by(year, month, day) %>% 
  mutate(r = min_rank(desc(dep_time))) %>% 
  filter(r %in% range(r))
#> # A tibble: 770 x 20
#> # Groups:   year, month, day [365]
#>    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
#>   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
#> 1  2013     1     1      517            515         2      830            819
#> 2  2013     1     1     2356           2359        -3      425            437
#> 3  2013     1     2       42           2359        43      518            442
#> 4  2013     1     2     2354           2359        -5      413            437
#> 5  2013     1     3       32           2359        33      504            442
#> 6  2013     1     3     2349           2359       -10      434            445
#> # … with 764 more rows, and 12 more variables: arr_delay <dbl>, carrier <chr>,
#> #   flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
#> #   distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>, r <int>


not_cancelled %>% 
  group_by(year, month, day) %>% 
  summarise(
    first_dep = first(dep_time), 
    last_dep = last(dep_time)
  )
#> # A tibble: 365 x 5
#> # Groups:   year, month [12]
#>    year month   day first_dep last_dep
#>   <int> <int> <int>     <int>    <int>
#> 1  2013     1     1       517     2356
#> 2  2013     1     2        42     2354
#> 3  2013     1     3        32     2349
#> 4  2013     1     4        25     2358
#> 5  2013     1     5        14     2357
#> 6  2013     1     6        16     2355
#> # … with 359 more rows

To count the number of distinct (unique) values, use n_distinct(x)

# Which destinations have the most carriers?
not_cancelled %>% 
  group_by(dest) %>% 
  summarise(carriers = n_distinct(carrier)) %>% 
  arrange(desc(carriers))
#> # A tibble: 104 x 2
#>   dest  carriers
#>   <chr>    <int>
#> 1 ATL          7
#> 2 BOS          7
#> 3 CLT          7
#> 4 ORD          7
#> 5 TPA          7
#> 6 AUS          6
#> # … with 98 more rows

count()中可以加上权重wt,等价于group_by+summariz(sum)

new_flights %>% 
  filter(!is.na(arr_delay_real)) %>% 
  count(tailnum, wt=distance)

new_flights %>% 
  filter(!is.na(arr_delay_real)) %>% 
  group_by(tailnum) %>% 
  summarise(sum_distance = sum(distance))

# 二者结果一致,但count+权重的形成列名是n

Counts and proportions of logical values: sum(x > 10), mean(y == 0).
When used with numeric functions, TRUE is converted to 1 and FALSE to 0.
sum(x) gives the number of TRUEs in x, and mean(x) gives the proportion.

daily <- group_by(flights, year, month, day)

daily %>% 
  ungroup() %>%             # no longer grouped by date
  summarise(flights = n())  # all flights
#> # A tibble: 1 x 1
#>   flights
#>     <int>
#> 1  336776
# count()中sort参数默认是FALSE,即不排序,如果设置为TRUE则会按计数从大到小排序

new_flights %>% 
  filter(!is.na(dep_time)) %>% 
  count(dest, sort=F)
Tailnum <- new_flights %>% 
  group_by(tailnum) %>% 
  count() %>% 
  select(tailnum)

results <- new_flights %>% 
  filter(!is.na(arr_delay_real)) %>% 
  group_by(tailnum) %>% 
  arrange(year, month, day, hour, minute) %>% 
  mutate(cum_delay = arr_delay_real > 60,
         cumsum_delay = cumsum(cum_delay)) %>% 
  # 各组出现延误超过1h后累加就会变成1
  filter(cumsum_delay == 0) %>%
  count(tailnum) %>% 
  right_join(Tailnum, by='tailnum') 
  
# 将NA转换为0
results[is.na(results$n),2] <-0 ;results

6. Grouped mutates

分组过滤器filter()内排秩

# Find the worst members of each group

flights_sml %>% 
  group_by(year, month, day) %>%
  filter(rank(desc(arr_delay)) < 10)
#> # A tibble: 3,306 x 7
#> # Groups:   year, month, day [365]
#>    year month   day dep_delay arr_delay distance air_time
#>   <int> <int> <int>     <dbl>     <dbl>    <dbl>    <dbl>
#> 1  2013     1     1       853       851      184       41
#> 2  2013     1     1       290       338     1134      213
#> 3  2013     1     1       260       263      266       46
#> 4  2013     1     1       157       174      213       60
#> 5  2013     1     1       216       222      708      121
#> 6  2013     1     1       255       250      589      115
#> # … with 3,300 more rows

7. 其他用法

b <- data.frame(x1=c("A","B","D"),x3=c(T,F,T))
a <- data.frame(x1=c("A","B","C"),x2=c(1,2,3))
dplyr::inner_join(a,b,by="x1") # 内连接,交集
dplyr::full_join(a,b,by="x1") # 全连接,并集
dplyr::left_join(a,b,by="x1")
dplyr::right_join(a,b,by="x1")

> b
#>   x1    x3
#> 1  A  TRUE
#> 2  B FALSE
#> 3  D  TRUE
> a
#>   x1 x2
#> 1  A  1
#> 2  B  2
#> 3  C  3

# 半连接,交集后保留左边的列
dplyr::semi_join(a,b,by="x1") 
#>   x1 x2
#> 1  A  1
#> 2  B  2
#> Warning message:
#> Column `x1` joining factors with different levels, coercing to character vector 

# 补连接,左表扣除半连接
dplyr::anti_join(a,b,by="x1")
#>   x1 x2
#> 1  C  3
#> Warning message:
#> Column `x1` joining factors with different levels, coercing to character vector 
# 类似unique的用法
dplyr::distinct(rbind(iris[1:10,],iris[1:15,])) 

# 取出10-15行
dplyr::slice(iris,10:15) 
# 随机抽取10行
dplyr::sample_n(iris,10) 
# 按比例随机抽取
dplyr::sample_frac(iris,0.2) 
上一篇 下一篇

猜你喜欢

热点阅读