数据-R语言-图表-决策-Linux-Python

R数据科学(三)dplyr

2018-11-17  本文已影响3人  子鹿学生信

CHAPTER 3 Data Transformation with dplyr

library(nycflights13)
library(tidyverse)

查看冲突信息发现dplyr与基本包的函数冲突,如想用基本包的函数,可以这样写:stats::filter(),stats::lag()。
本次演示数据为nycflights13::flights,包括336,776 flights that departed from New York City in 2013,数据来自US Bureau of Transportation Statistics。
查看具体信息:

# Open the help page for the dataset.
?flights
# Show the class vector of the object.
class(flights)
# Number of rows and columns.
dim(flights)
# Print the first rows.
head(flights)
# Open the data in the interactive data viewer.
View(flights)

可以发现该数据是一个 tibble,它是一种数据框(data frame),但经过改进,在 tidyverse 中使用更方便。

dplyr包的核心函数:

Filter Rows with filter()

# Keep only the rows where month is 1 and day is 1.
filter(flights, month == 1, day == 1)
# To keep the result, assign it to a variable.
jan1 <- filter(flights, month == 1, day == 1)
# To assign and print at the same time, wrap the assignment in parentheses.
(jan1 <- filter(flights, month == 1, day == 1))

Comparisons

比较运算符:>, >=, <, <=, != (not equal), and == (equal)。
几个易犯的错误:

# Floating-point arithmetic is approximate, so exact comparison with == can
# give FALSE for results that are mathematically equal.
sqrt(2) ^ 2 == 2
#> [1] FALSE
1/49 * 49 == 1
#> [1] FALSE
# near() is used instead of == for these comparisons and returns TRUE.
near(sqrt(2) ^ 2, 2)
#> [1] TRUE
near(1 / 49 * 49, 1)
#> [1] TRUE

Logical Operators

或 | ,与 & ,非 !

# Find the flights that departed in November or December.
filter(flights, month == 11 | month == 12)

使用 %in% 运算符可以简化这类选择:x %in% y 表示选出 x 中取值属于 y 的元素(注意 %in% 是成员判断运算符,不是管道符)

# Same selection as month == 11 | month == 12, written with %in%;
# the outer parentheses print the result as well as assigning it.
(nov_dec <- filter(flights, month %in% c(11,12)))

其他简化操作:!(x & y)等价于!x | !y,!(x | y)等价于!x & !y

# These two calls return the same rows (De Morgan: !(x | y) equals !x & !y;
# comma-separated conditions in filter() are combined with "and").
filter(flights, !(arr_delay > 120 | dep_delay > 120))
filter(flights, arr_delay <= 120, dep_delay <= 120)

Missing Values 缺失值

缺失值用 NA 表示(“not available”)
NULL表示空值,无

# Comparisons and arithmetic involving NA return NA: the result is unknown.
NA > 5
#> [1] NA
10 == NA
#> [1] NA
NA + 10
#> [1] NA

# Even NA == NA is NA, so == cannot be used to test for missing values.
NA == NA
#> [1] NA

is.na()检查是否是缺失值
注意 filter() 只保留条件为 TRUE 的行,条件为 FALSE 或 NA 的行都会被排除。
如果想保留缺失值,需要用 is.na() 明确指定:

# Small example table with one missing value.
df <- tibble(x=c(1,NA,3))
# The row with NA is dropped: NA > 1 is NA, not TRUE.
filter(df,x > 1)
#> # A tibble: 1 × 1
#> x
#> <dbl>
#> 1 3
# Ask for the missing rows explicitly to keep them.
filter(df, is.na(x) | x > 1)
#> # A tibble: 2 × 1
#> x
#> <dbl>
#> 1 NA
#> 2 3

练习题:
1.a选择延误到达大于等于两个小时的航班

# Look at the data in the interactive viewer first.
View(flights)
# Arrival delay of at least 120 (the exercise's "two hours", so minutes).
filter(flights, arr_delay >= 120)

b. The flights that flew to Houston are those flights where the destination (dest) is either “IAH” or “HOU”.

# Destination is either of the two airport codes.
filter(flights,dest=='IAH' | dest=='HOU')
# Alternatively, select with %in%.
filter(flights,dest %in% c('IAH','HOU'))

c. Were operated by United, American, or Delta

# Match the carrier codes AA, DL and UA for the three airlines named in the exercise.
filter(flights, carrier %in% c("AA", "DL", "UA"))

d.Departed in summer (July, August, and September)

# Months 7 through 9, both ends included.
filter(flights, month >= 7, month <= 9)

e. Arrived more than two hours late, but didn’t leave late

# Did not leave late (dep_delay <= 0) but arrived more than 120 late.
filter(flights, dep_delay <= 0, arr_delay > 120)

f. Were delayed by at least an hour, but made up over 30 minutes in flight

# Departure delay of at least 60, and the arrival delay is more than 30
# smaller than the departure delay (time made up in flight).
filter(flights, dep_delay >= 60, dep_delay - arr_delay > 30)

g. Departed between midnight and 6 a.m. (inclusive)

# dep_time <= 600 covers departures up to 6 a.m.; the extra == 2400 term
# catches midnight. NOTE(review): presumably midnight is recorded as 2400
# rather than 0 -- confirm against ?flights.
filter(flights, dep_time <= 600 | dep_time == 2400)
  1. between()的作用
    between(x, left, right) 与 x >= left & x <= right 相同
# between(month, 7, 9) is shorthand for the two-sided comparison shown below.
filter(flights, between(month, 7, 9))
# month >= 7 & month <= 9
  1. 找出 dep_time 为缺失值(NA)的航班
# Rows whose dep_time is NA.
filter(flights, is.na(dep_time))
  1. Why is NA ^ 0 not missing? Why is NA | TRUE not missing?
    Why is FALSE & NA not missing? Can you figure out the general
    rule? (NA * 0 is a tricky counterexample!)
# General rule: the result is not missing when it would be the same no matter
# which value replaced NA.
NA ^ 0
#> [1] 1
NA | TRUE
#> [1] TRUE
NA & FALSE
#> [1] FALSE
# Here the result depends on the unknown value, so it stays NA.
NA | FALSE
#> [1] NA
NA & TRUE
#> [1] NA
# NA * 0 is the counterexample: x * 0 is not 0 for every x, as the two
# infinite cases below show, so the result stays NA.
NA * 0
#> [1] NA
Inf * 0
#> [1] NaN
-Inf * 0
#> [1] NaN

Arrange Rows with arrange() 对行排序

# Sort rows by year, then month, then day.
arrange(flights, year, month, day)
# desc() sorts in descending order.
arrange(flights, desc(arr_delay))
df <- tibble(x = c(5, 2, NA))
arrange(df, x) # missing values are sorted to the end
arrange(df, desc(x))

Select Columns with select() 按列选择

# Select columns by name.
select(flights, year, month, day)
# Select every column from year through day.
select(flights, year:day)
# Select every column except those from year through day.
select(flights, -(year:day))
# Select the columns whose names end with "y".
select(flights, ends_with("y"))

rename() 重命名变量

# rename() takes new_name = old_name.
rename(flights, tail_num = tailnum)
# Put time_hour and air_time first; everything() then adds the remaining columns.
select(flights, time_hour, air_time, everything())

练习题:
Exercise 5.4.1.1 Brainstorm as many ways as possible to select dep_time, dep_delay, arr_time, and arr_delay from flights.

# Several equivalent ways to select exactly dep_time, dep_delay, arr_time and
# arr_delay from flights.

# By bare column name.
select(flights, dep_time, dep_delay, arr_time, arr_delay)
# By column name given as strings.
select(flights, "dep_time", "dep_delay", "arr_time", "arr_delay")
# By column position: dep_time = 4, dep_delay = 6, arr_time = 7, arr_delay = 9.
# Positions 5 and 8 are sched_dep_time and sched_arr_time and must be skipped.
select(flights, 4, 6, 7, 9)
# From a character vector of names passed inline ...
select(flights, one_of(c("dep_time", "dep_delay", "arr_time", "arr_delay")))

# ... or stored in a variable first.
variables <- c("dep_time", "dep_delay", "arr_time", "arr_delay")
select(flights, one_of(variables))
# By prefix: only the four target columns start with "dep_" or "arr_".
select(flights, starts_with("dep_"), starts_with("arr_"))
# By a regular expression anchored at both ends.
select(flights, matches("^(dep|arr)_(time|delay)$"))

# Counterexamples -- these do NOT return exactly the four columns.
# ends_with() also matches sched_arr_time and sched_dep_time, and the two
# delay columns are missing.
select(flights, ends_with("arr_time"), ends_with("dep_time"))
# contains("_time") also matches sched_dep_time, sched_arr_time and air_time,
# and dep_delay is missing.
select(flights, contains("_time"), contains("arr_"))

Exercise 5.4.1.2 What happens if you include the name of a variable multiple times in a select() call?

# select() ignores repeated names; each column is kept once, at its first position.
select(flights, year, month, day, year, year)
# everything() selects all remaining columns, so arr_delay ends up first.
select(flights, arr_delay, everything())

Exercise 5.4.1.3 What does the one_of() function do? Why might it be helpful in conjunction with this vector?

# one_of() lets a character vector of column names be passed to select().
vars <- c("year", "month", "day", "dep_delay", "arr_delay")
select(flights, one_of(vars))

Exercise 5.4.1.4 Does the result of running the following code surprise you? How do the select helpers deal with case by default? How can you change that default?

select(flights, contains("TIME"))
# contains() ignores case by default; the ignore.case argument changes that.
select(flights, contains("TIME", ignore.case = FALSE))

Add New Variables with mutate() 添加新列

# Work on a narrower table so the added columns are visible when printed.
flights_sml <- flights %>%
  select(year:day, ends_with("delay"), distance, air_time)

# mutate() appends new columns to the existing ones.
flights_sml %>%
  mutate(
    gain = arr_delay - dep_delay,
    speed = distance / air_time * 60
  )

# Later expressions may refer to columns created earlier in the same call.
flights_sml %>%
  mutate(
    gain = arr_delay - dep_delay,
    hours = air_time / 60,
    gain_per_hour = gain / hours
  )

# transmute() keeps only the newly created columns.
flights %>%
  transmute(
    gain = arr_delay - dep_delay,
    hours = air_time / 60,
    gain_per_hour = gain / hours
  )

mutate() 中可以使用各种函数来创建新变量,但这些函数必须是向量化的:输入一个向量,输出一个长度相同的向量。常用的创建函数示例如下:

# Example input vector; the parentheses print it after assignment.
(x <- 1:10)
# Offsets: lead() and lag() shift the values forwards / backwards.
lead(x)
lag(x)
# Running aggregates: cumulative sum and cumulative mean.
cumsum(x)
cummean(x)

5.6 Grouped summaries with summarise() 折叠数据框,一般与group_by()连用

by_day <- group_by(flights, year, month, day) # define the grouping
summarize(by_day, delay = mean(dep_delay, na.rm = TRUE)) # summarize() applies the summary function to each group

Combining Multiple Operations with the Pipe

head(flights)

# Step-by-step version, naming every intermediate result.
# 1. Group the flights by destination.
by_dest <- group_by(flights, dest)
# 2. Per destination: number of flights, mean distance, mean arrival delay.
delay <- summarise(
  by_dest,
  count = n(),
  dist = mean(distance, na.rm = TRUE),
  delay = mean(arr_delay, na.rm = TRUE)
)
# 3. Keep destinations with more than 20 flights, excluding "HNL".
delay <- filter(delay, count > 20 & dest != "HNL")

ggplot(delay, aes(x = dist, y = delay)) +
  geom_point(aes(size = count), alpha = 1 / 3) +
  geom_smooth(se = FALSE)

# The same computation written as one pipeline: %>% passes the result of each
# step to the next one.
delays <- flights %>%
  group_by(dest) %>%
  summarise(
    count = n(),
    dist = mean(distance, na.rm = TRUE),
    delay = mean(arr_delay, na.rm = TRUE)
  ) %>%
  filter(count > 20 & dest != "HNL")

Missing Values 的处理 na.rm 参数

# Without na.rm, a single NA in a group makes that group's mean NA, so missing
# values have to be removed first.
flights %>%
  group_by(year, month, day) %>%
  summarise(mean = mean(dep_delay))

# na.rm = TRUE drops the missing values before averaging.
flights %>%
  group_by(year, month, day) %>%
  summarise(mean = mean(dep_delay, na.rm = TRUE))

# Alternatively, remove the rows with a missing delay up front.
not_cancelled <- flights %>%
  filter(!is.na(dep_delay) & !is.na(arr_delay))
not_cancelled %>%
  group_by(year, month, day) %>%
  summarise(mean = mean(dep_delay))

Counts

# Mean arrival delay per tail number.
delays <- not_cancelled %>%
  group_by(tailnum) %>%
  summarise(delay = mean(arr_delay))

ggplot(data = delays, mapping = aes(x = delay)) +
  geom_freqpoly(binwidth = 10)


# Same summary, plus the number of flights behind each mean.
delays <- not_cancelled %>%
  group_by(tailnum) %>%
  summarise(
    delay = mean(arr_delay, na.rm = TRUE),
    n = n()
  )
ggplot(delays, aes(x = n, y = delay)) +
  geom_point(alpha = 1 / 10)

# Restrict the plot to tail numbers with more than 25 flights.
delays %>%
  filter(n > 25) %>%
  ggplot(aes(x = n, y = delay)) +
  geom_point(alpha = 1 / 10)

library(Lahman)
# Convert to a tibble for nicer printing.
batting <- as_tibble(Lahman::Batting)

# Per player: ba = total H divided by total AB, ab = total AB.
batters <- batting %>%
  group_by(playerID) %>%
  summarize(
    ba = sum(H, na.rm = TRUE) / sum(AB, na.rm = TRUE),
    ab = sum(AB, na.rm = TRUE)
  )

# Plot ba against ab for players with more than 100 AB.
# se = FALSE is spelled out: F is an ordinary variable that can be reassigned.
batters %>%
  filter(ab > 100) %>%
  ggplot(aes(ab, ba)) +
  geom_point() +
  geom_smooth(se = FALSE)

# Sort players by ba in descending order.
batters %>% arrange(desc(ba))
# Common summary functions -- location: mean(x), median(x).
not_cancelled %>%
  group_by(year, month, day) %>%
  summarise(
    avg_delay1 = mean(arr_delay),
    avg_delay2 = mean(arr_delay[arr_delay > 0])
  )

# Spread: sd(x), interquartile range IQR(x), median absolute deviation mad(x).
not_cancelled %>%
  group_by(dest) %>%
  summarise(distance_sd = sd(distance)) %>%
  arrange(desc(distance_sd))

# Rank: min(x), quantile(x, 0.25), max(x).
# When do the first and last flights leave each day?
not_cancelled %>%
  group_by(year, month, day) %>%
  summarise(
    first = min(dep_time),
    last = max(dep_time)
  )

# Same question answered by ranking within each day and keeping the rows with
# the smallest and largest rank.
not_cancelled %>%
  group_by(year, month, day) %>%
  mutate(r = min_rank(desc(dep_time))) %>%
  filter(r %in% range(r))

# Counts: sum(!is.na(x)) counts non-missing values, n_distinct(x) counts
# unique values.
# Which destinations have the most carriers?
not_cancelled %>%
  group_by(dest) %>%
  summarise(carriers = n_distinct(carrier)) %>%
  arrange(desc(carriers))

# Counting rows: n() and count().
not_cancelled %>%
  count(dest)

# With wt, count() sums the weight variable instead of counting rows.
not_cancelled %>%
  count(tailnum, wt = distance)

# How many flights left before 5am? (these usually
# indicate delayed flights from the previous day)
not_cancelled %>%
  group_by(year, month, day) %>%
  summarise(n_early = sum(dep_time < 500))

# Logical values can be counted too: sum(x > 10) gives the number of TRUEs and
# mean() of a logical vector gives the proportion of TRUEs.
not_cancelled %>%
  group_by(year, month, day) %>%
  summarise(hour_perc = mean(arr_delay > 60))

按多个变量分组

# Group by several variables; each summarise() call removes the last grouping
# level, so the counts can be rolled up step by step.
daily <- group_by(flights, year, month, day)
(per_day <- summarize(daily, flights = n()))
(per_month <- summarize(per_day, flights = sum(flights)))
(per_year <- summarize(per_month, flights = sum(flights)))

# Mean dep_time per day of month. TRUE is spelled out because T is an ordinary
# variable that can be reassigned, and the result column is named explicitly
# instead of being labelled with the whole expression.
flights %>%
  group_by(day) %>%
  summarize(mean_dep_time = mean(dep_time, na.rm = TRUE))

# Ungrouping: remove the grouping before summarising.
daily %>%
  ungroup() %>% # no longer grouped by date
  summarize(flights = n()) # all flights

3.6.7 练习

(2) 找出另外一种方法,这种方法要可以给出与 not_cancelled %>% count(dest) 和
not_cancelled %>% count(tailnum, wt = distance) 同样的输出(不能使用 count())。

# Flights with both delays recorded. The variable must be spelled
# not_cancelled, because that is the name every call below uses.
not_cancelled <- flights %>%
  filter(!is.na(dep_delay), !is.na(arr_delay))

# Reference output produced with count().
not_cancelled %>% count(dest)
not_cancelled %>% count(tailnum, wt = distance)

# Alternative to count(dest): group first, then take the length of each group.
not_cancelled %>%
  group_by(dest) %>%
  summarise(n = length(dest))

# The same using n().
not_cancelled %>%
  group_by(dest) %>%
  summarise(n = n())

# Alternative to count(tailnum, wt = distance): sum the weight per group.
not_cancelled %>%
  group_by(tailnum) %>%
  summarise(n = sum(distance))

3.7 分组新变量(和筛选器)

# Find the worst members of each group: per day, the rows whose descending
# arr_delay rank is below 10.
flights_sml %>%
  group_by(year, month, day) %>%
  filter(rank(desc(arr_delay)) < 10)

# Find all groups bigger than a threshold: destinations with more than 365 rows.
popular_dests <- flights %>%
  group_by(dest) %>%
  filter(n() > 365)
popular_dests

# Standardise to compute per-group metrics: each positive arrival delay as a
# share of its destination's total positive delay.
popular_dests %>%
  filter(arr_delay > 0) %>%
  mutate(prop_delay = arr_delay / sum(arr_delay)) %>%
  select(year:day, dest, arr_delay, prop_delay)
head(flights)

# Membership is tested with %in%. The pipe %>% would instead call
# c(origin, "IAH"), which is not a logical condition and makes filter() fail.
# IAH is a destination code here (all flights depart from New York City), so
# the test is applied to dest rather than origin.
filter(flights, dest %in% c("IAH"))

阅读推荐:
生信技能树公益视频合辑:学习顺序是linux,r,软件安装,geo,小技巧,ngs组学!
B站链接:https://m.bilibili.com/space/338686099
YouTube链接:https://m.youtube.com/channel/UC67sImqK7V8tSWHMG8azIVA/playlists
生信工程师入门最佳指南:https://mp.weixin.qq.com/s/vaX4ttaLIa19MefD86WfUA

上一篇下一篇

猜你喜欢

热点阅读