[R语言] lubridate包 时间处理《R for Data Science》
《R for Data Science》第十六章 Dates and times 啃书知识点积累
参考链接:R for Data Science
Dates and times are hard because they have to reconcile two physical phenomena
(the rotation of the Earth and its orbit around the sun) with a whole raft of geopolitical phenomena including months, time zones, and DST.
目前已经把R更新到4.0,安装nycflights13时出了些麻烦,最后用以下命令解决:
# Install nycflights13 1.0.1 straight from its CRAN source tarball.
# repos = NULL makes install.packages() treat the first argument as a
# file/URL instead of a package name to look up in a repository.
install.packages("https://cran.r-project.org/src/contrib/nycflights13_1.0.1.tar.gz",
repos=NULL, method="libcurl")
Creating date/times
A date-time is a date plus a time: it uniquely identifies an instant in time.
Tibbles print this as <dttm>. Elsewhere in R these are called POSIXct.
library(lubridate)
# today() gives the current date.
today()
#> [1] "2020-04-27"
# now() gives the current date-time, printed with the local time zone.
now()
#> [1] "2020-04-27 10:11:40 CST"
# The tzone argument of today() controls the time zone used for "today".
?today()
today("GMT")
today("UTC")
- From strings
# Choose the parser whose name lists year (y), month (m) and day (d) in the
# same order as they appear in the string.
ymd("2020-04-27")
#> [1] "2020-04-27"
# The ordinal suffix is not validated: "27st" still parses to the 27th.
mdy("April 27st, 2020")
#> [1] "2020-04-27"
dmy("27-Apr-2020")
#> [1] "2020-04-27"
mdy("Apr-27-2020")
#> [1] "2020-04-27"
# Date-time strings work as well: append _hms or _hm to the parser name.
ymd_hms("2020-04-27 20:11:59")
#> [1] "2020-04-27 20:11:59 UTC"
mdy_hm("04/27/2020 08:01")
#> [1] "2020-04-27 08:01:00 UTC"
# When the vector contains a string that cannot be parsed, that element
# becomes NA and a warning reports how many failed.
ymd(c('20200427','XiChen'))
#> [1] "2020-04-27" NA
#> Warning message:
#> 1 failed to parse.
- These functions also take unquoted numbers
# The same parsers accept a bare number written as YYYYMMDD.
ymd(20200427)
#> [1] "2020-04-27"
# Supplying tz makes the result print with that time zone.
ymd(20200427, tz = "UTC")
#> [1] "2020-04-27 UTC"
- From individual components
To create a date/time from this sort of input, use
make_date() for dates, or make_datetime() for date-times.
library(tidyverse)
library(nycflights13)
library(lubridate)
# Assemble a single date-time column from the separate calendar and clock
# columns of `flights`.
flights %>%
  select(year, month, day, hour, minute) %>%
  mutate(
    departure = make_datetime(
      year = year, month = month, day = day,
      hour = hour, min = minute
    )
  )
#> # A tibble: 336,776 x 6
#> year month day hour minute departure
#> <int> <int> <int> <dbl> <dbl> <dttm>
#> 1 2013 1 1 5 15 2013-01-01 05:15:00
#> 2 2013 1 1 5 29 2013-01-01 05:29:00
#> 3 2013 1 1 5 40 2013-01-01 05:40:00
#> 4 2013 1 1 5 45 2013-01-01 05:45:00
#> 5 2013 1 1 6 0 2013-01-01 06:00:00
#> 6 2013 1 1 5 58 2013-01-01 05:58:00
#> # … with 3.368e+05 more rows
# 也可以用make_date不包含time
# make_date() builds a Date (no time of day) from the same components.
# `day` has to be passed explicitly: make_date() defaults it to 1, which
# would place every flight on the first day of its month.
flights %>%
  select(year, month, day, hour, minute) %>%
  mutate(date = make_date(year, month, day))
- 自建函数配合 make_*() 函数解析日期时间
#' Build a date-time from calendar parts plus an HHMM-encoded clock time
#'
#' @param year,month,day Calendar components, forwarded to `make_datetime()`.
#' @param time Clock time packed into one number: everything above the last
#'   two digits is the hour, the last two digits are the minute
#'   (e.g. 517 means 05:17).
#' @return The value of `make_datetime()` for those components.
make_datetime_100 <- function(year, month, day, time) {
  hour_part <- time %/% 100
  minute_part <- time %% 100
  make_datetime(year, month, day, hour = hour_part, min = minute_part)
}
# Rebuild the four HHMM-encoded clock columns of `flights` as date-times.
# Rows with a missing dep_time or arr_time are dropped first; only the
# origin/destination, the *delay columns and the *time columns are kept.
flights_dt <- flights %>%
  filter(!is.na(dep_time) & !is.na(arr_time)) %>%
  mutate(
    dep_time = make_datetime_100(year, month, day, time = dep_time),
    arr_time = make_datetime_100(year, month, day, time = arr_time),
    sched_dep_time = make_datetime_100(year, month, day, time = sched_dep_time),
    sched_arr_time = make_datetime_100(year, month, day, time = sched_arr_time)
  ) %>%
  select(origin, dest, ends_with("delay"), ends_with("time"))
- From other types
- date和dttm互换
# as_datetime() turns a Date into a date-time (printed with a UTC zone).
as_datetime(today())
#> [1] "2020-04-27 UTC"
# as_date() drops the time of day from a date-time.
as_date(now())
#> [1] "2020-04-27"
- “Unix Epoch” 基于1970-01-01
# A bare number given to as_datetime() is read as seconds since 1970-01-01.
as_datetime(60 * 60 * 10)
#> [1] "1970-01-01 10:00:00 UTC"
# A bare number given to as_date() is read as days since 1970-01-01;
# the + 2 accounts for the two leap years inside that ten-year span.
as_date(365 * 10 + 2)
#> [1] "1980-01-01"
Date-time components
# Accessor functions pull individual components out of a date-time.
datetime <- ymd_hms("2016-07-08 12:34:56")
year(datetime)
#> [1] 2016
month(datetime)
#> [1] 7
# mday(): day of the month
mday(datetime)
#> [1] 8
# yday(): day of the year
yday(datetime)
#> [1] 190
# wday(): day of the week as a number (6 here; this date is a Friday)
wday(datetime)
#> [1] 6
- 可以设置具体参数优化提取
# label = TRUE returns the abbreviated name as an ordered factor.
month(datetime, label = TRUE)
#> [1] Jul
#> 12 Levels: Jan < Feb < Mar < Apr < May < Jun < Jul < Aug < Sep < ... < Dec
# abbr = FALSE switches from the abbreviation to the full name.
wday(datetime, label = TRUE, abbr = FALSE)
#> [1] Friday
#> 7 Levels: Sunday < Monday < Tuesday < Wednesday < Thursday < ... < Saturday
# The `label` argument of wday() decides what is mapped to the x axis:
# day numbers without it, weekday names with label = TRUE.
p1 <- flights_dt %>%
  mutate(wday = wday(dep_time, label = FALSE)) %>%
  ggplot(mapping = aes(x = wday)) +
  geom_bar()

p2 <- flights_dt %>%
  mutate(wday = wday(dep_time, label = TRUE)) %>%
  ggplot(mapping = aes(x = wday)) +
  geom_bar()

# NOTE(review): adding two ggplot objects with `+` relies on the patchwork
# package being attached; it is not loaded anywhere in the visible code.
p1 + p2
- 一个类似于“幸存者偏差”的案例
# Average arrival delay grouped by the minute of the ACTUAL departure time.
p1 <- flights_dt %>%
  group_by(minute = minute(dep_time)) %>%
  summarise(
    avg_delay = mean(arr_delay, na.rm = TRUE),
    n = n()
  ) %>%
  ggplot(aes(x = minute, y = avg_delay)) +
  geom_line()

# Same summary grouped by the minute of the SCHEDULED departure time.
p2 <- flights_dt %>%
  group_by(minute = minute(sched_dep_time)) %>%
  summarise(
    avg_delay = mean(arr_delay, na.rm = TRUE),
    n = n()
  ) %>%
  ggplot(aes(x = minute, y = avg_delay)) +
  geom_line()

# NOTE(review): adding two ggplot objects with `+` relies on the patchwork
# package being attached; it is not loaded anywhere in the visible code.
p1 + p2
- Rounding
将时间归并到近似单元:floor_date()、round_date()、ceiling_date()(需要指定unit)
# Floor today's date to the start of its year, then add 0..11 months to get
# the first day of every month of that year.
# Output shown for today() == "2020-04-27", matching the today() output earlier in these notes.
floor_date(today(), unit = "year") + months(0:11)
#> [1] "2020-01-01" "2020-02-01" "2020-03-01" "2020-04-01" "2020-05-01"
#> [6] "2020-06-01" "2020-07-01" "2020-08-01" "2020-09-01" "2020-10-01"
#> [11] "2020-11-01" "2020-12-01"
# Number of departures per week: each dep_time is floored to the start of
# its week, the weeks are counted, and the counts are drawn as a line.
flights_dt %>%
  count(week = floor_date(dep_time, unit = "week")) %>%
  ggplot(mapping = aes(x = week, y = n)) +
  geom_line()
- Setting components
# Components can be set one at a time by assigning to the accessor.
(datetime <- ymd_hms("2016-07-08 12:34:56"))
#> [1] "2016-07-08 12:34:56 UTC"
year(datetime) <- 2020
datetime
#> [1] "2020-07-08 12:34:56 UTC"
month(datetime) <- 01
datetime
#> [1] "2020-01-08 12:34:56 UTC"
hour(datetime) <- hour(datetime) + 1
datetime
#> [1] "2020-01-08 13:34:56 UTC"
# update() sets several components in a single call.
update(datetime, year = 2020, month = 2, mday = 2, hour = 2)
#> [1] "2020-02-02 02:34:56 UTC"
# Values that are too large roll over into the next unit.
ymd("2015-02-01") %>%
update(mday = 30)
#> [1] "2015-03-02"
ymd("2015-02-01") %>%
update(hour = 400)
#> [1] "2015-02-17 16:00:00 UTC"
- Q: How does the distribution of flight times within a day change over the course of the year?
# Move every departure onto day 1 of its year so that only the time of day
# differs, then compare the hourly density of departures month by month.
flights_dt %>%
  filter(!is.na(dep_time)) %>%
  mutate(
    dep_hour = update(dep_time, yday = 1),
    month = factor(month(dep_time))
  ) %>%
  # binwidth is in seconds: 60 * 60 is one hour per bin.
  # NOTE(review): `..density..` is the older dot-dot notation for computed
  # stats; check whether the installed ggplot2 prefers after_stat(density).
  ggplot(mapping = aes(x = dep_hour, colour = month)) +
  geom_freqpoly(mapping = aes(y = ..density..), binwidth = 60 * 60)
Time spans
- Durations
Durations always record the time span in seconds.
# Subtracting two dates gives a time difference, printed in days here.
c_age <- today() - ymd(19941027)
c_age
#> Time difference of 9314 days
as.duration(c_age) # convert to a duration, which is expressed in seconds
#> [1] "804729600s (~25.5 years)"
# Duration constructors are prefixed with "d".
dseconds(15)
#> [1] "15s"
dminutes(10)
#> [1] "600s (~10 minutes)"
dhours(c(12, 24))
#> [1] "43200s (~12 hours)" "86400s (~1 days)"
ddays(0:5)
#> [1] "0s" "86400s (~1 days)" "172800s (~2 days)"
#> [4] "259200s (~3 days)" "345600s (~4 days)" "432000s (~5 days)"
dweeks(3)
#> [1] "1814400s (~3 weeks)"
dyears(1)
#> [1] "31536000s (~52.14 weeks)"
# Durations support arithmetic.
2 * dyears(1)
#> [1] "63072000s (~2 years)"
dyears(1) + dweeks(12) + dhours(15)
#> [1] "38847600s (~1.23 years)"
tomorrow <- today() + ddays(1);tomorrow
#> [1] "2020-04-28"
last_year <- today() - dyears(1)
# The assignment prints nothing. With dyears(1) equal to 31536000s (365 days,
# as printed above) and today() == "2020-04-27", last_year is "2019-04-28":
# the span crosses 2020-02-29, so it lands one day after the same calendar date.
- Periods
Periods are time spans but don’t have a fixed length in seconds, instead they work with “human” times, like days and months.
# Durations无法根据时区调整
# Durations do not adjust for time-zone changes: adding one day of seconds
# across the EST -> EDT switch shifts the clock time by an hour.
one_pm <- ymd_hms("2016-03-12 13:00:00", tz = "America/New_York")
one_pm
#> [1] "2016-03-12 13:00:00 EST"
one_pm + ddays(1)
#> [1] "2016-03-13 14:00:00 EDT"
# Periods do: adding days(1) keeps the same clock time on the next day.
one_pm
#> [1] "2016-03-12 13:00:00 EST"
one_pm + days(1)
#> [1] "2016-03-13 13:00:00 EDT"
# Another example: 2016 is a leap year, so a fixed-length duration year
# falls one day short of the next January 1st, while a period year does not.
ymd("2016-01-01") + dyears(1)
#> [1] "2016-12-31"
ymd("2016-01-01") + years(1)
#> [1] "2017-01-01"
- 多数时候periods和durations用法类似
但解析的是“human units”而不是durations中的秒
# Period constructors have the plain unit names (no "d" prefix) and print
# in human units rather than seconds.
seconds(15)
#> [1] "15S"
minutes(10)
#> [1] "10M 0S"
hours(c(12, 24))
#> [1] "12H 0M 0S" "24H 0M 0S"
days(7)
#> [1] "7d 0H 0M 0S"
months(1:6)
#> [1] "1m 0d 0H 0M 0S" "2m 0d 0H 0M 0S" "3m 0d 0H 0M 0S" "4m 0d 0H 0M 0S"
#> [5] "5m 0d 0H 0M 0S" "6m 0d 0H 0M 0S"
weeks(3)
#> [1] "21d 0H 0M 0S"
years(1)
#> [1] "1y 0m 0d 0H 0M 0S"
# Periods support arithmetic too; units are kept separate, not normalised
# (note the 25H in the last result).
10 * (months(6) + days(1))
#> [1] "60m 10d 0H 0M 0S"
days(50) + hours(25) + minutes(2)
#> [1] "50d 25H 2M 0S"
- Intervals
涉及的符号:%--%(用起止两个时间点构造interval)
# Dividing one period by another yields an average-based figure (365.25),
# not the length of any particular year.
years(1) / days(1)
#> [1] 365.25
next_year <- today() + years(1)
# An interval built with %--% has concrete end points, so dividing it by a
# one-day duration gives the number of days in this specific span.
(today() %--% next_year) / ddays(1)
#> [1] 365
# Integer division by a period counts the whole periods inside the interval.
(today() %--% next_year) %/% days(1)
#> [1] 365
- Summary
If you only care about physical time, use a duration;
if you need to add human times, use a period;
if you need to figure out how long a span is in human units, use an interval.
Time zones
用的少,就放两个可能用到的代码
# Name of the time zone R reports for this machine.
Sys.timezone()
#> [1] "Asia/Taipei"
# Parse a date-time string in the local zone instead of the UTC default.
ymd_hms("2020-04-27 12:00:00", tz = Sys.timezone())
#> [1] "2020-04-27 12:00:00 CST"