R for Data Science

[R语言] lubridate包 时间处理《R for Data Science》

2020-04-27  本文已影响0人  半为花间酒

《R for Data Science》第十六章 Dates and times 啃书知识点积累
参考链接:R for Data Science

Dates and times are hard because they have to reconcile two physical phenomena
(the rotation of the Earth and its orbit around the sun) with a whole raft of
geopolitical phenomena including months, time zones, and DST.

目前已经把R更新到4.0,安装nycflights13出了些麻烦,最后用以下命令

# Install nycflights13 1.0.1 directly from the CRAN source tarball URL.
# repos = NULL makes install.packages() treat the first argument as a
# file/URL instead of a package name to look up in a repository.
install.packages("https://cran.r-project.org/src/contrib/nycflights13_1.0.1.tar.gz", 
                  repos=NULL, method="libcurl")

Creating date/times

A date-time is a date plus a time: it uniquely identifies an instant in time.
Tibbles print this as <dttm>. Elsewhere in R these are called POSIXct.

library(lubridate)
# Current date
today()
#> [1] "2020-04-27"
# Current date-time
now()
#> [1] "2020-04-27 10:11:40 CST"

# The tzone argument of today() controls the time zone
?today()
today("GMT")
today("UTC")

- From strings

# The parser's name gives the order of the components in the input
# (y = year, m = month, d = day); the separators and month spelling may vary.
ymd("2020-04-27")
#> [1] "2020-04-27"
mdy("April 27st, 2020")
#> [1] "2020-04-27"
dmy("27-Apr-2020")
#> [1] "2020-04-27"
mdy("Apr-27-2020")
#> [1] "2020-04-27"

# Date-times (dttm) can be parsed the same way
ymd_hms("2020-04-27 20:11:59")
#> [1] "2020-04-27 20:11:59 UTC"
mdy_hm("04/27/2020 08:01")
#> [1] "2020-04-27 08:01:00 UTC"

# When the vector contains an invalid string, that element becomes NA
# and a warning is raised
ymd(c('20200427','XiChen'))
# [1] "2020-04-27" NA          
# Warning message:
# 1 failed to parse. 
# Unquoted numbers are accepted as well
ymd(20200427)
#> [1] "2020-04-27"
# Supplying tz attaches a time zone to the result
ymd(20200427, tz = "UTC")
#> [1] "2020-04-27 UTC"

- From individual components

To create a date/time from this sort of input, use make_date() for dates, or make_datetime() for date-times

library(tidyverse)
library(nycflights13)
library(lubridate)

# Assemble a date-time column from the separate year/month/day/hour/minute
# columns of flights.
flights %>% 
  select(year, month, day, hour, minute) %>% 
  mutate(departure = make_datetime(year, month, day, hour, minute))
#> # A tibble: 336,776 x 6
#>    year month   day  hour minute departure          
#>   <int> <int> <int> <dbl>  <dbl> <dttm>             
#> 1  2013     1     1     5     15 2013-01-01 05:15:00
#> 2  2013     1     1     5     29 2013-01-01 05:29:00
#> 3  2013     1     1     5     40 2013-01-01 05:40:00
#> 4  2013     1     1     5     45 2013-01-01 05:45:00
#> 5  2013     1     1     6      0 2013-01-01 06:00:00
#> 6  2013     1     1     5     58 2013-01-01 05:58:00
#> # … with 3.368e+05 more rows

# make_date() builds a plain date with no time-of-day part.
# `day` must be passed as well: when it is omitted make_date() falls back to
# its default of 1, so every flight would land on the first of its month.
flights %>% 
  select(year, month, day, hour, minute) %>% 
  mutate(date = make_date(year, month, day))
# Build a date-time from year/month/day plus a clock time stored as a single
# HHMM number (e.g. 517 stands for 05:17): the hundreds give the hour and the
# remainder gives the minute.
make_datetime_100 <- function(year, month, day, time) {
  hh <- time %/% 100
  mm <- time %% 100
  make_datetime(year, month, day, hh, mm)
}

# flights_dt: the flights that have both a recorded departure and a recorded
# arrival, with the four HHMM clock columns turned into real date-times.
# Only origin, dest, the *delay columns and the *time columns are kept.
flights_dt <- local({
  # Drop rows where either recorded time is missing.
  flown <- filter(flights, !is.na(dep_time), !is.na(arr_time))

  # Replace each HHMM column by the matching date-time on the flight's date.
  stamped <- mutate(
    flown,
    dep_time = make_datetime_100(year, month, day, dep_time),
    arr_time = make_datetime_100(year, month, day, arr_time),
    sched_dep_time = make_datetime_100(year, month, day, sched_dep_time),
    sched_arr_time = make_datetime_100(year, month, day, sched_arr_time)
  )

  select(stamped, origin, dest, ends_with("delay"), ends_with("time"))
})

- From other types

# Switch between a date and a date-time
as_datetime(today())
#> [1] "2020-04-27 UTC"
as_date(now())
#> [1] "2020-04-27"
# A plain number is read as an offset from 1970-01-01:
# seconds for as_datetime() ...
as_datetime(60 * 60 * 10)
#> [1] "1970-01-01 10:00:00 UTC"

# ... and days for as_date(); the ten years in between contain two leap years
as_date(365 * 10 + 2)
#> [1] "1980-01-01"

Date-time components

datetime <- ymd_hms("2016-07-08 12:34:56")

# Accessor functions pull out individual components
year(datetime)
#> [1] 2016
month(datetime)
#> [1] 7
# Day of the month
mday(datetime)
#> [1] 8

# Day of the year
yday(datetime)
#> [1] 190
# Day of the week as a number (6 here; the labelled call below shows Friday)
wday(datetime)
#> [1] 6
# label = TRUE returns the abbreviated name as an ordered factor
month(datetime, label = TRUE)
#> [1] Jul
#> 12 Levels: Jan < Feb < Mar < Apr < May < Jun < Jul < Aug < Sep < ... < Dec

# abbr = FALSE gives the full name instead of the abbreviation
wday(datetime, label = TRUE, abbr = FALSE)
#> [1] Friday
#> 7 Levels: Sunday < Monday < Tuesday < Wednesday < Thursday < ... < Saturday


# The label argument of wday() determines the text mapped onto the x axis:
# p1 uses the numeric weekday, p2 the labelled weekday.
p1 <- flights_dt %>% 
  mutate(wday = wday(dep_time)) %>% 
  ggplot(aes(x = wday)) +
  geom_bar()

p2 <- flights_dt %>% 
  mutate(wday = wday(dep_time, label = TRUE)) %>% 
  ggplot(aes(x = wday)) +
  geom_bar()

# NOTE(review): `+` between two ggplot objects presumably relies on the
# patchwork package, which is not loaded in the visible code -- confirm.
p1 + p2
# Actual departure: average arrival delay by minute of the actual departure time
p1 <- flights_dt %>% 
  mutate(minute = minute(dep_time)) %>% 
  group_by(minute) %>% 
  summarise(
    avg_delay = mean(arr_delay, na.rm = TRUE),
    n = n()) %>% 
  ggplot() +
  geom_line(aes(minute, avg_delay))

# Scheduled departure: the same summary by minute of the scheduled departure time
p2 <- flights_dt %>% 
  mutate(minute = minute(sched_dep_time)) %>% 
  group_by(minute) %>% 
  summarise(
    avg_delay = mean(arr_delay, na.rm = TRUE),
    n = n()) %>% 
  ggplot() +
  geom_line(aes(minute, avg_delay))

# NOTE(review): `+` between two ggplot objects presumably relies on the
# patchwork package, which is not loaded in the visible code -- confirm.
p1 + p2

- Rounding

将时间归并到近似单元

# Round today down to the start of the year, then step forward month by month
# to get the first day of every month of the current year
floor_date(today(), unit = "year") + months(0:11)
#>  [1] "2020-01-01" "2020-02-01" "2020-03-01" "2020-04-01" "2020-05-01"
#>  [6] "2020-06-01" "2020-07-01" "2020-08-01" "2020-09-01" "2020-10-01"
#> [11] "2020-11-01" "2020-12-01"
# Number of flights per week: each departure is rounded down to its week
flights_dt %>% 
  count(week = floor_date(dep_time, "week")) %>% 
  ggplot(aes(week, n)) +
    geom_line()

- Setting components

# Components can be set directly, one at a time
(datetime <- ymd_hms("2016-07-08 12:34:56"))
#> [1] "2016-07-08 12:34:56 UTC"

year(datetime) <- 2020
datetime
#> [1] "2020-07-08 12:34:56 UTC"
month(datetime) <- 01
datetime
#> [1] "2020-01-08 12:34:56 UTC"
hour(datetime) <- hour(datetime) + 1
datetime
#> [1] "2020-01-08 13:34:56 UTC"

# update() can set several components in one call
update(datetime, year = 2020, month = 2, mday = 2, hour = 2)
#> [1] "2020-02-02 02:34:56 UTC"

# Values that are too large roll over into the next unit
ymd("2015-02-01") %>% 
  update(mday = 30)
#> [1] "2015-03-02"
ymd("2015-02-01") %>% 
  update(hour = 400)
#> [1] "2015-02-17 16:00:00 UTC"
# Distribution of departure times over the day, one line per month.
# Setting yday = 1 moves every departure onto January 1st, so only the
# time of day still differs between flights.
flights_dt %>%
  filter(!is.na(dep_time)) %>%
  mutate(dep_hour = update(dep_time, yday = 1)) %>%
  mutate(month = factor(month(dep_time))) %>%
  ggplot(aes(dep_hour, color = month)) +
  # after_stat(density) replaces the deprecated ..density.. notation;
  # binwidth is one hour expressed in seconds.
  geom_freqpoly(aes(y = after_stat(density)), binwidth = 60 * 60)

Time spans

- Durations

Durations always record the time span in seconds.

# Subtracting two dates gives a difftime
c_age <- today() - ymd(19941027)
c_age
# Time difference of 9314 days

as.duration(c_age) # convert to seconds first
# [1] "804729600s (~25.5 years)"

# Constructors for durations of a given unit
dseconds(15)
#> [1] "15s"
dminutes(10)
#> [1] "600s (~10 minutes)"
dhours(c(12, 24))
#> [1] "43200s (~12 hours)" "86400s (~1 days)"
ddays(0:5)
#> [1] "0s"                "86400s (~1 days)"  "172800s (~2 days)"
#> [4] "259200s (~3 days)" "345600s (~4 days)" "432000s (~5 days)"
dweeks(3)
#> [1] "1814400s (~3 weeks)"
dyears(1)
#> [1] "31536000s (~52.14 weeks)"

# Durations support arithmetic
2 * dyears(1)
#> [1] "63072000s (~2 years)"
dyears(1) + dweeks(12) + dhours(15)
#> [1] "38847600s (~1.23 years)"

tomorrow <- today() + ddays(1);tomorrow
# [1] "2020-04-28"
# dyears(1) is 31536000s (365 days) above, so this goes back 365 days from
# 2020-04-27; 2020-02-29 lies in between, hence the 28th rather than the 27th.
last_year <- today() - dyears(1)
# [1] "2019-04-28"

- Periods

Periods are time spans but don’t have a fixed length in seconds, instead they work with “human” times, like days and months.

# Durations cannot adjust for a time-zone (DST) change:
# adding one day of seconds across the EST -> EDT switch shifts the clock time
one_pm <- ymd_hms("2016-03-12 13:00:00", tz = "America/New_York")

one_pm
#> [1] "2016-03-12 13:00:00 EST"
one_pm + ddays(1)
#> [1] "2016-03-13 14:00:00 EDT"

# Periods can: the clock time stays at 13:00
one_pm
#> [1] "2016-03-12 13:00:00 EST"
one_pm + days(1)
#> [1] "2016-03-13 13:00:00 EDT"

# Another example
# A leap year
ymd("2016-01-01") + dyears(1)
#> [1] "2016-12-31"
ymd("2016-01-01") + years(1)
#> [1] "2017-01-01"
# Constructors for periods of a given unit
seconds(15)
#> [1] "15S"
minutes(10)
#> [1] "10M 0S"
hours(c(12, 24))
#> [1] "12H 0M 0S" "24H 0M 0S"
days(7)
#> [1] "7d 0H 0M 0S"
months(1:6)
#> [1] "1m 0d 0H 0M 0S" "2m 0d 0H 0M 0S" "3m 0d 0H 0M 0S" "4m 0d 0H 0M 0S"
#> [5] "5m 0d 0H 0M 0S" "6m 0d 0H 0M 0S"
weeks(3)
#> [1] "21d 0H 0M 0S"
years(1)
#> [1] "1y 0m 0d 0H 0M 0S"

# Periods support arithmetic too
10 * (months(6) + days(1))
#> [1] "60m 10d 0H 0M 0S"
days(50) + hours(25) + minutes(2)
#> [1] "50d 25H 2M 0S"

- Intervals

涉及的符号:%--%

# Dividing one period by another is not tied to any particular year
years(1) / days(1)
# [1] 365.25

# An interval built with %--% has fixed start and end dates, so its length
# in days can be computed for this specific span
next_year <- today() + years(1)
(today() %--% next_year) / ddays(1)
# [1] 365

# Integer division: how many whole one-day periods fit into the interval
(today() %--% next_year) %/% days(1)
# [1] 365

- Summary

If you only care about physical time, use a duration;
if you need to add human times, use a period;
if you need to figure out how long a span is in human units, use an interval.

Time zones

用的少,就放两个可能用到的代码

# Time zone name of the current system
Sys.timezone()
# [1] "Asia/Taipei"

# Parse a date-time in the system's time zone rather than the default UTC
ymd_hms("2020-04-27 12:00:00", tz = Sys.timezone())
# [1] "2020-04-27 12:00:00 CST"
上一篇下一篇

猜你喜欢

热点阅读