R for Data Science

[R语言] Functions 函数《R for data sc

2020-04-29  本文已影响0人  半为花间酒

《R for Data Science》第十九章 Functions 啃书知识点积累
参考链接:R for Data Science

When should you write a function?

“do not repeat yourself” (or DRY) principle
适用于超过两次的重复代码操作

df <- tibble::tibble(
  a = rnorm(10),
  b = rnorm(10),
  c = rnorm(10),
  d = rnorm(10)
)

df$a <- (df$a - min(df$a, na.rm = TRUE)) / 
  (max(df$a, na.rm = TRUE) - min(df$a, na.rm = TRUE))
df$b <- (df$b - min(df$b, na.rm = TRUE)) / 
  (max(df$b, na.rm = TRUE) - min(df$a, na.rm = TRUE))
df$c <- (df$c - min(df$c, na.rm = TRUE)) / 
  (max(df$c, na.rm = TRUE) - min(df$c, na.rm = TRUE))
df$d <- (df$d - min(df$d, na.rm = TRUE)) / 
  (max(df$d, na.rm = TRUE) - min(df$d, na.rm = TRUE))
range(c(1,3,5))
# [1] 1 5
range(c(11,32,25))
# [1] 11 32
# 上述代码可以简化为
rescale01 <- function(x) {
  rng <- range(x, na.rm = TRUE)
  (x - rng[1]) / (rng[2] - rng[1])
}

df$a <- rescale01(df$a)
df$b <- rescale01(df$b)
df$c <- rescale01(df$c)
df$d <- rescale01(df$d)

# 实际上课本中的代码可以进一步简化为
apply(df, 2, rescale01)
rescale01(x)
# [1]   0   0   0   0   0   0   0   0   0   0 NaN

# 故修改函数体
rescale01 <- function(x) {
  rng <- range(x, na.rm = TRUE, finite = TRUE)
  (x - rng[1]) / (rng[2] - rng[1])
}
rescale01(x)
# [1] 0.0000000 0.1111111 0.2222222 0.3333333 0.4444444 0.5555556 0.6666667
# [8] 0.7777778 0.8888889 1.0000000       Inf
rescale1 <- function(x, na.rm) {
  rng <- range(x, na.rm = na.rm)
  (x - rng[1]) / (rng[2] - rng[1])
}
rescale1(c(NA, 1:5), na.rm = FALSE)
#> [1] NA NA NA NA NA NA
rescale1(c(NA, 1:5), na.rm = TRUE)
#> [1]   NA 0.00 0.25 0.50 0.75 1.00


# 加上finite后na.rm的设置不重要了
rescale2 <- function(x, na.rm) {
  rng <- range(x, na.rm = na.rm, finite = TRUE)
  (x - rng[1]) / (rng[2] - rng[1])
}

rescale2(c(NA, 1:5), na.rm = FALSE)
#> [1]   NA 0.00 0.25 0.50 0.75 1.00
rescale2(c(NA, 1:5), na.rm = TRUE)
#> [1]   NA 0.00 0.25 0.50 0.75 1.00

原因:The option finite = TRUE to range() will drop all non-finite elements, and NA is a non-finite element.

另外可以注意,无限值可以相互比较

-Inf == -Inf
# [1] TRUE
# 判断path是否是文件夹
is_directory <- function(x) file.info(x)$isdir

# 判断文件是否可读(存在且有权限打开)
is_readable <- function(x) file.access(x, 4) == 0

- 函数设置关键字变量默认值即可空值调用

greet <- function(time = lubridate::now()) {
  hr <- lubridate::hour(time)
  # I don't know what to do about times after midnight,
  # are they evening or morning?
  if (hr < 12) {
    print("good morning")
  } else if (hr < 17) {
    print("good afternoon")
  } else {
    print("good evening")
  }
}

greet()

Functions are for humans and computers

ctrl + shift + R

# - -----------------------------------------------------------------------

Conditional execution

if (condition) {
  # code executed when condition is TRUE
} else {
  # code executed when condition is FALSE
}

- Conditions

The condition must evaluate to either TRUE or FALSE.
If it’s a vector, you’ll get a warning message; if it’s an NA, you’ll get an error.

if (c(TRUE, FALSE)) {}
#> Warning in if (c(TRUE, FALSE)) {: the condition has length > 1 and only the
#> first element will be used
#> NULL

if (NA) {}
#> Error in if (NA) {: missing value where TRUE/FALSE needed

用于逻辑表达式的判断,不是向量化操作符|&

As soon as || sees the first TRUE it returns TRUE without computing anything else. As soon as && sees the first FALSE it returns FALSE.

# identical是严格检测类型,返回单个逻辑判断
identical(0L, 0)
#> [1] FALSE

- Multiple conditions

if (this) {
  # do that
} else if (that) {
  # do something else
} else {
  # 
}
multi_op <- function(x, y, op) {
 switch(op,
   plus = x + y,
   minus = x - y,
   times = x * y,
   divide = x / y,
   stop("Unknown op!")
 )
}

switch的其他用法

# 如果是整数则按索引返回
switch(1, "apple", "banana", "cantaloupe")
#> [1] "apple"
switch(2, "apple", "banana", "cantaloupe")
#> [1] "banana"

# 如果是非整数则忽略小数部分
switch(1.2, "apple", "banana", "cantaloupe")
#> [1] "apple"
switch(2.8, "apple", "banana", "cantaloupe")
#> [1] "banana"

switch涉及缺失值和无表达式的情况

switcheroo <- function(x) {
  switch(x,
         a = ,
         b = "ab",
         c = NA,
         d = "cd"
  )
}

# a涉及的表达式为空,则轮次到下一个非空表达式共享给a
switcheroo("a")
#> [1] "ab"
switcheroo("b")
#> [1] "ab"
switcheroo("c")
#> [1] NA
switcheroo("d")
#> [1] "cd"
switcheroo("e")

- Code style

  1. An opening curly brace should never go on its own line and should always be followed by a new line.
  2. A closing curly brace should always go on its own line, unless it’s followed by else.
  3. Always indent the code inside curly braces.
  4. It’s ok to drop the curly braces if you have a very short if statement that can fit on one line.
y <- 10
x <- if (y < 20) "Too low" else "Too high"

Notice that when you call a function, you should place a space around = in function calls, and always put a space after a comma, not before (just like in regular English). Using whitespace makes it easier to skim the function for the important components.

# Good
average <- mean(feet / 12 + inches, na.rm = TRUE)

- “Fizz Buzz” 问题的几种解法

课本给出的这一解法个人认为不应该用短路&&
否则一旦!(x %% 3)为假时短路判断
下一个else if跟得依然是!(x %% 3),直接调转下一个
实际是多了一步无效代码

应该为:

fizzbuzz <- function(x) {
  # these two lines check that x is a valid input
  stopifnot(length(x) == 1)
  stopifnot(is.numeric(x))
  if (!(x %% 3) & !(x %% 5)) {
    "fizzbuzz"
  } else if (!(x %% 3)) {
    "fizz"
  } else if (!(x %% 5)) {
    "buzz"
  } else {
    # ensure that the function returns a character vector
    as.character(x)
  }
}


# 第二种解法用嵌套判断增加科学性
fizzbuzz2 <- function(x) {
  # these two lines check that x is a valid input
  stopifnot(length(x) == 1)
  stopifnot(is.numeric(x))
  if (!(x %% 3)) {
    if (!(x %% 5)) {
      "fizzbuzz"
    } else {
      "fizz"
    }
  } else if (!(x %% 5)) {
    "buzz"
  } else {
    # ensure that the function returns a character vector
    as.character(x)
  }
}


# 第三种解法是经典case_when,向量化
fizzbuzz_vec <- function(x) {
  case_when(
    !(x %% 3) & !(x %% 5) ~ "fizzbuzz",
    !(x %% 3) ~ "fizz",
    !(x %% 5) ~ "buzz",
    TRUE ~ as.character(x)
  )
}


# 第四种解法是利用向量完成向量化
fizzbuzz_vec2 <- function(x) {
  y <- as.character(x)
  # put the individual cases first - any elements divisible by both 3 and 5
  # will be overwritten with fizzbuzz later
  y[!(x %% 3)] <- "fizz"
  y[!(x %% 3)] <- "buzz"
  y[!(x %% 3) & !(x %% 5)] <- "fizzbuzz"
  y
}

- cut函数适用于有序判定

if (temp <= 0) {
  "freezing"
} else if (temp <= 10) {
  "cold"
} else if (temp <= 20) {
  "cool"
} else if (temp <= 30) {
  "warm"
} else {
  "hot"
}

# 可简化为(右闭)
temp <- seq(-10, 50, by = 5)
cut(temp, c(-Inf, 0, 10, 20, 30, Inf),
    right = TRUE,
    labels = c("freezing", "cold", "cool", "warm", "hot")
)
#>  [1] freezing freezing freezing cold     cold     cool     cool    
#>  [8] warm     warm     hot      hot      hot      hot     
#> Levels: freezing cold cool warm hot

# 也可以改成右开,即小于号
temp <- seq(-10, 50, by = 5)
cut(temp, c(-Inf, 0, 10, 20, 30, Inf),
    right = FALSE,
    labels = c("freezing", "cold", "cool", "warm", "hot")
)
#>  [1] freezing freezing cold     cold     cool     cool     warm    
#>  [8] warm     hot      hot      hot      hot      hot     
#> Levels: freezing cold cool warm hot

Function arguments

The default value should almost always be the most common value.
The few exceptions to this rule are to do with safety. For example, it makes sense for na.rm to default to FALSE because missing values are important. Even though na.rm = TRUE is what you usually put in your code, it’s a bad idea to silently ignore missing values by default.

- Choosing names

- Checking values

wt_mean <- function(x, w) {
  sum(x * w) / sum(w)
}
wt_var <- function(x, w) {
  mu <- wt_mean(x, w)
  sum(w * (x - mu) ^ 2) / sum(w)
}
wt_sd <- function(x, w) {
  sqrt(wt_var(x, w))
}

# 这里权重和数据长度不同,但由于循环补齐依然能运行
wt_mean(1:6, 1:3)
#> [1] 7.67

It’s good practice to check important preconditions, and throw an error (with stop())

wt_mean <- function(x, w) {
  if (length(x) != length(w)) {
    stop("`x` and `w` must be the same length", call. = FALSE)
  }
  sum(w * x) / sum(w)
}

# 如果进一步完善代码会变得更复杂
wt_mean <- function(x, w, na.rm = FALSE) {
  if (!is.logical(na.rm)) {
    stop("`na.rm` must be logical")
  }
  if (length(na.rm) != 1) {
    stop("`na.rm` must be length 1")
  }
  if (length(x) != length(w)) {
    stop("`x` and `w` must be the same length", call. = FALSE)
  }

  if (na.rm) {
    miss <- is.na(x) | is.na(w)
    x <- x[!miss]
    w <- w[!miss]
  }
  sum(w * x) / sum(w)
}


# 可以使用stopifnot函数,断言需要为真表达式
wt_mean <- function(x, w, na.rm = FALSE) {
  stopifnot(is.logical(na.rm), length(na.rm) == 1)
  stopifnot(length(x) == length(w))

  if (na.rm) {
    miss <- is.na(x) | is.na(w)
    x <- x[!miss]
    w <- w[!miss]
  }
  sum(w * x) / sum(w)
}
wt_mean(1:6, 6:1, na.rm = "foo")
#> Error in wt_mean(1:6, 6:1, na.rm = "foo"): is.logical(na.rm) is not TRUE

- Dot-dot-dot (…)

有些函数允许任意数量输入:

sum(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
#> [1] 55
stringr::str_c("a", "b", "c", "d", "e", "f")
#> [1] "abcdef"

They rely on a special argument: ... (pronounced dot-dot-dot). This special argument captures any number of arguments that aren’t otherwise matched.

commas <- function(...) stringr::str_c(..., collapse = ", ")
commas(letters[1:10])
#> [1] "a, b, c, d, e, f, g, h, i, j"

rule <- function(..., pad = "-") {
  title <- paste0(...)
  width <- getOption("width") - nchar(title) - 5
  cat(title, " ", stringr::str_dup(pad, width), "\n", sep = "")
}
rule("Important output")
#> Important output -----------------------------------------------------------

...的代价:关键字参数拼错不会报错,答案可能出错

sum(x, na.rm = TRUE)
# [1] 3
sum(x, na.mr = TRUE)
# [1] 4

Return values

complicated_function <- function(x, y, z) {
  if (length(x) == 0 || length(y) == 0) {
    return(0)
  }

  # Complicated code here
}

Environment

lexical scoping

f <- function(x) {
  x + y
} 

y <- 100
f(10)
#> [1] 110

y <- 1000
f(10)
#> [1] 1010
上一篇 下一篇

猜你喜欢

热点阅读