数据科学与R语言

10. 非标准计算

2020-04-28  本文已影响0人  kkkkkkang

表达式获取substitute(),它一般和deparse()一起使用,因为deparse()以substitute()的结果为参数,并把它转换成字符向量

#substitute()返回的是一个表达式
> f <- function(x) {
+     substitute(x)
+ }
> y <- 13
> f(x + y^2)
x + y^2
#deparse()返回的是一个字符向量,就是用""括起来了
> g <- function(x) deparse(substitute(x))
> g(1:10)
[1] "1:10"
> #> [1] "1:10"
> g(x)
[1] "x"
> #> [1] "x"
> g(x + y^2)
[1] "x + y^2"
#这个很常用,比如:
library(ggplot2)
# 等于
library("ggplot2")
#比如:
> g <- function(x) deparse(substitute(x))
> g(a + b + c + d + e + f + g + h + i + j + k + l + m +
+       +       n + o + p + q + r + s + t + u + v + w + x + y + z)
[1] "a + b + c + d + e + f + g + h + i + j + k + l + m + +n + o + "
[2] "    p + q + r + s + t + u + v + w + x + y + z"  
#怎么让它输出一个字符串?
#deparse()中有个参数——width.cutoff,取值[20:500],默认60L。其中数值大小代表的字节数。一个英文字母、符号和空格都代表一个字节。
#所以算一下,上面那一长串共有101字节,设置width.cutoff=101L就行了
> nchar("a + b + c + d + e + f + g + h + i + j + k + l + m + n + o + p + q + r + s + t + u + v + w + x + y + z")
[1] 101
> g <- function(x) deparse(substitute(x),width.cutoff = 101L)
> g(a + b + c + d + e + f + g + h + i + j + k + l + m +
+       +       n + o + p + q + r + s + t + u + v + w + x + y + z)
[1] "a + b + c + d + e + f + g + h + i + j + k + l + m + +n + o + p + q + r + s + t + u + v + w + x + y + z"
#但是,我改为width.cutoff=100L也是同样的输出。改为width.cutoff=99L,就是这样了。。。
> g <- function(x) deparse(substitute(x),width.cutoff = 99L)
> g(a + b + c + d + e + f + g + h + i + j + k + l + m +
+       +       n + o + p + q + r + s + t + u + v + w + x + y + z)
[1] "a + b + c + d + e + f + g + h + i + j + k + l + m + +n + o + p + q + r + s + t + u + v + w + x + y + "
[2] "    z"
#懂的小伙伴,望指教
#来先看看直接使用f得到的结果是什么
> f <- function(x) substitute(x)
> g <- function(x) deparse(f(x))
> g(1:10)
[1] "x"
> g(x)
[1] "x"
> g(x + y ^ 2 / z + exp(a * sin(b)))
[1] "x"
#其实很简单g在定义的时候f(x)已经有值了——“x”
#我决定这样实现我们想要的功能——直接利用f实现g <- function(x) deparse(substitute(x))同等功能的函数
> f <- function(...) substitute(...)
> g <- function(...) deparse(f(...))
> g(1:10)
[1] "1:10"
> g(x)
[1] "x"
> g(x + y ^ 2 / z + exp(a * sin(b)))
[1] "x + y^2/z + exp(a * sin(b))"

子集选取中的非标准计算:这部分主要是介绍eval()和quote()这俩函数

#先看subset()的魔力在哪里
sample_df <- data.frame(a = 1:5, b = 5:1, c = c(5, 3, 1, 4, 1))

subset(sample_df, a >= 4)
#>   a b c
#> 4 4 2 4
#> 5 5 1 1
# equivalent to:
# sample_df[sample_df$a >= 4, ]

subset(sample_df, b == c)
#>   a b c
#> 1 1 5 5
#> 5 5 1 1
# equivalent to:
# sample_df[sample_df$b == sample_df$c, ]

#quote()登场——它其实和deparse(substitute())功能一样,除了返回的结果类型不一样,但这不影响计算使用
quote(1:10)
#> 1:10
quote(x)
#> x
quote(x + y^2)
#> x + y^2

#接下来到eval()了,它计算表达式的值。它和quote()相反,每一层eval()剥去一层quote()
eval(quote(x <- 1))
eval(quote(x))
#> [1] 1

eval(quote(y))
#> Error in eval(quote(y)): object 'y' not found
quote(2 + 2)
#> 2 + 2
eval(quote(2 + 2))
#> [1] 4

quote(quote(2 + 2))
#> quote(2 + 2)
eval(quote(quote(2 + 2)))
#> 2 + 2
eval(eval(quote(quote(2 + 2))))
#> [1] 4

#eval()第二个参数是环境
eval(quote(x), list(x = 30))
#> [1] 30
eval(quote(x), data.frame(x = 40))
#> [1] 40
eval(quote(a >= 4), sample_df)
#> [1] FALSE FALSE FALSE  TRUE  TRUE
eval(quote(b == c), sample_df)
#> [1]  TRUE FALSE FALSE FALSE  TRUE
subset2 <- function(x, condition) {
  condition_call <- substitute(condition)
  r <- eval(condition_call, x)
  x[r, ]
}
subset2(sample_df, a >= 4)
#>   a b c
#> 4 4 2 4
#> 5 5 1 1
#这,就是,,,,答案
eval(quote(eval(quote(eval(quote(2 + 2))))))
#原先定义的subset2
> subset2 <- function(x, condition) {
+     condition_call <- substitute(condition)
+     r <- eval(condition_call, x)
+     x[r, ]
+ }
> sample_df2 <- data.frame(x = 1:10)
> subset2(sample_df2, x > 8)
[1]  9 10
> class(subset2(sample_df2, x > 8))
[1] "integer"
#返回结果是整数类型,哈!
#那,我自己改一下呗
subset2 <- function(x, condition) {
  condition_call <- substitute(condition)
  r <- eval(condition_call, x)
  if (ncol(x)==1){
    s <- data.frame(x[r,])
    colnames(s) <- colnames(x)
    s
  }
  else x[r, ]
}
> subset2(sample_df2, x > 8)
   x
1  9
2 10
> class(subset2(sample_df2, x > 8))
[1] "data.frame"
#这就对了

#其实还可以更简单,直接在子集选取时加上drop = FALSE就行了
> subset2 <- function(x, condition) {
+   condition_call <- substitute(condition)
+   r <- eval(condition_call, x)
+   x[r,,drop = FALSE ]
+ }
> subset2(sample_df2,x>6)
    x
7   7
8   8
9   9
10 10
> class(subset2(sample_df2,x>6))
[1] "data.frame"
#我靠,原来别人造好了轮子...动动脑子免得老年痴呆(害,我就是笨蛋)
#这个题,大神出题时就给我们答案了,仔细品一下还是有收获的
select <- function(df, vars) {
  vars <- substitute(vars)
  var_pos <- setNames(as.list(seq_along(df)), names(df))
  pos <- eval(vars, var_pos)
  df[, pos, drop = FALSE]
}
select(mtcars, -cyl)
#本质就是把数据框列名和索引值对应上,可以用[]直接选取

作用域问题

#这个地方我看了很久才理解,先看例子,找问题
y <- 4
x <- 4
condition <- 4
condition_call <- 4

subset2(sample_df, a == 4)
#>   a b c
#> 4 4 2 4
subset2(sample_df, a == y)
#>   a b c
#> 4 4 2 4
subset2(sample_df, a == x)
#>       a  b  c
#> 1     1  5  5
#> 2     2  4  3
#> 3     3  3  1
#> 4     4  2  4
#> 5     5  1  1
#> NA   NA NA NA
#> NA.1 NA NA NA
subset2(sample_df, a == condition)
#> Error in eval(condition_call, x): object 'a' not found
subset2(sample_df, a == condition_call)
#> Warning in a == condition_call: longer object length is not a multiple of
#> shorter object length
#> [1] a b c
#> <0 rows> (or 0-length row.names)
#问题在于eval()在数据框(第二个参数)中如果找不到变量,它就会到subset2的环境中去找。很显然我们希望它去数据框的父环境去找
#eval()的第三个参数enclos可以为没有父环境(如列表和数据框)的对象设置一个父参数环境
#如果在env中没有找到相应的对象,就去enclos给出的环境中查找,如果在env中找到就忽略enclos
#稍作修改,重新定义subset2
subset2 <- function(x, condition) {
  condition_call <- substitute(condition)
  r <- eval(condition_call, x, parent.frame())
  x[r, ]
}

x <- 4
subset2(sample_df, a == x)
#>   a b c
#> 4 4 2 4
#好了
#list2env()同样可以实现
subset2a <- function(x, condition) {
  condition_call <- substitute(condition)
  env <- list2env(x, parent = parent.frame())
  r <- eval(condition_call, env)
  x[r, ]
}

x <- 5
subset2a(sample_df, a == x)
#>   a b c
#> 5 5 1 1
#先看一下arrange的源码
> library(plyr)
> arrange
function (df, ...) 
{
    stopifnot(is.data.frame(df))
    ord <- eval(substitute(order(...)), df, parent.frame())
    if (length(ord) != nrow(df)) {
        stop("Length of ordering vectors don't match data frame size", 
            call. = FALSE)
    }
    unrowname(df[ord, , drop = FALSE])
}
<bytecode: 0x00000242fb8fb350>
<environment: namespace:plyr>
#核心函数order(),它返回按大小升序排列后的元素在原向量中的索引值
> order(c(2,1,4,8,5,6))
[1] 2 1 3 5 6 4

#怎么用arrange()?
# sort mtcars data by cylinder and displacement
mtcars[with(mtcars, order(cyl, disp)), ]
# Same result using arrange: no need to use with(), as the context is implicit
# NOTE: plyr functions do NOT preserve row.names
arrange(mtcars, cyl, disp)
# Let's keep the row.names in this example
myCars = cbind(vehicle=row.names(mtcars), mtcars)
arrange(myCars, cyl, disp)
# Sort with displacement in descending order
arrange(myCars, cyl, desc(disp))

#substitute(order(...))不就是返回order(...)嘛。。然后再被eval()求值,达到取子集目的。
#这样便没什么意思了,那自己想个类似的题:自编函数实现order()排序功能。
#order()返回值是索引,还是排序好了的索引
order2 <- function(x){
  index <- data.frame(s=seq(1:length(x)),v=x)
  o <- c()
  for (i in 1:length(x)){
    m <- min(x)
    n <- index[index$v==m,1]
    x <- x[x!=m]
    o[i] <- n
  }
  return(o)
}
> order2(x)
[1] 3 1 2 5 4
> order(x)
[1] 3 1 2 5 4
#实现了,但是如果输入有相同值的话那就凉了。以后慢慢实现
> head(transform(airquality,new=seq(1:nrow(airquality))))
  Ozone Solar.R Wind Temp Month Day new
1    41     190  7.4   67     5   1   1
2    36     118  8.0   72     5   2   2
3    12     149 12.6   74     5   3   3
4    18     313 11.5   62     5   4   4
5    NA      NA 14.3   56     5   5   5
6    28      NA 14.9   66     5   6   6
#等于下面这句
> airquality$new1 <- seq(1:nrow(airquality))
> head(airquality)
  Ozone Solar.R Wind Temp Month Day new1
1    41     190  7.4   67     5   1    1
2    36     118  8.0   72     5   2    2
3    12     149 12.6   74     5   3    3
4    18     313 11.5   62     5   4    4
5    NA      NA 14.3   56     5   5    5
6    28      NA 14.9   66     5   6    6
#log转换
> attach(airquality)
> head(transform(Ozone, logOzone = log(Ozone))) # marginally interesting ...
  X_data logOzone
1     41 3.713572
2     36 3.583519
3     12 2.484907
4     18 2.890372
5     NA       NA
6     28 3.332205
> detach(airquality)
> detach(airquality)
> df <- data.frame(x = 1:5)
> transform(df, x2 = x * x, x3 = x2 * x)
Error in eval(substitute(list(...)), `_data`, parent.frame()) : 
  找不到对象'x2'
> plyr::mutate(df, x2 = x * x, x3 = x2 * x)
  x x2  x3
1 1  1   1
2 2  4   8
3 3  9  27
4 4 16  64
5 5 25 125
#思路也很简单,就是把选取子集和随机取样这两个函数结合起来
#取子集
subset2 <- function(x, condition) {
  condition_call <- substitute(condition)
  r <- eval(condition_call, x, parent.frame())
  x[r, ]
}
#打乱
scramble <- function(x) x[sample(nrow(x)), ]
#组合
subscramble <- function(x, condition) {
  scramble(subset2(x, condition))
}
#运行测试
> sample_df <- data.frame(a = 1:5, b = 5:1, c = c(5, 3, 1, 4, 1))
> subscramble(sample_df, a >= 4)
 Error in a >= 4 : 只能比较(5)基元或串列种类 
5.
eval(condition_call, x, parent.frame()) 
4.
eval(condition_call, x, parent.frame()) 
3.
subset2(x, condition) 
2.
scramble(subset2(x, condition)) 
1.
subscramble(sample_df, a >= 4) 
#问题就在于condition没有被真正传入到subset2函数中
#subset2的condition_call其实是condition
> subset2 <- function(x, condition) {
+   condition_call <- substitute(condition)
+   condition_call
+ }
> subscramble <- function(x, condition) {
+   subset2(x, condition)
+ }
> subscramble(sample_df,a>=4)
condition
#怎么解决?把condition在定义组合函数时直接用substitute()提出来
subset2 <- function(x, condition) {
  r <- eval(condition, x, parent.frame())
  x[r, ]
}

scramble <- function(x) x[sample(nrow(x)), ]

subscramble <- function(x, condition) {
  scramble(subset2(x, substitute(condition)))
}
> subscramble(sample_df,a>=4)
  a b c
5 5 1 1
4 4 2 4
> subscramble(sample_df,a>=4)
  a b c
4 4 2 4
5 5 1 1
library(lattice)
xyplot(mpg ~ disp, data = mtcars)

x <- quote(mpg)
y <- quote(disp)
xyplot(x ~ y, data = mtcars)
#> Error in tmp[subset]: object of type 'symbol' is not subsettable
#其实我们想得到的表达式是
xyplot(mpg ~ disp, data = mtcars)
#求助于substitute()?不行,它只在函数内可以把已知变量的值替换
a <- 1
b <- 2
substitute(a + b + z)
#> a + b + z
#函数内部
f <- function() {
  a <- 1
  b <- 2
  substitute(a + b + z)
}
f()
#> 1 + 2 + z
#函数外pryr::subs()可以实现替换
a <- 1
b <- 2
subs(a + b + z)
#> 1 + 2 + z
#好,现在交互式数据分析就比较舒心了
x <- quote(mpg)
y <- quote(disp)
subs(xyplot(x ~ y, data = mtcars))
#> xyplot(mpg ~ disp, data = mtcars)
#...可传参的原则不变
x <- quote(mpg)
y <- quote(disp)
subs(xyplot(x ~ y, data = mtcars))
#> xyplot(mpg ~ disp, data = mtcars)
#subs()的第二个参数可以重写正在使用的当前环境
subs(a + b, list(a = "y"))
#> "y" + b
subs(a + b, list(a = quote(y)))
#> y + b
subs(a + b, list(a = quote(y())))
#> y() + b
> subs(a + b + c,list("+"=quote(`*`)))
a * b * c
> subs(f(g(a,b),c),list("f"=quote(`*`),"g"=quote(`+`)))
(a + b) * c
> subs(f(a < b, c, d),list("f"=quote(`if`)))
if (a < b) c else d

下课~

上一篇下一篇

猜你喜欢

热点阅读