生物信息学从零开始学R. python新手日记R语言从零开始

R语言日常笔记(1)filter函数

2019-07-15  本文已影响320人  柳叶刀与小鼠标

R语言日常笔记(1)filter函数

在处理数据时,过滤数据是最基本的操作之一。 如果想删除一部分无效或者根本不感兴趣的数据。 dplyr有filter()函数来做这样的过滤。 使用dplyr,可以帮助使用者像使用SQL或者传统BI工具以简单且更直观的方式进行过滤。

rm(list=ls())

library(nycflights13)

library(dplyr)

flights
> flights
# A tibble: 336,776 x 19
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>     <dbl> <chr>  
 1  2013     1     1      517            515         2      830            819        11 UA     
 2  2013     1     1      533            529         4      850            830        20 UA     
 3  2013     1     1      542            540         2      923            850        33 AA     
 4  2013     1     1      544            545        -1     1004           1022       -18 B6     
 5  2013     1     1      554            600        -6      812            837       -25 DL     
 6  2013     1     1      554            558        -4      740            728        12 UA     
 7  2013     1     1      555            600        -5      913            854        19 B6     
 8  2013     1     1      557            600        -3      709            723       -14 EV     
 9  2013     1     1      557            600        -3      838            846        -8 B6     
10  2013     1     1      558            600        -2      753            745         8 AA     
# ... with 336,766 more rows, and 9 more variables: flight <int>, tailnum <chr>, origin <chr>,
#   dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
> 
flights %>% 
  select(flight, carrier, origin,  dep_delay,
         dep_delay, arr_delay, air_time)%>% 
  filter(carrier == "UA")
# A tibble: 58,665 x 6
   flight carrier origin dep_delay arr_delay air_time
    <int> <chr>   <chr>      <dbl>     <dbl>    <dbl>
 1   1545 UA      EWR            2        11      227
 2   1714 UA      LGA            4        20      227
 3   1696 UA      EWR           -4        12      150
 4    194 UA      JFK           -2         7      345
 5   1124 UA      EWR           -2       -14      361
 6   1187 UA      EWR           -1        -8      337
 7   1077 UA      EWR            0       -17      157
 8    303 UA      JFK           11        14      366
 9    496 UA      LGA           -4         1      229
10   1665 UA      EWR           -2        29      366
# ... with 58,655 more rows
flights %>% 
  select(flight, carrier, origin,  dep_delay,
         dep_delay, arr_delay, air_time)%>% 
  filter(carrier == "UA" & origin == "LGA")
# A tibble: 8,044 x 6
   flight carrier origin dep_delay arr_delay air_time
    <int> <chr>   <chr>      <dbl>     <dbl>    <dbl>
 1   1714 UA      LGA            4        20      227
 2    496 UA      LGA           -4         1      229
 3    883 UA      LGA            1        -6      243
 4   1092 UA      LGA            9        20      135
 5    473 UA      LGA           -4         3      238
 6    477 UA      LGA            2        -4      249
 7    255 UA      LGA            1        13      154
 8   1004 UA      LGA            2        11      237
 9   1086 UA      LGA          134       145      248
10    405 UA      LGA           13        34      256
# ... with 8,034 more rows
> 
flights %>% 
  select(flight, carrier, origin,  dep_delay,
         dep_delay, arr_delay, air_time)%>% 
  filter(carrier != "UA" & origin == "LGA")
# A tibble: 96,618 x 6
   flight carrier origin dep_delay arr_delay air_time
    <int> <chr>   <chr>      <dbl>     <dbl>    <dbl>
 1    461 DL      LGA           -6       -25      116
 2   5708 EV      LGA           -3       -14       53
 3    301 AA      LGA           -2         8      138
 4    707 AA      LGA           -1        31      257
 5    371 B6      LGA            0        -7      152
 6   4650 MQ      LGA            0        12      134
 7   1919 DL      LGA           -8        -8      170
 8   4401 MQ      LGA           -3        16      105
 9   1837 AA      LGA           13         5      153
10   4599 MQ      LGA           -6        10      166
# ... with 96,608 more rows
> 
flights %>% 
  select(flight, carrier, origin,  dep_delay,
         dep_delay, arr_delay, air_time)%>% 
  filter(carrier %in% c( 'AA'   , 'AS'  ))%>%
  count(carrier)
# A tibble: 2 x 2
  carrier     n
  <chr>   <int>
1 AA      32729
2 AS        714
flights %>% 
  select(flight, carrier, origin,  dep_delay,
         dep_delay, arr_delay, air_time)%>% 
  filter(!carrier %in% c( 'AA'   , 'AS'  ))%>%
  count(carrier)
# A tibble: 14 x 2
   carrier     n
   <chr>   <int>
 1 9E      18460
 2 B6      54635
 3 DL      48110
 4 EV      54173
 5 F9        685
 6 FL       3260
 7 HA        342
 8 MQ      26397
 9 OO         32
10 UA      58665
11 US      20536
12 VX       5162
13 WN      12275
14 YV        601
> 
flights %>% 
  select(flight, carrier, origin,  dep_delay,
         dep_delay, arr_delay, air_time)%>% 
  filter(is.na(arr_delay))
# A tibble: 9,430 x 6
   flight carrier origin dep_delay arr_delay air_time
    <int> <chr>   <chr>      <dbl>     <dbl>    <dbl>
 1   4525 MQ      LGA           -5        NA       NA
 2   3806 EV      EWR           29        NA       NA
 3   4413 MQ      LGA           -5        NA       NA
 4   1228 UA      EWR           29        NA       NA
 5   3325 9E      JFK           59        NA       NA
 6   4333 EV      EWR           22        NA       NA
 7   4204 EV      EWR           46        NA       NA
 8   4308 EV      EWR           NA        NA       NA
 9    791 AA      LGA           NA        NA       NA
10   1925 AA      LGA           NA        NA       NA
# ... with 9,420 more rows

flights %>% 
  select(flight, carrier, origin,  dep_delay,
         dep_delay, arr_delay, air_time)%>% 
  filter(!is.na(arr_delay))
# A tibble: 327,346 x 6
   flight carrier origin dep_delay arr_delay air_time
    <int> <chr>   <chr>      <dbl>     <dbl>    <dbl>
 1   1545 UA      EWR            2        11      227
 2   1714 UA      LGA            4        20      227
 3   1141 AA      JFK            2        33      160
 4    725 B6      JFK           -1       -18      183
 5    461 DL      LGA           -6       -25      116
 6   1696 UA      EWR           -4        12      150
 7    507 B6      EWR           -5        19      158
 8   5708 EV      LGA           -3       -14       53
 9     79 B6      JFK           -3        -8      140
10    301 AA      LGA           -2         8      138
# ... with 327,336 more rows

除了上述的例子,filter函数目前还有filter_all, filter_at,filter_if三个补充函数,这三个函数使得filter函数更加强大。

> library(ggplot2)
> mtcars  %>%
+   head()
                   mpg cyl disp  hp drat    wt  qsec vs am gear carb
Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1
>  mtcars  %>% 
+       filter_all(any_vars(. > 150))%>% 
+       head()
   mpg cyl disp  hp drat    wt  qsec vs am gear carb
1 21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
2 21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
3 21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
4 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
5 18.1   6  225 105 2.76 3.460 20.22  1  0    3    1
6 14.3   8  360 245 3.21 3.570 15.84  0  0    3    4
> 
 filter_all(mtcars, all_vars(. >0))
   mpg cyl  disp  hp drat    wt  qsec vs am gear carb
1 22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
2 32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
3 30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
4 33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
5 27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1
6 30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2
7 21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2
> b <- mtcars[1:8,1:8]
> b
                   mpg cyl  disp  hp drat    wt  qsec vs
Mazda RX4         21.0   6 160.0 110 3.90 2.620 16.46  0
Mazda RX4 Wag     21.0   6 160.0 110 3.90 2.875 17.02  0
Datsun 710        22.8   4 108.0  93 3.85 2.320 18.61  1
Hornet 4 Drive    21.4   6 258.0 110 3.08 3.215 19.44  1
Hornet Sportabout 18.7   8 360.0 175 3.15 3.440 17.02  0
Valiant           18.1   6 225.0 105 2.76 3.460 20.22  1
Duster 360        14.3   8 360.0 245 3.21 3.570 15.84  0
Merc 240D         24.4   4 146.7  62 3.69 3.190 20.00  1
> filter_all(b, all_vars(. != 0))
#挑选所有列数值均不等于0的行(观测值)
   mpg cyl  disp  hp drat    wt  qsec vs
1 22.8   4 108.0  93 3.85 2.320 18.61  1
2 21.4   6 258.0 110 3.08 3.215 19.44  1
3 18.1   6 225.0 105 2.76 3.460 20.22  1
4 24.4   4 146.7  62 3.69 3.190 20.00  1
> filter_all(b, any_vars(. != 0))
#挑选有任意列数值不等于0的行(观测值)
   mpg cyl  disp  hp drat    wt  qsec vs
1 21.0   6 160.0 110 3.90 2.620 16.46  0
2 21.0   6 160.0 110 3.90 2.875 17.02  0
3 22.8   4 108.0  93 3.85 2.320 18.61  1
4 21.4   6 258.0 110 3.08 3.215 19.44  1
5 18.7   8 360.0 175 3.15 3.440 17.02  0
6 18.1   6 225.0 105 2.76 3.460 20.22  1
7 14.3   8 360.0 245 3.21 3.570 15.84  0
8 24.4   4 146.7  62 3.69 3.190 20.00  1
> filter_all(b, any_vars(. >200))
#挑选有所有列数值大于200的行(观测值)
   mpg cyl disp  hp drat    wt  qsec vs
1 21.4   6  258 110 3.08 3.215 19.44  1
2 18.7   8  360 175 3.15 3.440 17.02  0
3 18.1   6  225 105 2.76 3.460 20.22  1
4 14.3   8  360 245 3.21 3.570 15.84  0
> mtcars  %>% 
+     filter_at(vars(starts_with("d")), any_vars((. %% 2) == 0))
    mpg cyl disp  hp drat    wt  qsec vs am gear carb
1  21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
2  21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
3  22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
4  21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
5  18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
6  14.3   8  360 245 3.21 3.570 15.84  0  0    3    4
7  10.4   8  472 205 2.93 5.250 17.98  0  0    3    4
8  10.4   8  460 215 3.00 5.424 17.82  0  0    3    4
9  14.7   8  440 230 3.23 5.345 17.42  0  0    3    4
10 15.5   8  318 150 2.76 3.520 16.87  0  0    3    2
11 15.2   8  304 150 3.15 3.435 17.30  0  0    3    2
12 13.3   8  350 245 3.73 3.840 15.41  0  0    3    4
13 19.2   8  400 175 3.08 3.845 17.05  0  0    3    2

> b <- mtcars[1:8,1:8]
> b
                   mpg cyl  disp  hp drat    wt  qsec vs
Mazda RX4         21.0   6 160.0 110 3.90 2.620 16.46  0
Mazda RX4 Wag     21.0   6 160.0 110 3.90 2.875 17.02  0
Datsun 710        22.8   4 108.0  93 3.85 2.320 18.61  1
Hornet 4 Drive    21.4   6 258.0 110 3.08 3.215 19.44  1
Hornet Sportabout 18.7   8 360.0 175 3.15 3.440 17.02  0
Valiant           18.1   6 225.0 105 2.76 3.460 20.22  1
Duster 360        14.3   8 360.0 245 3.21 3.570 15.84  0
Merc 240D         24.4   4 146.7  62 3.69 3.190 20.00  1
#首先将b的所有列的值转换成整数,然后筛选出其中不等0的观测值
> filter_if(b, ~ all(floor(.) == .),all_vars(. != 0))
   mpg cyl  disp  hp drat    wt  qsec vs
1 22.8   4 108.0  93 3.85 2.320 18.61  1
2 21.4   6 258.0 110 3.08 3.215 19.44  1
3 18.1   6 225.0 105 2.76 3.460 20.22  1
4 24.4   4 146.7  62 3.69 3.190 20.00  1

> mtcars %>%
+   head()
                   mpg cyl disp  hp drat    wt  qsec vs am gear carb
Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1
> 
> 
> vars <- c("mpg", "disp")
> cond <- c(19, 140)
> mtcars %>%
+   as_tibble()%>%
+   filter(
+     .data[[vars[[1]]]] > cond[[1]],
+     .data[[vars[[2]]]] > cond[[2]]
+   )
# A tibble: 8 x 11
    mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1  21       6  160    110  3.9   2.62  16.5     0     1     4     4
2  21       6  160    110  3.9   2.88  17.0     0     1     4     4
3  21.4     6  258    110  3.08  3.22  19.4     1     0     3     1
4  24.4     4  147.    62  3.69  3.19  20       1     0     4     2
5  22.8     4  141.    95  3.92  3.15  22.9     1     0     4     2
6  19.2     6  168.   123  3.92  3.44  18.3     1     0     4     4
7  19.2     8  400    175  3.08  3.84  17.0     0     0     3     2
8  19.7     6  145    175  3.62  2.77  15.5     0     1     5     6
> 
> 
> 
> mtcars%>%
+   as_tibble()%>%
+   filter( mpg> 19  & disp > 140)
# A tibble: 8 x 11
    mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1  21       6  160    110  3.9   2.62  16.5     0     1     4     4
2  21       6  160    110  3.9   2.88  17.0     0     1     4     4
3  21.4     6  258    110  3.08  3.22  19.4     1     0     3     1
4  24.4     4  147.    62  3.69  3.19  20       1     0     4     2
5  22.8     4  141.    95  3.92  3.15  22.9     1     0     4     2
6  19.2     6  168.   123  3.92  3.44  18.3     1     0     4     4
7  19.2     8  400    175  3.08  3.84  17.0     0     0     3     2
8  19.7     6  145    175  3.62  2.77  15.5     0     1     5     6
上一篇下一篇

猜你喜欢

热点阅读