R语言日常笔记(1)filter函数
2019-07-15 本文已影响320人
柳叶刀与小鼠标
R语言日常笔记(1)filter函数
在处理数据时,过滤数据是最基本的操作之一。 如果想删除一部分无效或者根本不感兴趣的数据。 dplyr有filter()函数来做这样的过滤。 使用dplyr,可以帮助使用者像使用SQL或者传统BI工具以简单且更直观的方式进行过滤。
-
导入数据,这一次主要使用的是flight数据集
rm(list=ls())
library(nycflights13)
library(dplyr)
flights
> flights
# A tibble: 336,776 x 19
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier
<int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
1 2013 1 1 517 515 2 830 819 11 UA
2 2013 1 1 533 529 4 850 830 20 UA
3 2013 1 1 542 540 2 923 850 33 AA
4 2013 1 1 544 545 -1 1004 1022 -18 B6
5 2013 1 1 554 600 -6 812 837 -25 DL
6 2013 1 1 554 558 -4 740 728 12 UA
7 2013 1 1 555 600 -5 913 854 19 B6
8 2013 1 1 557 600 -3 709 723 -14 EV
9 2013 1 1 557 600 -3 838 846 -8 B6
10 2013 1 1 558 600 -2 753 745 8 AA
# ... with 336,766 more rows, and 9 more variables: flight <int>, tailnum <chr>, origin <chr>,
# dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
>
-
挑选carrier列为UA的行(观测值)
flights %>%
select(flight, carrier, origin, dep_delay,
dep_delay, arr_delay, air_time)%>%
filter(carrier == "UA")
# A tibble: 58,665 x 6
flight carrier origin dep_delay arr_delay air_time
<int> <chr> <chr> <dbl> <dbl> <dbl>
1 1545 UA EWR 2 11 227
2 1714 UA LGA 4 20 227
3 1696 UA EWR -4 12 150
4 194 UA JFK -2 7 345
5 1124 UA EWR -2 -14 361
6 1187 UA EWR -1 -8 337
7 1077 UA EWR 0 -17 157
8 303 UA JFK 11 14 366
9 496 UA LGA -4 1 229
10 1665 UA EWR -2 29 366
# ... with 58,655 more rows
-
挑选carrier列为UA同时origin为LGA的行(观测值)
flights %>%
select(flight, carrier, origin, dep_delay,
dep_delay, arr_delay, air_time)%>%
filter(carrier == "UA" & origin == "LGA")
# A tibble: 8,044 x 6
flight carrier origin dep_delay arr_delay air_time
<int> <chr> <chr> <dbl> <dbl> <dbl>
1 1714 UA LGA 4 20 227
2 496 UA LGA -4 1 229
3 883 UA LGA 1 -6 243
4 1092 UA LGA 9 20 135
5 473 UA LGA -4 3 238
6 477 UA LGA 2 -4 249
7 255 UA LGA 1 13 154
8 1004 UA LGA 2 11 237
9 1086 UA LGA 134 145 248
10 405 UA LGA 13 34 256
# ... with 8,034 more rows
>
-
挑选carrier列不为UA同时origin为LGA的行(观测值)
flights %>%
select(flight, carrier, origin, dep_delay,
dep_delay, arr_delay, air_time)%>%
filter(carrier != "UA" & origin == "LGA")
# A tibble: 96,618 x 6
flight carrier origin dep_delay arr_delay air_time
<int> <chr> <chr> <dbl> <dbl> <dbl>
1 461 DL LGA -6 -25 116
2 5708 EV LGA -3 -14 53
3 301 AA LGA -2 8 138
4 707 AA LGA -1 31 257
5 371 B6 LGA 0 -7 152
6 4650 MQ LGA 0 12 134
7 1919 DL LGA -8 -8 170
8 4401 MQ LGA -3 16 105
9 1837 AA LGA 13 5 153
10 4599 MQ LGA -6 10 166
# ... with 96,608 more rows
>
-
挑选carrier列为UA或者AS行(观测值)
flights %>%
select(flight, carrier, origin, dep_delay,
dep_delay, arr_delay, air_time)%>%
filter(carrier %in% c( 'AA' , 'AS' ))%>%
count(carrier)
# A tibble: 2 x 2
carrier n
<chr> <int>
1 AA 32729
2 AS 714
-
挑选carrier列不为UA或者AS行(观测值)
flights %>%
select(flight, carrier, origin, dep_delay,
dep_delay, arr_delay, air_time)%>%
filter(!carrier %in% c( 'AA' , 'AS' ))%>%
count(carrier)
# A tibble: 14 x 2
carrier n
<chr> <int>
1 9E 18460
2 B6 54635
3 DL 48110
4 EV 54173
5 F9 685
6 FL 3260
7 HA 342
8 MQ 26397
9 OO 32
10 UA 58665
11 US 20536
12 VX 5162
13 WN 12275
14 YV 601
>
-
挑选arr_delay列为NA的行(观测值)
flights %>%
select(flight, carrier, origin, dep_delay,
dep_delay, arr_delay, air_time)%>%
filter(is.na(arr_delay))
# A tibble: 9,430 x 6
flight carrier origin dep_delay arr_delay air_time
<int> <chr> <chr> <dbl> <dbl> <dbl>
1 4525 MQ LGA -5 NA NA
2 3806 EV EWR 29 NA NA
3 4413 MQ LGA -5 NA NA
4 1228 UA EWR 29 NA NA
5 3325 9E JFK 59 NA NA
6 4333 EV EWR 22 NA NA
7 4204 EV EWR 46 NA NA
8 4308 EV EWR NA NA NA
9 791 AA LGA NA NA NA
10 1925 AA LGA NA NA NA
# ... with 9,420 more rows
-
挑选arr_delay列不为NA的行(观测值)
flights %>%
select(flight, carrier, origin, dep_delay,
dep_delay, arr_delay, air_time)%>%
filter(!is.na(arr_delay))
# A tibble: 327,346 x 6
flight carrier origin dep_delay arr_delay air_time
<int> <chr> <chr> <dbl> <dbl> <dbl>
1 1545 UA EWR 2 11 227
2 1714 UA LGA 4 20 227
3 1141 AA JFK 2 33 160
4 725 B6 JFK -1 -18 183
5 461 DL LGA -6 -25 116
6 1696 UA EWR -4 12 150
7 507 B6 EWR -5 19 158
8 5708 EV LGA -3 -14 53
9 79 B6 JFK -3 -8 140
10 301 AA LGA -2 8 138
# ... with 327,336 more rows
除了上述的例子,filter函数目前还有filter_all, filter_at,filter_if三个补充函数,这三个函数使得filter函数更加强大。
-
filter_all函数(一般用于使用者想对数据集所有列进行筛选操作时)
> library(ggplot2)
> mtcars %>%
+ head()
mpg cyl disp hp drat wt qsec vs am gear carb
Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
-
挑选任意一列数值大于150的行(观测值)
> mtcars %>%
+ filter_all(any_vars(. > 150))%>%
+ head()
mpg cyl disp hp drat wt qsec vs am gear carb
1 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
2 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
3 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
4 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
5 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
6 14.3 8 360 245 3.21 3.570 15.84 0 0 3 4
>
-
挑选所有列数值均大于的行(观测值)
filter_all(mtcars, all_vars(. >0))
mpg cyl disp hp drat wt qsec vs am gear carb
1 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
2 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
3 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
4 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
5 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
6 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
7 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
-
其他操作
> b <- mtcars[1:8,1:8]
> b
mpg cyl disp hp drat wt qsec vs
Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0
Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0
Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1
Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1
Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0
Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1
Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0
Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1
> filter_all(b, all_vars(. != 0))
#挑选所有列数值均不等于0的行(观测值)
mpg cyl disp hp drat wt qsec vs
1 22.8 4 108.0 93 3.85 2.320 18.61 1
2 21.4 6 258.0 110 3.08 3.215 19.44 1
3 18.1 6 225.0 105 2.76 3.460 20.22 1
4 24.4 4 146.7 62 3.69 3.190 20.00 1
> filter_all(b, any_vars(. != 0))
#挑选有任意列数值不等于0的行(观测值)
mpg cyl disp hp drat wt qsec vs
1 21.0 6 160.0 110 3.90 2.620 16.46 0
2 21.0 6 160.0 110 3.90 2.875 17.02 0
3 22.8 4 108.0 93 3.85 2.320 18.61 1
4 21.4 6 258.0 110 3.08 3.215 19.44 1
5 18.7 8 360.0 175 3.15 3.440 17.02 0
6 18.1 6 225.0 105 2.76 3.460 20.22 1
7 14.3 8 360.0 245 3.21 3.570 15.84 0
8 24.4 4 146.7 62 3.69 3.190 20.00 1
> filter_all(b, any_vars(. >200))
#挑选有所有列数值大于200的行(观测值)
mpg cyl disp hp drat wt qsec vs
1 21.4 6 258 110 3.08 3.215 19.44 1
2 18.7 8 360 175 3.15 3.440 17.02 0
3 18.1 6 225 105 2.76 3.460 20.22 1
4 14.3 8 360 245 3.21 3.570 15.84 0
-
filter_at函数(一般用于使用者想对数据集中的符合条件的列进行筛选操作时(注:虽然是以特定列进行筛选的,但是最后呈现的数据还是全部列)
> mtcars %>%
+ filter_at(vars(starts_with("d")), any_vars((. %% 2) == 0))
mpg cyl disp hp drat wt qsec vs am gear carb
1 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
2 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
3 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
4 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
5 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
6 14.3 8 360 245 3.21 3.570 15.84 0 0 3 4
7 10.4 8 472 205 2.93 5.250 17.98 0 0 3 4
8 10.4 8 460 215 3.00 5.424 17.82 0 0 3 4
9 14.7 8 440 230 3.23 5.345 17.42 0 0 3 4
10 15.5 8 318 150 2.76 3.520 16.87 0 0 3 2
11 15.2 8 304 150 3.15 3.435 17.30 0 0 3 2
12 13.3 8 350 245 3.73 3.840 15.41 0 0 3 4
13 19.2 8 400 175 3.08 3.845 17.05 0 0 3 2
-
filter_if函数(一般用于使用者想对数据集中的符合条件的列进行转换,然后进行观测值的筛选时)。
> b <- mtcars[1:8,1:8]
> b
mpg cyl disp hp drat wt qsec vs
Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0
Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0
Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1
Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1
Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0
Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1
Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0
Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1
#首先将b的所有列的值转换成整数,然后筛选出其中不等0的观测值
> filter_if(b, ~ all(floor(.) == .),all_vars(. != 0))
mpg cyl disp hp drat wt qsec vs
1 22.8 4 108.0 93 3.85 2.320 18.61 1
2 21.4 6 258.0 110 3.08 3.215 19.44 1
3 18.1 6 225.0 105 2.76 3.460 20.22 1
4 24.4 4 146.7 62 3.69 3.190 20.00 1
-
当有多个列需要进行条件筛选时(下列两种方法结果一致,都是基于mpg和disp进行行筛选)
> mtcars %>%
+ head()
mpg cyl disp hp drat wt qsec vs am gear carb
Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
>
>
> vars <- c("mpg", "disp")
> cond <- c(19, 140)
> mtcars %>%
+ as_tibble()%>%
+ filter(
+ .data[[vars[[1]]]] > cond[[1]],
+ .data[[vars[[2]]]] > cond[[2]]
+ )
# A tibble: 8 x 11
mpg cyl disp hp drat wt qsec vs am gear carb
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 21 6 160 110 3.9 2.62 16.5 0 1 4 4
2 21 6 160 110 3.9 2.88 17.0 0 1 4 4
3 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1
4 24.4 4 147. 62 3.69 3.19 20 1 0 4 2
5 22.8 4 141. 95 3.92 3.15 22.9 1 0 4 2
6 19.2 6 168. 123 3.92 3.44 18.3 1 0 4 4
7 19.2 8 400 175 3.08 3.84 17.0 0 0 3 2
8 19.7 6 145 175 3.62 2.77 15.5 0 1 5 6
>
>
>
> mtcars%>%
+ as_tibble()%>%
+ filter( mpg> 19 & disp > 140)
# A tibble: 8 x 11
mpg cyl disp hp drat wt qsec vs am gear carb
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 21 6 160 110 3.9 2.62 16.5 0 1 4 4
2 21 6 160 110 3.9 2.88 17.0 0 1 4 4
3 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1
4 24.4 4 147. 62 3.69 3.19 20 1 0 4 2
5 22.8 4 141. 95 3.92 3.15 22.9 1 0 4 2
6 19.2 6 168. 123 3.92 3.44 18.3 1 0 4 4
7 19.2 8 400 175 3.08 3.84 17.0 0 0 3 2
8 19.7 6 145 175 3.62 2.77 15.5 0 1 5 6