R语言学习--使用dplyr进行数据转换

2021-01-14 本文已影响0人 PhageNanoenzyme

dplyr是生信技能树生信爆款入门课程R语言部分Day7讲到的一个重要知识点。为加深理解，现在找个数据集做下练习巩固。

函数简介

tidyverse是为数据科学设计的R软件包集合，它包含ggplot2、dplyr、tidyr、stringr、magrittr、tibble等一系列热门软件包。首先学习下dplyr和管道符号。

准备并查看测试数据

> 查看数据
> library(tidyverse) 
> set.seed(123)
> diamonds <- diamonds[sample(nrow(diamonds), 10),]
> head(diamonds)
   carat       cut color clarity depth table price    x    y    z
3   0.31     Ideal     D     VS1  61.6    55   713 4.30 4.33 2.66
10  1.10 Very Good     I     SI1  61.2    61  4640 6.61 6.66 4.01
2   0.70     Ideal     G     VS1  60.8    56  3300 5.73 5.80 3.51
8   0.70      Good     H     SI1  64.2    58  1771 5.59 5.62 3.60
6   0.83      Good     E     SI1  63.7    59  3250 5.95 5.89 3.77
9   0.40     Ideal     E     VS1  61.6    56  1053 4.73 4.78 2.93  
> diamonds <- as.data.frame(diamonds)
> attributes(diamonds) #查看数据属性
$names
 [1] "carat"   "cut"     "color"   "clarity" "depth"   "table"   "price"   "x"       "y"      
[10] "z"      

$row.names
 [1]  3 10  2  8  6  9  1  7  5  4

$class
[1] "data.frame"

> 可以看到数据有10列，10行，数据类型为数据框；
> unique(diamonds$cut)
[1] Ideal     Very Good Good     
Levels: Fair < Good < Very Good < Premium < Ideal

1 按名称选取carat,cut,price列

> select(diamonds,carat,cut,price)
   carat       cut price
3   0.31     Ideal   713
10  1.10 Very Good  4640
2   0.70     Ideal  3300
8   0.70      Good  1771
6   0.83      Good  3250
9   0.40     Ideal  1053
1   0.73     Ideal  2397
7   0.51 Very Good  1668
5   0.31     Ideal   987
4   0.31     Ideal   707
> p <- select(diamonds,carat,cut,price)
> 接着用此数据进行一个最基础的可视化：
> ggplot(p,aes(carat,price))+
+   geom_point(aes(color=cut),size=2)

image.png

> select选择两列之间的所有列
> select(diamonds,carat:price)
   carat       cut color clarity depth table price
3   0.31     Ideal     D     VS1  61.6    55   713
10  1.10 Very Good     I     SI1  61.2    61  4640
2   0.70     Ideal     G     VS1  60.8    56  3300
8   0.70      Good     H     SI1  64.2    58  1771
6   0.83      Good     E     SI1  63.7    59  3250
9   0.40     Ideal     E     VS1  61.6    56  1053
1   0.73     Ideal     I     VS1  60.7    56  2397
7   0.51 Very Good     D     VS2  62.5    58  1668
5   0.31     Ideal     E      IF  60.9    55   987
4   0.31     Ideal     H    VVS1  62.2    56   707
> select选择不在两列之间的所有列
> select(diamonds,-(carat:price))
      x    y    z
3  4.30 4.33 2.66
10 6.61 6.66 4.01
2  5.73 5.80 3.51
8  5.59 5.62 3.60
6  5.95 5.89 3.77
9  4.73 4.78 2.93
1  5.85 5.81 3.54
7  5.12 5.18 3.22
5  4.39 4.41 2.68
4  4.34 4.37 2.71

2 filter（按carat >=0.5,price >=3000筛选行）

> filter(diamonds,carat >=0.5,price >=3000)
  carat       cut color clarity depth table price    x    y    z
1  1.10 Very Good     I     SI1  61.2    61  4640 6.61 6.66 4.01
2  0.70     Ideal     G     VS1  60.8    56  3300 5.73 5.80 3.51
3  0.83      Good     E     SI1  63.7    59  3250 5.95 5.89 3.77

3 根据price的数据进行排序，默认为升序

提示：arrange（改变行顺序）
> arrange(diamonds,price)
   carat       cut color clarity depth table price    x    y    z
1   0.31     Ideal     H    VVS1  62.2    56   707 4.34 4.37 2.71
2   0.31     Ideal     D     VS1  61.6    55   713 4.30 4.33 2.66
3   0.31     Ideal     E      IF  60.9    55   987 4.39 4.41 2.68
4   0.40     Ideal     E     VS1  61.6    56  1053 4.73 4.78 2.93
5   0.51 Very Good     D     VS2  62.5    58  1668 5.12 5.18 3.22
6   0.70      Good     H     SI1  64.2    58  1771 5.59 5.62 3.60
7   0.73     Ideal     I     VS1  60.7    56  2397 5.85 5.81 3.54
8   0.83      Good     E     SI1  63.7    59  3250 5.95 5.89 3.77
9   0.70     Ideal     G     VS1  60.8    56  3300 5.73 5.80 3.51
10  1.10 Very Good     I     SI1  61.2    61  4640 6.61 6.66 4.01
> #desc()可以按列进行降序排序：
> arrange(diamonds,desc(price))
   carat       cut color clarity depth table price    x    y    z
1   1.10 Very Good     I     SI1  61.2    61  4640 6.61 6.66 4.01
2   0.70     Ideal     G     VS1  60.8    56  3300 5.73 5.80 3.51
3   0.83      Good     E     SI1  63.7    59  3250 5.95 5.89 3.77
4   0.73     Ideal     I     VS1  60.7    56  2397 5.85 5.81 3.54
5   0.70      Good     H     SI1  64.2    58  1771 5.59 5.62 3.60
6   0.51 Very Good     D     VS2  62.5    58  1668 5.12 5.18 3.22
7   0.40     Ideal     E     VS1  61.6    56  1053 4.73 4.78 2.93
8   0.31     Ideal     E      IF  60.9    55   987 4.39 4.41 2.68
9   0.31     Ideal     D     VS1  61.6    55   713 4.30 4.33 2.66
10  0.31     Ideal     H    VVS1  62.2    56   707 4.34 4.37 2.71

4 将列名price改为prices

> 提示：rename(更改列名称)新名称在前，原始名称在后
> rename(diamonds,prices=price)
   carat       cut color clarity depth table prices    x    y    z
3   0.31     Ideal     D     VS1  61.6    55    713 4.30 4.33 2.66
10  1.10 Very Good     I     SI1  61.2    61   4640 6.61 6.66 4.01
2   0.70     Ideal     G     VS1  60.8    56   3300 5.73 5.80 3.51
8   0.70      Good     H     SI1  64.2    58   1771 5.59 5.62 3.60
6   0.83      Good     E     SI1  63.7    59   3250 5.95 5.89 3.77
9   0.40     Ideal     E     VS1  61.6    56   1053 4.73 4.78 2.93
1   0.73     Ideal     I     VS1  60.7    56   2397 5.85 5.81 3.54
7   0.51 Very Good     D     VS2  62.5    58   1668 5.12 5.18 3.22
5   0.31     Ideal     E      IF  60.9    55    987 4.39 4.41 2.68
4   0.31     Ideal     H    VVS1  62.2    56    707 4.34 4.37 2.71

5 添加两列，group ="A",Length=10

mutate（添加新列）
> mutate(diamonds,group ="A",Length=10)
   carat       cut color clarity depth table price    x    y    z group Length
1   0.31     Ideal     D     VS1  61.6    55   713 4.30 4.33 2.66     A     10
2   1.10 Very Good     I     SI1  61.2    61  4640 6.61 6.66 4.01     A     10
3   0.70     Ideal     G     VS1  60.8    56  3300 5.73 5.80 3.51     A     10
4   0.70      Good     H     SI1  64.2    58  1771 5.59 5.62 3.60     A     10
5   0.83      Good     E     SI1  63.7    59  3250 5.95 5.89 3.77     A     10
6   0.40     Ideal     E     VS1  61.6    56  1053 4.73 4.78 2.93     A     10
7   0.73     Ideal     I     VS1  60.7    56  2397 5.85 5.81 3.54     A     10
8   0.51 Very Good     D     VS2  62.5    58  1668 5.12 5.18 3.22     A     10
9   0.31     Ideal     E      IF  60.9    55   987 4.39 4.41 2.68     A     10
10  0.31     Ideal     H    VVS1  62.2    56   707 4.34 4.37 2.71     A     10

6 使用summarize求price的平均值，carat的标准差。

> # summarize它可以将数据框折叠成一行
>            
> summarize(diamonds,mean(price),
+                      sd(carat))
  mean(price) sd(carat)
1      2048.6 0.2664999

7 使用group_by()按cut分组，求每组price的平均值

> group_by可以将分析单位从整个数据集更改为单个分组            
> diamonds %>% group_by(cut) %>% 
+              summarize(m = mean(price,na.rm=T))
`summarise()` ungrouping output (override with `.groups` argument)
# A tibble: 3 x 2
  cut           m
  <ord>     <dbl>
1 Good      2510.
2 Very Good 3154 
3 Ideal     1526.
> na.rm=T 表示移除缺失数据

8 将下面的代码使用%>%(管道)符号重写

> p1 <- filter(diamonds,carat >=0.5,price >=3000)
> p1
  carat       cut color clarity depth table price    x    y    z
1  1.10 Very Good     I     SI1  61.2    61  4640 6.61 6.66 4.01
2  0.70     Ideal     G     VS1  60.8    56  3300 5.73 5.80 3.51
3  0.83      Good     E     SI1  63.7    59  3250 5.95 5.89 3.77

> p2 <- group_by(p1,cut)
> p2
# A tibble: 3 x 10
# Groups:   cut [3]
  carat cut       color clarity depth table price     x     y     z
  <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
1  1.1  Very Good I     SI1      61.2    61  4640  6.61  6.66  4.01
2  0.7  Ideal     G     VS1      60.8    56  3300  5.73  5.8   3.51
3  0.83 Good      E     SI1      63.7    59  3250  5.95  5.89  3.77

> p3 <- filter(p2,cut=='Premium')
> p3
# A tibble: 0 x 10
# Groups:   cut [0]
# ... with 10 variables: carat <dbl>, cut <ord>, color <ord>, clarity <ord>, depth <dbl>,
#   table <dbl>, price <int>, x <dbl>, y <dbl>, z <dbl>
> ggplot(p3,aes(carat,price))+
+ geom_point(aes(color=cut),size=2)

使用管道符号

> diamonds %>% 
+   filter(carat >=0.5,price >=3000) %>%
+ group_by(cut) %>% 
+   filter(cut=='Premium') %>%
+ ggplot(aes(carat,price))+
+ geom_point(aes(color=cut),size=2)
> 
> # 这2段代码结果相同（注意：本示例抽样数据中没有cut为Premium的钻石，筛选结果为0行，因此两段代码画出的都是空图），可以明显看到使用了%>%减少了中间变量，提高了代码的可阅读性
> # diamonds %>% 
> #   filter(.,carat >=0.5,price >=3000)
> # 管道的原理就是将%>%左边的变量传递到右边函数中“.”所在的位置（默认为第一个参数），通常在正式书写时可省略“.”。

9 使用count() 计算cut每组值的次数

> diamonds %>% count(cut)
        cut n
1      Good 2
2 Very Good 2
3     Ideal 6

10 判断是否存在1克拉、价格为5000的钻石

> filter(diamonds,carat == 1,price == 5000)
 [1] carat   cut     color   clarity depth   table   price   x       y       z      
<0 行> (或0-长度的row.names)
> 返回0行，说明不存在。

R语言学习--使用dplyr进行数据转换

函数简介

准备并查看测试数据

1 按名称选取carat,cut,price列

2 filter（按carat >=0.5,price >=3000筛选行）

3 根据price的数据进行排序，默认为升序

4 将列名price改为prices

5 添加两列，group ="A",Length=10

6 使用summarize求price的平均值，carat的标准差。

7 使用group_by()按cut分组，求每组price的平均值

8 将下面的代码使用%>%(管道)符号重写

使用管道符号

9 使用count() 计算cut每组值的次数

10 判断是否存在1克拉、价格为5000的钻石

猜你喜欢

热点阅读