生物信息学R语言源码R语言作图GSE实战

Tidyverse中使用函数

2019-11-21  本文已影响0人  小梦游仙境

学习tidyverse包前,先对tibble数据类型进行简单的了解,因为下面生成的结果都是以tibble展示的。

tibble是R语言中一个用来替换data.frame类型的扩展的数据框,tibble继承了data.frame,是弱类型的,同时与data.frame有相同的语法,使用起来更方便。tibble包,也是由Hadley开发的R包。

tibble对data.frame做了重新的设定:不会自动转换输入的数据类型(例如不会把字符串转换为因子),不会修改变量名,也不会创建行名;打印时默认只显示前10行以及屏幕能容纳的列,并标出每一列的数据类型。

1.tidyverse包中需要注意的三个数据操作函数

2.加载tidyverse(其中包含dplyr)和gapminder数据集

library(tidyverse)
library(gapminder)

3.复制一个示例数据集gapminder

# Work on a copy of gapminder; wrapping the assignment in parentheses
# also prints the assigned value.
(my_gap <- gapminder)
image-20191121221058347
# Keep only the rows whose country is "Canada"
my_gap %>% filter(country == "Canada")
image-20191121221105923
# Store the filtered rows in a new variable
my_precious <- my_gap %>% filter(country == "Canada")
image-20191121221114449

4.使用mutate()添加新变量

假如现在想要新添加一列,计算一个国家的GDP,即人口数量*人均GDP

# Add a gdp column: population times GDP per capita.
# The result is only printed; my_gap itself is not reassigned here.
my_gap %>%
  mutate(gdp = pop * gdpPercap)
image-20191121221124007
# Pull out Canada's rows; they supply the per-year reference gdpPercap.
ctib <- my_gap %>%
  filter(country == "Canada")
# Express every country's gdpPercap relative to Canada's in the same year.
# Canada's value is looked up by year with match() instead of recycling the
# Canadian column with rep(): recycling is only correct when every country
# has exactly the same years in exactly the same row order, and silently
# produces wrong ratios otherwise. A year missing from ctib yields NA.
my_gap <- my_gap %>%
  mutate(gdpPercapRel = gdpPercap / ctib$gdpPercap[match(year, ctib$year)])
image-20191121221141860
# Sanity check: gdpPercapRel is gdpPercap divided by Canada's value for the
# same year, so Canada's own rows should all show 1.
my_gap %>% 
  filter(country == "Canada") %>% 
  select(country, year, gdpPercapRel)

<img src="https://tva1.sinaimg.cn/large/006y8mN6gy1g960nthrfyj30u00ud1hy.jpg" alt="image-20191121221150989" style="zoom:50%;" />

# Distribution of GDP per capita relative to Canada.
# Printed output is kept as #> comments so the snippet stays runnable.
summary(my_gap$gdpPercapRel)
#>     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
#> 0.007236 0.061648 0.171521 0.326659 0.446564 9.534690 

5.arrange()来对数据进行行顺序排序

重新排序数据中的行

# Print the data in its current row order (console prompt removed so the
# line can be run as-is).
my_gap
image-20191121221208503
# Sort the rows by year, breaking ties by country
my_gap %>%
  arrange(year, country)
image-20191120181311968

如果只想显示2007年的数据,并且按照lifeExp从小到大排序

# Keep only the 2007 rows, then sort them by lifeExp in ascending order.
# The console prompts ("> " and "+ ") were removed so the snippet can be
# pasted and run directly.
my_gap %>%
  filter(year == 2007) %>%
  arrange(lifeExp)
image-20191120181421039

6.用rename()重命名变量

# rename() takes new_name = old_name pairs. The renamed tibble is only
# printed here; my_gap is not reassigned.
my_gap %>%
  rename(life_exp = lifeExp,
         gdp_percap = gdpPercap,
         gdp_percap_rel = gdpPercapRel)
image-20191120182003646

7.select()重命名和重定位变量

# Burundi rows after 1996: the first select() keeps three columns and renames
# year to yr; the second moves gdpPercap to the front, with everything()
# standing for all remaining columns.
my_gap %>%
  filter(country == "Burundi", year > 1996) %>% 
  select(yr = year, lifeExp, gdpPercap) %>% 
  select(gdpPercap, everything())
image-20191120182254490

8.强大工具group_by()

8.1计数
# Count the rows in each continent. Console prompts were removed and the
# printed output is kept as #> comments so the snippet can be run as-is.
my_gap %>%
  group_by(continent) %>%
  summarize(n = n())
#> # A tibble: 5 x 2
#>   continent     n
#>   <fct>     <int>
#> 1 Africa      624
#> 2 Americas    300
#> 3 Asia        396
#> 4 Europe      360
#> 5 Oceania      24
# tally() counts the rows of each group (same counts as summarize(n = n()))
my_gap %>%
  group_by(continent) %>%
  tally()
#> # A tibble: 5 x 2
#>   continent     n
#>   <fct>     <int>
#> 1 Africa      624
#> 2 Americas    300
#> 3 Asia        396
#> 4 Europe      360
#> 5 Oceania      24
# count() groups by continent and counts rows in a single call
my_gap %>% 
  count(continent)
#> # A tibble: 5 x 2
#>   continent     n
#>   <fct>     <int>
#> 1 Africa      624
#> 2 Americas    300
#> 3 Asia        396
#> 4 Europe      360
#> 5 Oceania      24
# Per continent: number of rows (n) and number of distinct countries
my_gap %>%
  group_by(continent) %>%
  summarize(n = n(),
            n_countries = n_distinct(country))
#> # A tibble: 5 x 3
#>   continent     n n_countries
#>   <fct>     <int>       <int>
#> 1 Africa      624          52
#> 2 Americas    300          25
#> 3 Asia        396          33
#> 4 Europe      360          30
#> 5 Oceania      24           2
8.2 统计一般数据

summarize()可以将经典的数据统计结果如: mean(), median(), var(), sd(), mad(), IQR(), min(), max()一同输出

# Mean life expectancy per continent, over all years and countries
my_gap %>%
  group_by(continent) %>%
  summarize(avg_lifeExp = mean(lifeExp))
#> # A tibble: 5 x 2
#>   continent avg_lifeExp
#>   <fct>           <dbl>
#> 1 Africa           48.9
#> 2 Americas         64.7
#> 3 Asia             60.1
#> 4 Europe           71.9
#> 5 Oceania          74.3

summarize_at()可以对选定的多个变量同时应用同一组函数,每个"变量_函数"组合输出一列

# For 1952 and 2007, compute the mean and median of lifeExp and gdpPercap
# within each continent/year group; each variable/function pair becomes one
# output column (e.g. lifeExp_mean).
# NOTE: in dplyr >= 1.0 the *_at() variants are superseded by across().
my_gap %>%
  filter(year %in% c(1952, 2007)) %>%
  group_by(continent, year) %>%
  summarize_at(vars(lifeExp, gdpPercap), list(~mean(.), ~median(.)))
#> # A tibble: 10 x 6
#> # Groups:   continent [5]
#>    continent  year lifeExp_mean gdpPercap_mean lifeExp_median
#>    <fct>     <int>        <dbl>          <dbl>          <dbl>
#>  1 Africa     1952         39.1          1253.           38.8
#>  2 Africa     2007         54.8          3089.           52.9
#>  3 Americas   1952         53.3          4079.           54.7
#>  4 Americas   2007         73.6         11003.           72.9
#>  5 Asia       1952         46.3          5195.           44.9
#>  6 Asia       2007         70.7         12473.           72.4
#>  7 Europe     1952         64.4          5661.           65.9
#>  8 Europe     2007         77.6         25054.           78.6
#>  9 Oceania    1952         69.3         10298.           69.3
#> 10 Oceania    2007         80.7         29810.           80.7
#> # … with 1 more variable: gdpPercap_median <dbl>

如果仅仅关注亚洲,想知道每年最低和最高的lifeExp是多少?

# Asia only: lowest and highest lifeExp among its countries for each year
my_gap %>%
  filter(continent == "Asia") %>%
  group_by(year) %>%
  summarize(min_lifeExp = min(lifeExp), max_lifeExp = max(lifeExp))
#> # A tibble: 12 x 3
#>     year min_lifeExp max_lifeExp
#>    <int>       <dbl>       <dbl>
#>  1  1952        28.8        65.4
#>  2  1957        30.3        67.8
#>  3  1962        32.0        69.4
#>  4  1967        34.0        71.4
#>  5  1972        36.1        73.4
#>  6  1977        31.2        75.4
#>  7  1982        39.9        77.1
#>  8  1987        40.8        78.7
#>  9  1992        41.7        79.4
#> 10  1997        41.8        80.7
#> 11  2002        42.1        82  
#> 12  2007        43.8        82.6

https://stat545.com/dplyr-single.html

最后友情宣传生信技能树

上一篇下一篇

猜你喜欢

热点阅读