R语言基础学习7
2021-06-23 本文已影响0人
7f0a92cda77c
人见人爱 tidyverse
tidyr
dplyr
stringr
ggplot2
data:image/s3,"s3://crabby-images/2b928/2b9285214c734354d9a2a4513d91a180090ae207" alt=""
安装好包
# Point CRAN at the Tsinghua (TUNA) mirror, over https.
options(repos = c(CRAN = "https://mirrors.tuna.tsinghua.edu.cn/CRAN/"))

# Install each package only when it is missing, then attach it.
# requireNamespace() only checks availability; library() does the attaching and
# stops with an error if the package still cannot be loaded, so a freshly
# installed package is attached in the same run.
for (pkg in c("tidyr", "dplyr", "stringr")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  library(pkg, character.only = TRUE)
}
1 数据清理 tidyr
1.1 tidyr:用 gather 把多列合并成“键-值”两列(宽表变长表),
或者用 spread 把“键-值”两列拆分回多列(长表变宽表)
# Toy table in wide format: one row per gene id, one column per sample.
test <- data.frame(
  geneid = sprintf("gene%d", 1:4),
  sample1 = c(1, 4, 7, 10),
  sample2 = c(2, 5, 0.8, 11),
  sample3 = c(0.3, 6, 9, 12)
)
data:image/s3,"s3://crabby-images/bd598/bd5988048862e1ba9bab591c8c23dc1f10abffe6" alt=""
gather
操作
# Wide -> long: stack the selected columns into a name column and a value column.
test_gather <- gather(
  test,
  key = "sample_nm", # new column that receives the stacked columns' names
  value = "exp",     # new column that receives the stacked values
  -geneid            # leading "-" excludes geneid from the stacking
)
这里的 -geneid
表示排除 geneid 这一列:它不参与合并(不进入 key/value),而是作为标识列原样保留
data:image/s3,"s3://crabby-images/c65e9/c65e91a36a319d401879b13a62dc51c6e24ba24d" alt=""
spread
操作-上述的逆操作
# Long -> wide: give each distinct sample_nm value its own column, filled from exp.
test_re <- spread(test_gather, key = "sample_nm", value = "exp")
data:image/s3,"s3://crabby-images/3b5b8/3b5b810b15dc573289bad9a6deb9376f8fa06950" alt=""
1.2 分割和合并
1.2.1 分割 separate
# Example input: a single column x holding comma-separated pairs.
# (Reconstructed from the printed result below; without it `test` has no
# column x at this point and separate() fails.)
test <- data.frame(x = c("a,b", "a,d", "b,c"))
# Split x at each comma into two new columns, X1 and X2.
test_seprate <- separate(test, x, c("X1", "X2"), sep = ",")
X1 X2
1 a b
2 a d
3 b c
1.2.2 合并 unite
# Paste X1 and X2 back into one column named x, joined by "****".
unite(test_seprate, col = "x", X1, X2, sep = "****")
x
1 a****b
2 a****d
3 b****c
1.3 处理NA
# Small table with one missing value planted in each column.
X <- data.frame(X1 = LETTERS[1:5], X2 = 1:5)
X[2, "X2"] <- NA
X[4, "X1"] <- NA

# 1. Drop rows containing NA; naming a column restricts the check to that column.
drop_na(X)
drop_na(X, X1)
drop_na(X, X2)

# 2. Replace NA with a fixed value.
replace_na(X$X2, 0)
2 dplyr
初步探索
2.1 五个基础函数 mutate()
select()
filter()
arrange()
summarise()
2.2 两个实用技能 管道操作 %>%
; count统计某列的unique值
2.3 处理关系数据 - 将2个表进行连接
2.1
数据集
library(dplyr)
# Six rows of iris (1-2, 51-52, 101-102), then reset row names to 1..6.
test <- iris[c(1, 2, 51, 52, 101, 102), ]
row.names(test) <- NULL
data:image/s3,"s3://crabby-images/38bdd/38bdd32d0778eb6f903a0c5a25383c28c5de2f40" alt=""
# 1. mutate(): add a new column computed from existing ones.
test %>%
  mutate(new = Sepal.Length * Sepal.Width)
data:image/s3,"s3://crabby-images/389ca/389caf63e2a586b4cf08f5bddda04109598145da" alt=""
# 2. select(): pick columns.
# (1) By column position.
select(test, 1)
select(test, c(1, 5))
# (2) By column name.
select(test, Sepal.Length)
select(test, Petal.Length, Petal.Width)
# Names stored in a character vector: wrap the vector in all_of(), the
# replacement for the superseded one_of(). It errors if a name is absent.
vars <- c("Petal.Length", "Petal.Width")
select(test, all_of(vars))
一组来自tidyselect的有用函数
# tidyselect helpers: each one picks columns by a different rule.
test %>% select(starts_with("Petal"))     # 1: names that start with "Petal"
test %>% select(ends_with("Width"))       # 2: names that end with "Width"
test %>% select(contains("etal"))         # 3: names that contain "etal"
test %>% select(matches(".t."))           # 4: names that match the regex ".t."
test %>% select(everything())             # 5: all columns
test %>% select(last_col())               # 6: the last column
test %>% select(last_col(offset = 1))     # 7: offset = n picks the nth column from the end
data:image/s3,"s3://crabby-images/1c558/1c558d685d8c0b464dc942d4bb0be212809a2dc2" alt=""
data:image/s3,"s3://crabby-images/ebcac/ebcac46b65f3ca285098b1fcbf9fb068ebf15593" alt=""
data:image/s3,"s3://crabby-images/66e6f/66e6f4f31f7699c578359ad34d2797b43aaf4f2f" alt=""
data:image/s3,"s3://crabby-images/a14d5/a14d52278252cde3cada284ab34a410f95f2e99b" alt=""
data:image/s3,"s3://crabby-images/3ab7b/3ab7b61825a934d2853c3bdab08cd1a2114cd75d" alt=""
data:image/s3,"s3://crabby-images/8f485/8f48539cf41eefe7502c5ec3f646408ea3c496a5" alt=""
data:image/s3,"s3://crabby-images/d8ef8/d8ef8ab8f072096adabc263edb80a93c88738247" alt=""
data:image/s3,"s3://crabby-images/bb43d/bb43d5289e9c796f91f65baa4de70e6e0f3d2ca0" alt=""
# 3. filter(): keep the rows that satisfy the condition(s).
test %>% filter(Species == "setosa")
# Comma-separated conditions are combined with AND.
test %>% filter(Species == "setosa", Sepal.Length > 5)
test %>% filter(Species %in% c("setosa", "versicolor"))
data:image/s3,"s3://crabby-images/40192/40192d1f7d4ba93b353bbc14a12b50308a6624e0" alt=""
# 4. arrange(): sort the whole table by one or more columns.
test %>% arrange(Sepal.Length)        # ascending by default
test %>% arrange(desc(Sepal.Length))  # desc() sorts descending
# Two keys: Sepal.Width descending first, ties ordered by Sepal.Length ascending.
test %>% arrange(desc(Sepal.Width), Sepal.Length)
data:image/s3,"s3://crabby-images/1f3df/1f3dfa048e638393806dd502bfa68e3be7fe0474" alt=""
# 5. summarise(): collapse the table into summary values.
# Here: the mean and standard deviation of Sepal.Length.
test %>%
  summarise(mean(Sepal.Length), sd(Sepal.Length))
# Most useful in combination with group_by().
data:image/s3,"s3://crabby-images/5c167/5c167520c3ab0021cc2c63cb61a2469c1818eb67" alt=""
2.2 两个实用技能 管道操作 %>%
; count统计某列的unique值
# 1: the pipe operator %>% (keyboard shortcut: Cmd/Ctrl + Shift + M).
library(dplyr)
# Step-by-step version: every step stores its result in a new variable.
x1 <- filter(iris, Sepal.Width > 3)
x2 <- select(x1, c("Sepal.Length", "Sepal.Width"))
x3 <- arrange(x2, Sepal.Length)
这些可以直接使用管道符传送
# filter -> select -> arrange chained with %>%, with no intermediate variables.
iris %>%
  filter(Sepal.Width > 3) %>%
  select(Sepal.Length, Sepal.Width) %>%
  arrange(Sepal.Length)
# 2: count(): number of rows for each distinct value of a column.
test %>% count(Species)
Species n
1 setosa 2
2 versicolor 2
3 virginica 2
2.3 处理关系数据 - 将2个表进行连接
# Base R merge(): join two tables on a key column.
# NOTE(review): test1, test2 and test3 are not defined in this text; presumably
# they are the tables shown in the accompanying screenshot - confirm.
# Key column is called "name" in both tables.
merge(x = test1, y = test2, by = "name")
# Key column is "name" in test1 and "NAME" in test3.
merge(x = test1, y = test3, by.x = "name", by.y = "NAME")
data:image/s3,"s3://crabby-images/e7d3f/e7d3f016020241629bed70e74dd2476d404ac664" alt=""
# 1. inner_join(): keep only the rows whose key appears in both tables.
inner_join(x = test1, y = test2, by = "name")
# Differently named keys: c("key in x" = "key in y").
inner_join(x = test1, y = test3, by = c("name" = "NAME"))
data:image/s3,"s3://crabby-images/ea40f/ea40ff4f56c4e2f0b79bcf9830dee6017f436ff5" alt=""
# 2. left_join(): keep every row of the first table.
left_join(x = test1, y = test2, by = "name")
left_join(x = test2, y = test1, by = "name")
# 3. full_join(): keep every row of both tables.
full_join(x = test1, y = test2, by = "name")
# 4. semi_join(): return the rows of x that have a match in y.
semi_join(test1, test2, by = "name")
# 5. anti_join(): return the rows of x that have no match in y.
anti_join(test2, test1, by = "name")
#6.数据的简单合并
#bind_rows() 和 bind_cols() 相当于 base 包里的 rbind() 和 cbind();注意:bind_cols() 要求两个数据框行数相同;bind_rows() 按列名匹配,两个表的列不一致时,缺失的位置用 NA 填充(要求列一致的是 base 的 rbind())
data:image/s3,"s3://crabby-images/da448/da448579059853bb907cf6ab5ca5d0ebb1801a01" alt=""
https://www.rstudio.com/resources/cheatsheets/
参照Jimmy团队的生信入门课程,不涉及任何利益冲突