学习小组Day5笔记--尹露茜

2019-07-06 本文已影响0人尹露茜

新手教程撸着撸着
感觉来了一点

（以下知识点均来源于生信星球）
注：#%>%是管道操作符，将前一个函数的结果作为后一个函数的操作对象（即第一个参数），可以少写重复的代码。

今日主题两个包

tidyr

dplyr

tidyr的功能

（1）数据框的变形
（2）处理数据框中的空值
（3）根据一个表格衍生出其他表格
（4）实现行或列的分割和合并

建个表格

data.frame

a <- data.frame(country=c("A","B","C"),"1999"=paste(c (0.7,37,212),"K"),"2000"=paste(c(2,80,213),"K"))

重塑数据

reshape data ---gather/spread

gather(data,key,value,…,na.rm=FALSE, convert=FALSE, factor_key = FALSE)
spread(data,key, value, fill = NA, convert = FALSE, drop = TRUE, sep = NULL)

处理丢失的数据

drop_na(data,...) #有空值的删除整行
fill(data,..., direction = c("down", "up")) #根据上下文蒙一个
replace_na(data, replace = list(),...) #同一列的空值填上同一个数

分割合并

separate #按分隔符把一列拆分成多列
separate_rows #按分隔符把一列拆分成多行
unite #把多列合并成一列（可用于把拆分后的列再合并回去）

separate(data,col,into,sep = "[^[:alnum:]]+", remove = TRUE, convert = FALSE, ...)
separate_rows(data,...,sep = "[^[:alnum:].]+", convert = FALSE)
unite(data,col, ..., sep = "_", remove = TRUE)

dplyr的功能

对表格进行操作，操作格式必须是tidy data

arrange #排序

arrange(frame1,geneid) #这是刚才准备测试数据时写下的
arrange(frame1,geneid,expression)#在按col1排序的基础上，按col2排序
arrange(frame1,geneid,desc(expression))

filter #筛选

filter(frame1,expression>3)
filter(frame1,expression>3|geneid=="gene2")   #注意：| 表示or（或者）。
filter(frame1,expression>3) %>% arrange(expression)  #筛选后排序

distinct #去除重复行

distinct(frame1,geneid)
distinct(frame1,geneid,Sampleid)

select #按列筛选

select(frame3,geneid,expression)  #选择特定两列
select(frame3,-Sampleid) #反选
select(frame3,contains("n"))  #列名包含n的列
select(frame3,starts_with("a"))  #以a开头的列

mutate #根据原有的列生成新的列

mutate(frame3,E=expression *10)  #生成新列E是expression列值的10倍
mutate(frame3,E=expression*10) %>% select(-expression)  #生成新列后去掉原有的expression列
mutate(frame3,id=paste("ath",geneid,sep = "_")) %>% select(-geneid) #在列添加前缀ath
mutate(frame3,id=paste("ath",geneid,sep = "_")) %>% select(id,Sampleid,expression)  #和上一行本质上是一样的

summarise #对数据进行简单统计

frame3 %>% group_by(geneid)%>%summarise(avg=mean(expression))#按照geneid分组并求平均值（更有意义）

bind_rows #表格拼接（按行拼接）

rbind(frame1,frame4)
frame1 %>%bind_rows(frame4)

交集、并集、全集（union_all：不去除重复行的并集）

intersect(frame1,frame4)
union(frame1,frame4)
union_all(frame1,frame4)

关联

right_join(frame1,frame2) #右连接--把表1添加到表2
inner_join(frame1,frame2) #内连接--只保留两个表格共有的行
by=c("col1"="col2") #当两个表格中用于关联的列名不一样时，需要在连接函数的括号内加上这个参数；col1和col2分别是第一个表格和第二个表格中用于关联的列名
semi_join #只保留第一个表格中id在第二个表格里出现的行
anti_join #只保留第一个表格中id不在第二个表格里出现的行

屏幕快照 2019-07-06 上午10.29.14.png

屏幕快照 2019-07-06 上午10.28.09.png