生信中级10题
2020-05-24 本文已影响0人
javen_spring
根据R包org.Hs.eg.db找到下面ensembl 基因ID 对应的基因名(symbol)
genes <- c("ENSG00000000003.13","ENSG00000000005.5","ENSG00000000419.11","ENSG00000000457.12","ENSG00000000460.15","ENSG00000000938.11")
library(org.Hs.eg.db)
options(stringsAsFactors = F)
ls("package:org.Hs.eg.db")
ids <- toTable(org.Hs.egENSEMBL2EG)
#等同于ids <- toTable(org.Hs.egENSEMBL)
ids2 <- toTable(org.Hs.egSYMBOL)
str(ids) #查看提取出的数据框变量的特征
str(ids2)
ids$gene_id <- as.numeric(ids$gene_id) #避免合并后gene_id出现错误
ids2$gene_id <- as.numeric(ids2$gene_id)#避免合并后gene_id出现错误
ids_t <- merge(ids,ids2,by="gene_id")
library(stringr)
genes <- str_match(genes,"([:alnum:]*)[.][:digit:]")[,2]
match_results <- ids_t[ids_t$ensembl_id%in%genes,]
match_results
rm(list=ls()) #一键清除
match_results
根据R包hgu133a.db找到下面探针对应的基因名(symbol)
probe_ids <- c('1053_at', '117_at','121_at',"1255_g_at","1316_at","1320_at","1405_i_at","1431_at","1438_at","1487_at","1494_f_at","1598_g_at","160020_at","1729_at","177_at")
library(hgu133a.db)
ls("package:hgu133a.db")
ids <- toTable(hgu133aSYMBOL)
symbols_res <- ids[ids$probe_id%in%probe_ids,]
symbols_res
rm(list=ls()) #一键清除
symbols_res
找到R包CLL内置的数据集的表达矩阵里面的TP53基因的表达量,并且绘制在 progres.-stable分组的boxplot图
- 方法一:
library(CLL)
?CLL #查看帮助文档
data(sCLLex)
class(sCLLex) #查看sCLLex是什么对象,决定了怎么提取其中的数据
eSet <- sCLLex
rm(sCLLex)
options(stringsAsFactors = F)
exprSet <- exprs(eSet)
pdata <- pData(eSet)
str(pdata) #查看数据框变量特征后知道pdata$Disease为因子,但stable不是对照水平
group_list <- relevel(pdata$Disease,ref = "stable") #用relevel()将stable设定为对照水平
gpl <- eSet@annotation #用@来取出eSet对象中的注释数据
if (!require(hgu95av2.db)) BiocManager::install("hgu95av2.db")
library(hgu95av2.db)
ls("package:hgu95av2.db")
ids <- toTable(hgu95av2SYMBOL)
expr_df <- as.data.frame(exprSet) #将表达矩阵转为数据框
expr_df$median <- apply(expr_df,1,median) #将数据框添加median一列
expr_df$probe_id <- rownames(expr_df) #添加probe_id一列
expr_df <- merge(expr_df,ids,by="probe_id") #将expr_df与ids按照probe_id进行合并
expr_df <- expr_df[order(expr_df$symbol,expr_df$median,decreasing = T),] #将symbol排序后,再将median从大到小排序
expr_df <- expr_df[!duplicated(expr_df$symbol),] #根据symbol去重复
rownames(expr_df) <- expr_df$symbol
expr_df <- expr_df[,-c(1,ncol(expr_df)-1,ncol(expr_df))]
identical(colnames(expr_df),rownames(pdata)) #确定表达矩阵中的列名和pdata中的行名是否一致
ex_TP53 <- expr_df[rownames(expr_df)== "TP53",]
library(ggpubr)
library(reshape2)
ex_TP53 <- melt(ex_TP53,variable.name = "variable",value = "value")
str(ex_TP53)
df <- ex_TP53
identical(df$variable,colnames(expr_df))
df$group <- group_list
names(df)[2] <- "expression"
p <- ggboxplot(df, x = "group", y = "expression",
color = "group", palette =c("#00AFBB", "#E7B800", "#FC4E07"),
add = "jitter", shape = "group")+
stat_compare_means(label.y = 4.0,method = "t.test")
p
ggpubr结果
- 方法二:
suppressPackageStartupMessages(library(CLL))
data(sCLLex)
sCLLex #直接查看数据集(S4对象)中的信息
exprSet=exprs(sCLLex)
##sCLLex是依赖于CLL这个package的一个对象
samples=sampleNames(sCLLex)
samples #查看samples
pdata=pData(sCLLex)
pdata
group_list=as.character(pdata[,2]) #pdata[,2]提取后为因子,需转为字符串
dim(exprSet) #查看基因表达矩阵的维度,即行数及列数(探针数及样本数)
exprSet[1:5,1:5] #当基因集较大时,查看部分数据可减少内存消耗
#BiocManager::install('hgu95av2.db') #安装注释包
library(hgu95av2.db)
ls("package:hgu95av2.db") #查看注释包的内容
ids=toTable(hgu95av2SYMBOL) #toTable可将Bimap对象转为数据框结构
save(ids,exprSet,pdata,file = 'input.Rdata')
length(unique(ids$symbol)) #查看共有多少个symbol,一般有多个探针对应一个symbol
tail(sort(table(ids$symbol))) ##查看基因对应最多多少个探针
table(sort(table(ids$symbol))) #显示总的symbol对应的探针数量并table
plot(table(sort(table(ids$symbol))))
table(rownames(exprSet) %in% ids$probe_id) #注意两个数据集的先后顺序,%in%表示前者中的变量与后者一一比较,返回布尔值
dim(exprSet)
exprSet=exprSet[rownames(exprSet) %in% ids$probe_id,] #取得与注释包交集后的表达矩阵
dim(exprSet)
ids=ids[match(rownames(exprSet),ids$probe_id),] #match相当于%in%,可以匹配返回逻辑值,也可以将前者数据的顺序匹配给后者
head(ids)
exprSet[1:5,1:5]
identical(ids$probe_id,rownames(exprSet)) #identical判断两个对象是否一致
dat=exprSet
ids$median=apply(dat,1,median) #ids新建median这一列,列名为median,同时对dat这个矩阵按行操作,取每一行的中位数,将结果给到median这一列的每一行
ids=ids[order(ids$symbol,ids$median,decreasing = T),]#先对ids$symbol排序,在此基础上按照ids$median中位数从大到小排列的顺序排序,将对应的行赋值为一个新的ids
ids=ids[!duplicated(ids$symbol),]#将symbol这一列取取出重复项,'!'为否,即取出不重复的项,去除重复的gene ,保留每个基因最大表达量结果
dat=dat[ids$probe_id,] #新的ids取出probe_id这一列,将dat按照取出的这一列中的每一行组成一个新的dat
rownames(dat)=ids$symbol#把ids的symbol这一列中的每一行给dat作为dat的行名
dat[1:4,1:4] #保留每个基因ID第一次出现的信息
dim(dat)
df <- dat[rownames(dat)== "TP53",]
identical(colnames(dat),rownames(pdata))
df <- data.frame(expression = df,group = group_list)
library(ggpubr)
p <- ggboxplot(df, x = "group", y = "expression",
color = "group", palette =c("#00AFBB", "#E7B800", "#FC4E07"),
add = "jitter", shape = "group")+
stat_compare_means(label.y = 4.0,method = "t.test") +
ylab("relative expression of TP53")
p
与第一种方法相比,第二种方法的组别应转为因子,并设定对照stable
- 法三:使用dplyr包
library(dplyr)
data(sCLLex)
class(sCLLex) #查看sCLLex是什么对象,决定了怎么提取其中的数据
eSet <- sCLLex
rm(sCLLex)
exprSet <- exprs(eSet)
pdata <- pData(eSet)
str(pdata) #查看数据框变量特征后知道pdata$Disease为因子,但stable不是对照水平
group_list <- relevel(pdata$Disease,ref = "stable") #用relevel()将stable设定为对照水平
gpl <- eSet@annotation #用@来取出eSet对象中的注释数据
if (!require(hgu95av2.db)) BiocManager::install("hgu95av2.db")
library(hgu95av2.db)
ls("package:hgu95av2.db")
ids <- toTable(hgu95av2SYMBOL)
dat <- exprSet%>%
as.data.frame()%>%
mutate(probe_id=rownames(exprSet),median = apply(exprSet,1,median))
identical(dat$probe_id,rownames(exprSet))
dat <- merge(dat,ids,by = "probe_id")
dat <- dat[order(dat$symbol,dat$median,decreasing= "T"),]
table(sort(table(dat$symbol)))
dat <- dat[!duplicated(dat$symbol),]
rownames(dat) <- dat$symbol
dat <- dat[,-c(1,ncol(dat)-1,ncol(dat))]
identical(colnames(dat),rownames(pdata))
df <- as.numeric(dat[rownames(dat)== "TP53",])
df <- data.frame(expression = df,group = group_list)
library(ggpubr)
p <- ggboxplot(df, x = "group", y = "expression",
color = "group", palette =c("#00AFBB", "#E7B800", "#FC4E07"),
add = "jitter", shape = "group")+
stat_compare_means(label.y = 4.0,method = "t.test") +
ylab("relative expression of TP53")
p
第三种方法ggpubr作图
找到BRCA1基因在TCGA数据库的乳腺癌数据集(Breast Invasive Carcinoma (TCGA, PanCancer Atlas))的表达情况
因网络问题,暂无解答。提示:使用http://www.cbioportal.org/index.do 定位数据集:http://www.cbioportal.org/datasets
找到TP53基因在TCGA数据库的乳腺癌数据集的表达量分组看其是否影响生存
提示使用:http://www.oncolnc.org/ 数据库
下载数据集GSE17215的表达矩阵并且提取下面的基因画热图
genes <- c("ACTR3B", "ANLN","BAG1","mBCL2","BIRC5", "BLVRA", "CCNB1", "CCNE1", "CDC20", "CDC6", "CDCA1", "CDH3", "CENPF", "CEP55", "CXXC5", "EGFR", "ERBB2", "ESR1", "EXO1", "FGFR4", "FOXA1", "FOXC1", "GPR160", "GRB7", "KIF2C", "KNTC2", "KRT14", "KRT17", "KRT5", "MAPT", "MDM2", "MELK", "MIA", "MKI67", "MLPH", "MMP11", "MYBL2", "MYC", "NAT1", "ORC6L", "PGR", "PHGDH", "PTTG1", "RRM2", "SFRP1", "SLC39A6", "TMEM45B", "TYMS", "UBE2C", "UBE2T")
rm(list=ls())
library(GEOquery)
library(stringr)
library(AnnoProbe)
library(ggplot2)
gse = "GSE17215"
if(require(AnnoProbe)){
if(!file.exists(paste0(gse,"_eSet.Rdata"))) geoChina(gse)
load(paste0(gse,"_eSet.Rdata"))
eSet <- gset
rm(gset)
}else{
eSet <- getGEO(gse,
destdir = '.',
getGPL = F)
}
#(1)提取表达矩阵exp
exp <- exprs(eSet[[1]])
exp[1:4,1:4]
exp = log2(exp+1)
boxplot(exp,cex.axis = 0.75,las = 2,col = rainbow(6))
#(2)提取临床信息
pd <- pData(eSet[[1]])
#(3)提取芯片平台编号
gpl <- eSet[[1]]@annotation
p = identical(rownames(pd),colnames(exp));p
if(!p) exp = exp[,match(rownames(pd),colnames(exp))]
#(4)分组信息:
group_list <- ifelse(str_detect(pd$title,"paclitaxel"),"paclitaxel","salinomycin")
group_list=factor(group_list,levels = c("paclitaxel","salinomycin"))
#(5)下载注释信息:
if (!require(hthgu133a.db)) BiocManager::install("hthgu133a.db")
library(hthgu133a.db)
ls("package:hthgu133a.db")
ids <- toTable(hthgu133aSYMBOL)
#(6)数据处理:
table(rownames(exp)%in%ids$probe_id)
exp <- exp[rownames(exp)%in%ids$probe_id,]
identical(ids$probe_id,rownames(exp))
exp <- as.data.frame(exp)
exp$probe_id <- rownames(exp)
ids$median <- apply(exp,1,median)
ids <- ids[order(ids$symbol,ids$median,decreasing = T),]
ids <- ids[!duplicated(ids$symbol),]
exp <- merge(exp,ids,by="probe_id")
exp <- exp[match(ids$probe_id,exp$probe_id),]
identical(ids$probe_id,exp$probe_id)
rownames(exp) <- exp$symbol
exp <- exp[,-c(1,ncol(exp)-1,ncol(exp))]
dat <- exp[rownames(exp)%in%genes,]
dat <- as.matrix(dat)
library(RColorBrewer)
library(pheatmap)
identical(rownames(pd),colnames(dat))
annotation_col=data.frame(group=group_list)
rownames(annotation_col)=colnames(dat)
pheatmap(dat,scale = "row",
cluster_cols = T,
cluster_rows = T,
show_rownames = T,
show_colnames = F,
color = colorRampPalette(rev(brewer.pal(n = 7, name =
"RdYlBu")))(100),
annotation_col = annotation_col,
fontsize = 8, #调整图片中文本的大小
legend = T)
下载数据集GSE24673的表达矩阵计算样本的相关性并且绘制热图,需要标记上样本分组信息
rm(list=ls())
library(GEOquery)
library(stringr)
library(AnnoProbe)
library(ggplot2)
gse = "GSE24673"
if(require(AnnoProbe)){
if(!file.exists(paste0(gse,"_eSet.Rdata"))) geoChina(gse)
load(paste0(gse,"_eSet.Rdata"))
eSet <- gset
rm(gset)
}else{
eSet <- getGEO(gse,
destdir = '.',
getGPL = F)
}
#(1)提取表达矩阵exp
exp <- exprs(eSet[[1]])
exp[1:4,1:4]
#exp = log2(exp+1)
ncol(exp)
boxplot(exp,cex.axis = 0.75,las = 2,col = rainbow(11))
#(2)提取临床信息
pd <- pData(eSet[[1]])
#(3)提取芯片平台编号
gpl <- eSet[[1]]@annotation
p = identical(rownames(pd),colnames(exp));p
if(!p) exp = exp[,match(rownames(pd),colnames(exp))]
#(4)分组信息:
group_list <- c(rep(c("139","535","984"),each = 3),rep("normal",2))
group_list=factor(group_list,levels = c("normal","139","535","984"))
#(5)下载注释信息:
if (!require(hugene10sttranscriptcluster.db)) BiocManager::install("hugene10sttranscriptcluster.db")
library(hugene10sttranscriptcluster.db)
ls("package:hugene10sttranscriptcluster.db")
ids <- toTable(hugene10sttranscriptclusterSYMBOL)
#(6)数据处理:
table(rownames(exp)%in%ids$probe_id)
exp <- exp[rownames(exp)%in%ids$probe_id,]
ids <- ids[ids$probe_id%in%rownames(exp),]
identical(ids$probe_id,rownames(exp))
ids$median <- apply(exp,1,median)
ids <- ids[order(ids$symbol,ids$median,decreasing = T),]
ids <- ids[!duplicated(ids$symbol),]
exp <- as.data.frame(exp)
exp$probe_id <- rownames(exp)
exp <- merge(exp,ids,by="probe_id")
exp <- exp[match(ids$probe_id,exp$probe_id),]
identical(ids$probe_id,exp$probe_id)
rownames(exp) <- exp$symbol
exp <- exp[,-c(1,ncol(exp)-1,ncol(exp))]
#画PCA图
dat=as.data.frame(t(exp))
library(FactoMineR)#画主成分分析图需要加载这两个包
library(factoextra)
#pca的统一操作走起
dat.pca <- PCA(dat, graph = FALSE)
pca_plot <- fviz_pca_ind(dat.pca,
geom.ind = "point", # show points only (nbut not "text")
col.ind = group_list, # color by groups
#palette = c("#00AFBB", "#E7B800"),
addEllipses = TRUE, # Concentration ellipses
legend.title = "Groups"
)
pca_plot #浓度椭圆
pca_plot <- fviz_pca_ind(dat.pca,
geom.ind = "point", # show points only (nbut not "text")
col.ind = group_list, # color by groups
#palette = c("#00AFBB", "#E7B800"),
addEllipses = TRUE,
ellipse.type = "confidence",
legend.title = "Groups"
)
pca_plot #置信椭圆
#参见:http://www.sthda.com/english/articles/31-principal-component-methods-in-r-practical-guide/112-pca-principal-component-analysis-essentials/
#表达矩阵热图:
library(RColorBrewer)
library(pheatmap)
dat=as.data.frame(t(dat))
identical(rownames(pd),colnames(dat))
annotation_col=data.frame(group=group_list)
rownames(annotation_col)=colnames(dat)
pheatmap(dat,scale = "row",
cluster_cols = T,
cluster_rows = T,
show_rownames = F,
show_colnames = F,
color = colorRampPalette(rev(brewer.pal(n = 7, name =
"RdYlBu")))(100),
annotation_col = annotation_col,
fontsize = 8,#调整图片中文本的大小
legend = T)
#样本相关性热图:
M <- cor(exp)
pheatmap::pheatmap(M,annotation_col = annotation_col) #(存疑)
PCA
表达矩阵热图
样本相关性矩阵
找到 GPL6244 platform of Affymetrix Human Gene 1.0 ST Array 对应的R的bioconductor注释包,并且安装它!
GPL6244if (!require(hugene10sttranscriptcluster.db)) BiocManager::install("hugene10sttranscriptcluster.db")
library(hugene10sttranscriptcluster.db)
ls("package:hugene10sttranscriptcluster.db")
ids <- toTable(hugene10sttranscriptclusterSYMBOL)
下载数据集GSE42872的表达矩阵,并且分别挑选出 所有样本的(平均表达量/sd/mad/)最大的探针,并且找到它们对应的基因。
(注释包:hugene10sttranscriptcluster.db)
rm(list=ls())
#数据下载
options(stringsAsFactors = F)
library(GEOquery)
gse = "GSE42872"
if(require(AnnoProbe)){
if(!file.exists(paste0(gse,"_eSet.Rdata"))) geoChina(gse)
load(paste0(gse,"_eSet.Rdata"))
eSet <- gset
rm(gset)
}else{
eSet <- getGEO(gse,
destdir = '.',
getGPL = F)
}
#(1)提取表达矩阵exp
exp <- exprs(eSet[[1]])
exp[1:4,1:4]
#exp = log2(exp+1)
#(2)提取临床信息
pd <- pData(eSet[[1]])
#(3)调整pd的行名顺序与exp列名完全一致
p = identical(rownames(pd),colnames(exp));p
if(!p) exp = exp[,match(rownames(pd),colnames(exp))]
#(4)提取芯片平台编号
gpl <- eSet[[1]]@annotation
save(gse,pd,exp,gpl,file = paste0(gse,"step1output.Rdata"))
#group_list(实验分组)和ids(芯片注释),每次都需要改
rm(list = ls())
load(file = "GSE42872step1output.Rdata")
group_list=ifelse(str_detect(pd$title,"Control"),"control","treat")
#设置参考水平,对照在前,处理在后
group_list = factor(group_list,
levels = c("control","treat"))
#ids BioconductorR包
gpl
#http://www.bio-info-trainee.com/1399.html
if(!require(hugene10sttranscriptcluster.db))BiocManager::install("hugene10sttranscriptcluster.db")
library(hugene10sttranscriptcluster.db)
ls("package:hugene10sttranscriptcluster.db")
ids <- toTable(hugene10sttranscriptclusterSYMBOL)
head(ids)
save(exp,group_list,ids,file = paste0(gse,"step2output.Rdata"))
#数据处理:
table(rownames(exp)%in%ids$probe_id)
exp <- exp[rownames(exp)%in%ids$probe_id,]
ids <- ids[ids$probe_id%in%rownames(exp),]
identical(ids$probe_id,rownames(exp))
ids$median <- apply(exp,1,median) #中位数
ids$sd <- apply(exp,1,sd) #标准差
ids$mad <- apply(exp,1,mad) #中位方差
ids_median <- ids[order(ids$median,decreasing = F),]
median_max <- ids_median[1,]
ids_sd <- ids[order(ids$sd,decreasing = T),]
sd_max <- ids_sd[1,];sd_max
ids_mad <- ids[order(ids$mad,decreasing = T),]
mad_max <- ids_mad[1,];mad_max
所有样本的(平均表达量/sd/mad/)最大的探针及基因名
下载数据集GSE42872的表达矩阵,并且根据分组使用limma做差异分析,得到差异结果矩阵
load(file = "GSE42872step1output.Rdata")
load("GSE42872step2output.Rdata")
table(rownames(exp)%in%ids$probe_id)
exp <- exp[rownames(exp)%in%ids$probe_id,]
identical(rownames(exp),ids$probe_id)
exp <- as.data.frame(exp)
exp$median <- apply(exp,1,median)
exp$probe_id <- rownames(exp)
exp_m <- merge(exp,ids,by="probe_id")
exp_m <- exp_m[order(exp_m$symbol,exp_m$median,decreasing = T),]
exp_m <- exp_m[!duplicated(exp_m$symbol),]
rownames(exp_m) <- exp_m$symbol
exp_m <- exp_m[,-c(1,ncol(exp_m)-1,ncol(exp_m))]
library(limma)
design=model.matrix(~group_list)
fit=lmFit(exp_m,design)
fit=eBayes(fit)
deg=topTable(fit,coef=2,number = Inf)
save(exp_m,deg,file="GSE42872limma.Rdata")