生信

生信中级10题

2020-05-24  本文已影响0人  javen_spring

根据R包org.Hs.eg.db找到下面ensembl 基因ID 对应的基因名(symbol)

genes <- c("ENSG00000000003.13","ENSG00000000005.5","ENSG00000000419.11","ENSG00000000457.12","ENSG00000000460.15","ENSG00000000938.11")

library(org.Hs.eg.db)
options(stringsAsFactors = F)
ls("package:org.Hs.eg.db")
ids <- toTable(org.Hs.egENSEMBL2EG)
#等同于ids <- toTable(org.Hs.egENSEMBL)
ids2 <- toTable(org.Hs.egSYMBOL)
str(ids) #查看提取出的数据框变量的特征
str(ids2)
ids$gene_id <- as.numeric(ids$gene_id)  #避免合并后gene_id出现错误
ids2$gene_id <- as.numeric(ids2$gene_id)#避免合并后gene_id出现错误
ids_t <- merge(ids,ids2,by="gene_id")

library(stringr)
genes <- str_match(genes,"([:alnum:]*)[.][:digit:]")[,2]
match_results <- ids_t[ids_t$ensembl_id%in%genes,]
match_results

rm(list=ls())  #一键清除
match_results

根据R包hgu133a.db找到下面探针对应的基因名(symbol)

probe_ids <- c('1053_at', '117_at','121_at',"1255_g_at","1316_at","1320_at","1405_i_at","1431_at","1438_at","1487_at","1494_f_at","1598_g_at","160020_at","1729_at","177_at")

library(hgu133a.db)
ls("package:hgu133a.db")
ids <- toTable(hgu133aSYMBOL)
symbols_res <- ids[ids$probe_id%in%probe_ids,]
symbols_res

rm(list=ls())  #一键清除
symbols_res

找到R包CLL内置的数据集的表达矩阵里面的TP53基因的表达量,并且绘制在 progres.-stable分组的boxplot图

library(CLL)
?CLL  #查看帮助文档
data(sCLLex)
class(sCLLex)  #查看sCLLex是什么对象,决定了怎么提取其中的数据
eSet <- sCLLex
rm(sCLLex)
options(stringsAsFactors = F)
exprSet <- exprs(eSet)
pdata <- pData(eSet)
str(pdata)  #查看数据框变量特征后知道pdata$Disease为因子,但stable不是对照水平
group_list <- relevel(pdata$Disease,ref = "stable")  #用relevel()将stable设定为对照水平

gpl <- eSet@annotation  #用@来取出eSet对象中的注释数据
if (!require(hgu95av2.db)) BiocManager::install("hgu95av2.db")
library(hgu95av2.db)
ls("package:hgu95av2.db")
ids <- toTable(hgu95av2SYMBOL)

expr_df <- as.data.frame(exprSet) #将表达矩阵转为数据框
expr_df$median <- apply(expr_df,1,median)  #将数据框添加median一列
expr_df$probe_id <- rownames(expr_df) #添加probe_id一列
expr_df <- merge(expr_df,ids,by="probe_id")  #将expr_df与ids按照probe_id进行合并

expr_df <- expr_df[order(expr_df$symbol,expr_df$median,decreasing = T),]  #将symbol排序后,再将median从大到小排序
expr_df <- expr_df[!duplicated(expr_df$symbol),]  #根据symbol去重复

rownames(expr_df) <- expr_df$symbol
expr_df <- expr_df[,-c(1,ncol(expr_df)-1,ncol(expr_df))]

identical(colnames(expr_df),rownames(pdata))  #确定表达矩阵中的列名和pdata中的行名是否一致

ex_TP53 <- expr_df[rownames(expr_df)== "TP53",]

library(ggpubr)
library(reshape2)
ex_TP53 <- melt(ex_TP53,variable.name = "variable",value = "value")
str(ex_TP53)
df <- ex_TP53
identical(df$variable,colnames(expr_df))
df$group <- group_list
names(df)[2] <- "expression"
p <- ggboxplot(df, x = "group", y = "expression",
               color = "group", palette =c("#00AFBB", "#E7B800", "#FC4E07"),
               add = "jitter", shape = "group")+
  stat_compare_means(label.y = 4.0,method = "t.test")  
p
ggpubr结果
suppressPackageStartupMessages(library(CLL))
data(sCLLex)
sCLLex  #直接查看数据集(S4对象)中的信息
exprSet=exprs(sCLLex)   
##sCLLex是依赖于CLL这个package的一个对象
samples=sampleNames(sCLLex)
samples  #查看samples
pdata=pData(sCLLex)
pdata
group_list=as.character(pdata[,2]) #pdata[,2]提取后为因子,需转为字符串
dim(exprSet)  #查看基因表达矩阵的维度,即行数及列数(探针数及样本数)
exprSet[1:5,1:5]  #当基因集较大时,查看部分数据可减少内存消耗

#BiocManager::install('hgu95av2.db')  #安装注释包
library(hgu95av2.db)
ls("package:hgu95av2.db") #查看注释包的内容

ids=toTable(hgu95av2SYMBOL)  #toTable可将Bimap对象转为数据框结构
save(ids,exprSet,pdata,file = 'input.Rdata')
length(unique(ids$symbol))  #查看共有多少个symbol,一般有多个探针对应一个symbol
tail(sort(table(ids$symbol)))  ##查看基因对应最多多少个探针
table(sort(table(ids$symbol)))  #显示总的symbol对应的探针数量并table

plot(table(sort(table(ids$symbol))))

table(rownames(exprSet) %in% ids$probe_id)  #注意两个数据集的先后顺序,%in%表示前者中的变量与后者一一比较,返回布尔值
dim(exprSet)
exprSet=exprSet[rownames(exprSet) %in% ids$probe_id,]  #取得与注释包交集后的表达矩阵
dim(exprSet)

ids=ids[match(rownames(exprSet),ids$probe_id),]  #match相当于%in%,可以匹配返回逻辑值,也可以将前者数据的顺序匹配给后者
head(ids)
exprSet[1:5,1:5]

identical(ids$probe_id,rownames(exprSet))  #identical判断两个对象是否一致
dat=exprSet
ids$median=apply(dat,1,median) #ids新建median这一列,列名为median,同时对dat这个矩阵按行操作,取每一行的中位数,将结果给到median这一列的每一行
ids=ids[order(ids$symbol,ids$median,decreasing = T),]#先对ids$symbol排序,在此基础上按照ids$median中位数从大到小排列的顺序排序,将对应的行赋值为一个新的ids
ids=ids[!duplicated(ids$symbol),]#将symbol这一列取取出重复项,'!'为否,即取出不重复的项,去除重复的gene ,保留每个基因最大表达量结果
dat=dat[ids$probe_id,] #新的ids取出probe_id这一列,将dat按照取出的这一列中的每一行组成一个新的dat
rownames(dat)=ids$symbol#把ids的symbol这一列中的每一行给dat作为dat的行名
dat[1:4,1:4]  #保留每个基因ID第一次出现的信息
dim(dat)

df <- dat[rownames(dat)== "TP53",]
identical(colnames(dat),rownames(pdata))
df <- data.frame(expression = df,group = group_list)

library(ggpubr)
p <- ggboxplot(df, x = "group", y = "expression",
               color = "group", palette =c("#00AFBB", "#E7B800", "#FC4E07"),
               add = "jitter", shape = "group")+
  stat_compare_means(label.y = 4.0,method = "t.test") +
  ylab("relative expression of TP53")
p
与第一种方法相比,第二种方法的组别应转为因子,并设定对照stable
library(dplyr)
data(sCLLex)
class(sCLLex)  #查看sCLLex是什么对象,决定了怎么提取其中的数据
eSet <- sCLLex
rm(sCLLex)
exprSet <- exprs(eSet)
pdata <- pData(eSet)
str(pdata)  #查看数据框变量特征后知道pdata$Disease为因子,但stable不是对照水平
group_list <- relevel(pdata$Disease,ref = "stable")  #用relevel()将stable设定为对照水平

gpl <- eSet@annotation  #用@来取出eSet对象中的注释数据
if (!require(hgu95av2.db)) BiocManager::install("hgu95av2.db")
library(hgu95av2.db)
ls("package:hgu95av2.db")
ids <- toTable(hgu95av2SYMBOL)

dat <- exprSet%>%
  as.data.frame()%>%
  mutate(probe_id=rownames(exprSet),median = apply(exprSet,1,median))
identical(dat$probe_id,rownames(exprSet))
dat <- merge(dat,ids,by = "probe_id")
dat <- dat[order(dat$symbol,dat$median,decreasing= "T"),]
table(sort(table(dat$symbol)))
dat <- dat[!duplicated(dat$symbol),]
rownames(dat) <- dat$symbol
dat <- dat[,-c(1,ncol(dat)-1,ncol(dat))]

identical(colnames(dat),rownames(pdata))
df <- as.numeric(dat[rownames(dat)== "TP53",])
df <- data.frame(expression = df,group = group_list)

library(ggpubr)
p <- ggboxplot(df, x = "group", y = "expression",
               color = "group", palette =c("#00AFBB", "#E7B800", "#FC4E07"),
               add = "jitter", shape = "group")+
  stat_compare_means(label.y = 4.0,method = "t.test") +
  ylab("relative expression of TP53")
p
第三种方法ggpubr作图

找到BRCA1基因在TCGA数据库的乳腺癌数据集(Breast Invasive Carcinoma (TCGA, PanCancer Atlas))的表达情况

因网络问题,暂无解答。提示:使用http://www.cbioportal.org/index.do 定位数据集:http://www.cbioportal.org/datasets

找到TP53基因在TCGA数据库的乳腺癌数据集的表达量分组看其是否影响生存

提示使用:http://www.oncolnc.org/ 数据库

下载数据集GSE17215的表达矩阵并且提取下面的基因画热图

genes <- c("ACTR3B", "ANLN","BAG1","mBCL2","BIRC5", "BLVRA", "CCNB1", "CCNE1", "CDC20", "CDC6", "CDCA1", "CDH3", "CENPF", "CEP55", "CXXC5", "EGFR", "ERBB2", "ESR1", "EXO1", "FGFR4", "FOXA1", "FOXC1", "GPR160", "GRB7", "KIF2C", "KNTC2", "KRT14", "KRT17", "KRT5", "MAPT", "MDM2", "MELK", "MIA", "MKI67", "MLPH", "MMP11", "MYBL2", "MYC", "NAT1", "ORC6L", "PGR", "PHGDH", "PTTG1", "RRM2", "SFRP1", "SLC39A6", "TMEM45B", "TYMS", "UBE2C", "UBE2T")
rm(list=ls())
library(GEOquery)
library(stringr)
library(AnnoProbe)
library(ggplot2)

gse = "GSE17215"

if(require(AnnoProbe)){
  if(!file.exists(paste0(gse,"_eSet.Rdata"))) geoChina(gse)
  load(paste0(gse,"_eSet.Rdata"))
  eSet <- gset
  rm(gset)
}else{
  eSet <- getGEO(gse, 
                 destdir = '.', 
                 getGPL = F)
}

#(1)提取表达矩阵exp
exp <- exprs(eSet[[1]])
exp[1:4,1:4]
exp = log2(exp+1)
boxplot(exp,cex.axis = 0.75,las = 2,col = rainbow(6))
#(2)提取临床信息
pd <- pData(eSet[[1]])
#(3)提取芯片平台编号
gpl <- eSet[[1]]@annotation

p = identical(rownames(pd),colnames(exp));p
if(!p) exp = exp[,match(rownames(pd),colnames(exp))]

#(4)分组信息:
group_list <- ifelse(str_detect(pd$title,"paclitaxel"),"paclitaxel","salinomycin")

group_list=factor(group_list,levels = c("paclitaxel","salinomycin"))

#(5)下载注释信息:
if (!require(hthgu133a.db)) BiocManager::install("hthgu133a.db")
library(hthgu133a.db)
ls("package:hthgu133a.db")
ids <- toTable(hthgu133aSYMBOL)

#(6)数据处理:
table(rownames(exp)%in%ids$probe_id)
exp <- exp[rownames(exp)%in%ids$probe_id,]
identical(ids$probe_id,rownames(exp))

exp <- as.data.frame(exp)
exp$probe_id <- rownames(exp)
ids$median <- apply(exp,1,median)
ids <- ids[order(ids$symbol,ids$median,decreasing = T),]
ids <- ids[!duplicated(ids$symbol),]

exp <- merge(exp,ids,by="probe_id")
exp <- exp[match(ids$probe_id,exp$probe_id),]
identical(ids$probe_id,exp$probe_id)
rownames(exp) <- exp$symbol
exp <- exp[,-c(1,ncol(exp)-1,ncol(exp))]

dat <- exp[rownames(exp)%in%genes,]
dat <- as.matrix(dat)

library(RColorBrewer)
library(pheatmap)

identical(rownames(pd),colnames(dat))
annotation_col=data.frame(group=group_list)
rownames(annotation_col)=colnames(dat) 

pheatmap(dat,scale = "row",
         cluster_cols = T,
         cluster_rows = T,
         show_rownames = T,
         show_colnames = F,
         color = colorRampPalette(rev(brewer.pal(n = 7, name =
                                                   "RdYlBu")))(100),
         annotation_col = annotation_col,
         fontsize = 8, #调整图片中文本的大小
         legend = T)

下载数据集GSE24673的表达矩阵计算样本的相关性并且绘制热图,需要标记上样本分组信息

rm(list=ls())
library(GEOquery)
library(stringr)
library(AnnoProbe)
library(ggplot2)

gse = "GSE24673"

if(require(AnnoProbe)){
  if(!file.exists(paste0(gse,"_eSet.Rdata"))) geoChina(gse)
  load(paste0(gse,"_eSet.Rdata"))
  eSet <- gset
  rm(gset)
}else{
  eSet <- getGEO(gse, 
                 destdir = '.', 
                 getGPL = F)
}

#(1)提取表达矩阵exp
exp <- exprs(eSet[[1]])
exp[1:4,1:4]
#exp = log2(exp+1)
ncol(exp)
boxplot(exp,cex.axis = 0.75,las = 2,col = rainbow(11))
#(2)提取临床信息
pd <- pData(eSet[[1]])
#(3)提取芯片平台编号
gpl <- eSet[[1]]@annotation

p = identical(rownames(pd),colnames(exp));p
if(!p) exp = exp[,match(rownames(pd),colnames(exp))]

#(4)分组信息:
group_list <- c(rep(c("139","535","984"),each = 3),rep("normal",2))

group_list=factor(group_list,levels = c("normal","139","535","984"))

#(5)下载注释信息:
if (!require(hugene10sttranscriptcluster.db)) BiocManager::install("hugene10sttranscriptcluster.db")
library(hugene10sttranscriptcluster.db)
ls("package:hugene10sttranscriptcluster.db")
ids <- toTable(hugene10sttranscriptclusterSYMBOL)

#(6)数据处理:
table(rownames(exp)%in%ids$probe_id)
exp <- exp[rownames(exp)%in%ids$probe_id,]
ids <- ids[ids$probe_id%in%rownames(exp),]
identical(ids$probe_id,rownames(exp))

ids$median <- apply(exp,1,median)
ids <- ids[order(ids$symbol,ids$median,decreasing = T),]
ids <- ids[!duplicated(ids$symbol),]

exp <- as.data.frame(exp)
exp$probe_id <- rownames(exp)
exp <- merge(exp,ids,by="probe_id")
exp <- exp[match(ids$probe_id,exp$probe_id),]
identical(ids$probe_id,exp$probe_id)
rownames(exp) <- exp$symbol
exp <- exp[,-c(1,ncol(exp)-1,ncol(exp))]

#画PCA图
dat=as.data.frame(t(exp))
library(FactoMineR)#画主成分分析图需要加载这两个包
library(factoextra) 

#pca的统一操作走起
dat.pca <- PCA(dat, graph = FALSE)
pca_plot <- fviz_pca_ind(dat.pca,
                         geom.ind = "point", # show points only (nbut not "text")
                         col.ind = group_list, # color by groups
                         #palette = c("#00AFBB", "#E7B800"),
                         addEllipses = TRUE, # Concentration ellipses
                         legend.title = "Groups"
)
pca_plot  #浓度椭圆

pca_plot <- fviz_pca_ind(dat.pca,
                         geom.ind = "point", # show points only (nbut not "text")
                         col.ind = group_list, # color by groups
                         #palette = c("#00AFBB", "#E7B800"),
                         addEllipses = TRUE, 
                         ellipse.type = "confidence",
                         legend.title = "Groups"
)
pca_plot  #置信椭圆
#参见:http://www.sthda.com/english/articles/31-principal-component-methods-in-r-practical-guide/112-pca-principal-component-analysis-essentials/

#表达矩阵热图:
library(RColorBrewer)
library(pheatmap)

dat=as.data.frame(t(dat))
identical(rownames(pd),colnames(dat))
annotation_col=data.frame(group=group_list)
rownames(annotation_col)=colnames(dat) 

pheatmap(dat,scale = "row",
         cluster_cols = T,
         cluster_rows = T,
         show_rownames = F,
         show_colnames = F,
         color = colorRampPalette(rev(brewer.pal(n = 7, name =
                                                   "RdYlBu")))(100),
         annotation_col = annotation_col,
         fontsize = 8,#调整图片中文本的大小
         legend = T)

#样本相关性热图:
M <- cor(exp)
pheatmap::pheatmap(M,annotation_col = annotation_col)  #(存疑)
PCA 表达矩阵热图 样本相关性矩阵

找到 GPL6244 platform of Affymetrix Human Gene 1.0 ST Array 对应的R的bioconductor注释包,并且安装它!

GPL6244
if (!require(hugene10sttranscriptcluster.db)) BiocManager::install("hugene10sttranscriptcluster.db")
library(hugene10sttranscriptcluster.db)
ls("package:hugene10sttranscriptcluster.db")
ids <- toTable(hugene10sttranscriptclusterSYMBOL)

下载数据集GSE42872的表达矩阵,并且分别挑选出 所有样本的(平均表达量/sd/mad/)最大的探针,并且找到它们对应的基因。

(注释包:hugene10sttranscriptcluster.db)

rm(list=ls())
#数据下载
options(stringsAsFactors = F)
library(GEOquery)
gse = "GSE42872"
if(require(AnnoProbe)){
  if(!file.exists(paste0(gse,"_eSet.Rdata"))) geoChina(gse)
  load(paste0(gse,"_eSet.Rdata"))
  eSet <- gset
  rm(gset)
}else{
  eSet <- getGEO(gse, 
                 destdir = '.', 
                 getGPL = F)
}
#(1)提取表达矩阵exp
exp <- exprs(eSet[[1]])
exp[1:4,1:4]
#exp = log2(exp+1)
#(2)提取临床信息
pd <- pData(eSet[[1]])
#(3)调整pd的行名顺序与exp列名完全一致
p = identical(rownames(pd),colnames(exp));p
if(!p) exp = exp[,match(rownames(pd),colnames(exp))]
#(4)提取芯片平台编号
gpl <- eSet[[1]]@annotation
save(gse,pd,exp,gpl,file = paste0(gse,"step1output.Rdata"))

#group_list(实验分组)和ids(芯片注释),每次都需要改
rm(list = ls())  
load(file = "GSE42872step1output.Rdata")
group_list=ifelse(str_detect(pd$title,"Control"),"control","treat")
#设置参考水平,对照在前,处理在后
group_list = factor(group_list,
                    levels = c("control","treat"))
#ids  BioconductorR包
gpl 
#http://www.bio-info-trainee.com/1399.html
if(!require(hugene10sttranscriptcluster.db))BiocManager::install("hugene10sttranscriptcluster.db")
library(hugene10sttranscriptcluster.db)
ls("package:hugene10sttranscriptcluster.db")
ids <- toTable(hugene10sttranscriptclusterSYMBOL)
head(ids)
save(exp,group_list,ids,file = paste0(gse,"step2output.Rdata"))

#数据处理:
table(rownames(exp)%in%ids$probe_id)
exp <- exp[rownames(exp)%in%ids$probe_id,]
ids <- ids[ids$probe_id%in%rownames(exp),]
identical(ids$probe_id,rownames(exp))

ids$median <- apply(exp,1,median) #中位数
ids$sd <- apply(exp,1,sd) #标准差
ids$mad <- apply(exp,1,mad) #中位方差
ids_median <- ids[order(ids$median,decreasing = F),]
median_max <- ids_median[1,]
ids_sd <- ids[order(ids$sd,decreasing = T),]
sd_max <- ids_sd[1,];sd_max
ids_mad <- ids[order(ids$mad,decreasing = T),]
mad_max <- ids_mad[1,];mad_max
所有样本的(平均表达量/sd/mad/)最大的探针及基因名

下载数据集GSE42872的表达矩阵,并且根据分组使用limma做差异分析,得到差异结果矩阵

load(file = "GSE42872step1output.Rdata")
load("GSE42872step2output.Rdata")

table(rownames(exp)%in%ids$probe_id)
exp <- exp[rownames(exp)%in%ids$probe_id,]
identical(rownames(exp),ids$probe_id)
exp <- as.data.frame(exp)
exp$median <- apply(exp,1,median)
exp$probe_id <- rownames(exp)
exp_m <- merge(exp,ids,by="probe_id")

exp_m <- exp_m[order(exp_m$symbol,exp_m$median,decreasing = T),]
exp_m <- exp_m[!duplicated(exp_m$symbol),]
rownames(exp_m) <- exp_m$symbol
exp_m <- exp_m[,-c(1,ncol(exp_m)-1,ncol(exp_m))]

library(limma)
design=model.matrix(~group_list)
fit=lmFit(exp_m,design)
fit=eBayes(fit)
deg=topTable(fit,coef=2,number = Inf)
save(exp_m,deg,file="GSE42872limma.Rdata")
上一篇下一篇

猜你喜欢

热点阅读