常用自定义函数

2022-05-30  本文已影响0人  郭师傅

1、log2转换判断执行

# 差异分析前,判断是否需要LOG2转换------------------------------------------
ex <- exprSet
qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T))
LogC <- (qx[5] > 100) ||
  (qx[6]-qx[1] > 50 && qx[2] > 0) ||
  (qx[2] > 0 && qx[2] < 1 && qx[4] > 1 && qx[4] < 2)

if (LogC) { ex[which(ex <= 0)] <- NaN
exprSet <- log2(ex)
print("log2 transform finished")}else{print("log2 transform not needed")}

2、差异分析结果标记(1)

# 定义加上下调标记函数,适用于limma包结果
add_sign_p <- function(df,lfc = 2,p = 0.05){
  df <- df %>% mutate(sign = dplyr::case_when(df$P.Value < p & df$logFC> lfc ~ "up",
                                              df$P.Value < p & df$logFC< -lfc ~ "down", 
                                              TRUE ~ "stable")
  )
  return(df)
}

add_sign <- function(df,lfc = 2,padj = 0.05){
  df <- df %>% mutate(sign = dplyr::case_when(df$adj.P.Val < padj & df$logFC> lfc ~ "up",
                                              df$adj.P.Val < padj & df$logFC< -lfc ~ "down",  
                                              TRUE ~ "stable"
  )
  )
  return(df)
}

diff_padj <- add_sign(df = diff_1,lfc = 1,padj = 0.05)
table(diff_padj$sign)
diffSig_padj <- diff_padj %>% filter(sign == "down" | sign == "up")

3、差异分析结果标记(2)

# 定义加上下调标记函数,适用于DEseq2包结果
add_sign_p <- function(df){
  df <- df %>% mutate(sign = dplyr::case_when(df$pvalue < p_cutoff & df$log2FoldChange> logFC_cutoff ~ "up",
                                              df$pvalue < p_cutoff & df$log2FoldChange< -logFC_cutoff ~ "down",
                                              TRUE ~ "stable")
  )
  return(df)
}

add_sign <- function(df){
  df <- df %>% mutate(sign = dplyr::case_when(df$padj < p_cutoff & df$log2FoldChange> logFC_cutoff ~ "up",
                                              df$padj < p_cutoff & df$log2FoldChange< -logFC_cutoff ~ "down",
                                              TRUE ~ "stable")
  )
  return(df)
}

4.表达矩阵PCA绘图

library(FactoMineR)
library(factoextra)
array_qc_pcaplot <- function(exprSet,group_list){
  dat.pca <- PCA(as.data.frame(t(exprSet)), graph = FALSE)
  fviz_pca_ind(dat.pca,
                            geom.ind = "point", # show points only (nbut not "text")
                            col.ind = group_list, # color by groups
                            # palette = c("#00AFBB", "#E7B800"),  #修改颜色
                            addEllipses = TRUE, # Concentration ellipses
                            # legend.title = "GROUP"
  ) + theme(
    legend.position = c(0.80, 1),
    text = element_text(size = 8),
    legend.key.size = unit(3, "mm"),
    legend.title = element_blank()
  )
}

5.基因注释

# 定义注释函数
# 两个参数:exprSet:表达矩阵,行名是探针
#           probe2symbol:注释文件,第一列探针,第二列sybol
annot <- function(exprSet,probe2symbol_df){
  # 开始注释
  colnames(probe2symbol_df) <- c("probe_id","symbol")
  exprSet <- as.data.frame(exprSet)  #change express matrix to dataframe
  exprSet$probe_id <- rownames(exprSet)  #make a new column of probe_id by rowname
  exprSet$probe_id <- as.character(exprSet$probe_id)
  #match probe_id and gene symbol
  exprSet <- exprSet %>% 
    inner_join(probe2symbol_df,by="probe_id") %>% #合并探针的信息
    dplyr::select(-probe_id) %>% #去掉多余信息
    dplyr::select(symbol, everything()) %>% #重新排列,
    mutate(rowMean =rowMeans(.[grep("GSM", names(.))])) %>% #求出平均数(这边的.真的是画龙点睛)
    arrange(desc(rowMean)) %>% #把表达量的平均值按从大到小排序
    distinct(symbol,.keep_all = T) %>% # symbol留下第一个
    dplyr::select(-rowMean) %>% #反向选择去除rowMean这一列
    tibble::column_to_rownames(colnames(.)[1]) # 把第一列变成行名并删除
  return(exprSet)
}

6.limma包差异基因分析

#表达矩阵:exprSet
#分组列表:group_list

# 1.制作design
group_list <- pdata$group
design <- model.matrix(~0+factor(group_list))
# design <- model.matrix(~group)
colnames(design) <- levels(factor(group_list))

# 2.制作比较矩阵
# makeContrasts里边组的顺序会影响结果
# 根据目前资料,实验组写左,对照组写右
group_list
cont.matrix<-makeContrasts(T2D-control,levels = design)
cont.matrix
#做完后实验组是1,对照组是-1,此处注释有待分析

# 定义DEG函数,适用于array,limma包
deg_limma <- function(exprSet,design = design,cont.matrix = cont.matrix){
  fit <- lmFit(exprSet_4,design)
  fit2=contrasts.fit(fit,cont.matrix)
  fit2 <- eBayes(fit2)  ## default no trend !!!
  ##eBayes() with trend=TRUE
  #通过修改p.value,改变输出基因,过小可能无输出。实际影响的是adj.p,输出基因的不同可能影响火山图的起点(根部)
  tempOutput = topTable(fit2,coef=1,n=Inf,adjust="BH")    
  #tempOutput1 = topTable(fit2,coef=1,n=Inf,adjust="fdr",p.value = 0.05) 
  nrDEG = na.omit(tempOutput)
  return(nrDEG)
}

diff_1 <- deg_limma(exprSet = exprSet,
                    design = design,
                    cont.matrix = cont.matrix)
上一篇下一篇

猜你喜欢

热点阅读