数据分析：质谱数据预处理

2021-06-28 本文已影响0人生信学习者2

前言

在使用MaxQuant软件处理Raw mass spectrometry data后，对输出结果“proteinGroup.txt”做数据过滤处理，得到可用于下游数据分析的profile。更多知识分享请到 https://zouhua.top/。

预处理过程包含：

数据清洗；
数据过滤
数据归一化和补充缺失值
生成ExpressionSet对象

读入数据

library(dplyr)
library(tibble)
library(data.table)

phen <- read.csv("phenotype_20210625.csv")
prof <- read.delim("MS_proteins_20210621.txt", stringsAsFactors = FALSE, colClasses = "character")
subgrp <- c("NC", "PC", "PT")
grp.col <- c("#568875", "#73FAFC", "#EE853D")

Uniprot数据库包含 Swiss-Prot和trEMBL两大数据库，其中前者是人工确认的可靠蛋白质数据库，后者是基于生物信息学预测的数据库，这里仅仅提取Swiss-Prot数据库的蛋白质用于分析，可根据"sp"或"tr"区分两大数据库

# select swiss-prot
prof <- prof[grep("^sp", prof$Fasta.headers, value = F), ] 

#prof2 <- prof[grep("^tr", prof$Fasta.headers, value = F), ]

数据清洗

该过程是为了清洗掉False hits，如Potential.contaminant，Reverse，Only.identified.by.site和Qvalue < 0.01
四类不符合要求的蛋白质数据。在清洗过程中，需要提取对应的ProteinID, Proteion Description以及GeneID, 这些信息都可在Fasta.headers列获取。

Data_Acquisition <- function(datset=prof){
  
  # datset=prof
  
  # Filter false hits
  colnames(datset) <- gsub("\\s+", ".", colnames(datset))
  dat <- datset %>% 
    filter(Potential.contaminant != "+") %>%
    filter(Reverse != "+") %>%
    filter(Only.identified.by.site != "+") %>%
    filter(Q.value < 0.01)
  
  # Extract Protein and Gene IDs
  # Isolate the first entry
  dat$Fasta.headers <- sub(";.*", "", dat$Fasta.headers)
  
  # Extract Protein name
  regex <- regexpr("(?<=_HUMAN.).*(?=.OS)", dat$Fasta.headers, perl = TRUE)
  dat$Protein.name <- regmatches(dat$Fasta.headers, regex)
  
  # Extract UniProtID
  regex <- regexpr("(?<=\\|).*(?=\\|)", dat$Fasta.headers, perl = TRUE)
  dat$Protein <- regmatches(dat$Fasta.headers, regex)
  
  # Extract Gene ID
  regex <- regexpr("((?<=\\|[[:alnum:]]{6}\\|).*(?=_HUMAN)|(?<=\\|[[:alnum:]]{10}\\|).*(?=_HUMAN))",
                  dat$Fasta.headers, perl = TRUE)
  dat$Gene <- regmatches(dat$Fasta.headers, regex)
  
  # Transform Intensity Columns
  # Extract names of intensity columns
  intensity.names <- grep("^LFQ.intensity", colnames(dat), value = TRUE)
  
  # df_new <- cbind(df[, c("Protein.name", "Protein", "Gene")])
  
  # Cast as numeric
  dat_intensity <- sapply(data.frame(dat)[, colnames(dat)%in%intensity.names], as.numeric)
  
  
  # Assign column names for log2-transformed data
  LOG.names <- sub("^LFQ.intensity.", "LOG2", intensity.names)   # rename intensity columns
  
  # Transform data
  dat_intensity_LOG <- log2(dat_intensity)
  colnames(dat_intensity_LOG) <- LOG.names
  res <- cbind(dat[, c("Protein", "Gene", "Protein.name")], 
                    dat_intensity_LOG,
                    dat_intensity)
  
  return(res)
}

prof_Acquisition <- Data_Acquisition(datset=prof)
if(!dir.exists("/Mass/")){
  dir.create("/Mass/")
}
write.table(prof_Acquisition, "/Mass/Mass_Proteins_approach2_Acquisition.tsv",
            row.names = F, sep = "\t", quote = F)

数据过滤

实验设计过程一般会对一组实验设置三个重复，如果蛋白质在单个组的重复次数仅有1次是不够提供足够的信息用于后续的比较分析的，这样的蛋白质可以认为是没有意义的。通常过滤原则是：同一条件下，三个重复至少出现两次才保留该蛋白质。

我们的数据远远多于三次重复，但基于线性回归的算法如limma的baye是需要三个以上的样本计算才较为可靠，因此我们设置过滤参数为每个条件下至少出现三次。

# Data filtering function
Data_Filtering <- function(metadata=phen, 
                           profile=prof_Acquisition,
                           group_name=subgrp,
                           min_count=c(3, 3, 3), 
                           at_least_one=TRUE){
  # metadata: data frame dictating the grouping
  # profile:  data frame containing LOG2 data for filtering and organized by data type  
  # min_count:  a numeric vector of the same length as "conditions" indicating the minimum 
  #     number of valid values for each condition for retention
  # at_least_one: TRUE means to keep the row if min_count is met for at least one condition
  #     FALSE means min_count must be met across all conditions for retention
  
  # metadata=phen_mass 
  # profile=prof_Acquisition
 #  group_name=subgrp
  # min_count=c(3, 3, 3) 
  # at_least_one=TRUE
  
  log2.names <- grep("^LOG2", colnames(profile), value = TRUE)   # Extract LOG2 column names
  prof <- profile %>% column_to_rownames("Gene") %>%
    dplyr::select(log2.names)
  colnames(prof) <- gsub("^LOG2", "", colnames(prof))
  mdat <- metadata %>% filter(Omics == "Exosome_Mass") %>%
    dplyr::select(PID, SampleID, Group) %>%
    inner_join(prof %>% t() %>% data.frame() %>%
                 rownames_to_column("SampleID"),
               by = "SampleID") %>%
    filter(Group%in%group_name) %>%
    mutate(SampleID_name=paste(SampleID, Group, sep = "_"))
  
  prof_cln <- mdat %>% dplyr::select(-all_of(c("PID","SampleID","Group"))) %>%
    column_to_rownames("SampleID_name") %>%
    t() %>% data.frame()
  log2.names.new <- colnames(prof_cln)
  
  cond.names <- lapply(group_name, # Group column names by group_name
                      function(x) grep(x, log2.names.new, value = TRUE, perl = TRUE))

  cond.filter <- sapply(1:length(cond.names), function(i) {
      df2 <- prof_cln[cond.names[[i]]]   # Extract columns of interest
      df2 <- as.matrix(df2)   # Cast as matrix for the following command
      sums <- rowSums(is.finite(df2)) # count the number of valid values for each condition
      sums >= min_count[i]   # Calculates whether min_count requirement is met
  })

  if(at_least_one){
    profile$KEEP <- apply(cond.filter, 1, any)
  }else{
    profile$KEEP <- apply(cond.filter, 1, all)
  }

  return(profile)  # No rows are omitted, filter rules are listed in the KEEP column
}

# Apply filtering
prof_filter_Raw <- Data_Filtering(
                       metadata=phen,
                       profile=prof_Acquisition,
                       group_name=subgrp,
                       min_count=c(3, 3, 3),
                       at_least_one=TRUE)

## Remove rows where KEEP is FALSE
prof_filter <- filter(prof_filter_Raw, KEEP)
if(!dir.exists("Mass/")){
  dir.create("/Mass/")
}
write.table(prof_filter, "/Mass/Mass_Proteins_approach2_filtering.tsv",
            row.names = F, sep = "\t", quote = F)

## Display filtered data frame
head(dplyr::select(prof_filter, Gene, starts_with("LOG2")))

数据归一化和填充

在填充缺失值之前，需要对数据进行归一化处理，目的是为了去除一些技术性变异。对每个样本取全局中位数，然后对应的蛋白质intensity值减去该中位数，最后获得归一化值。

缺失值通常和低丰度蛋白质联系在一起，可使用样本内最低丰度的蛋白质intensity作为缺失值的替代。在缺失值补充完整后，我们可以比较补充前后数据的分布情况。

## Data imputation function
Data_imputation <- function(
                        profile=prof_filter, 
                        width=0.3, 
                        downshift=1.8){
  # profile: data frame containing filtered 
  # Assumes missing data (in profile) follows a narrowed and downshifted normal distribution
  
  # profile=prof_filter
  # width=0.3
  # downshift=1.8
  
  ## Data normalization function
  Median_Centering <- function(dataset=prof_filter){
    # profile: data frame containing LOG2 columns for normalization
    
    # dataset=prof_filter
    
    LOG2.names <- grep("^LOG2", colnames(dataset), value = TRUE)
    dataset[, LOG2.names] <- lapply(LOG2.names, 
                              function(x) {
                                LOG2 = dataset[[x]]
                                LOG2[!is.finite(LOG2)] = NA   # Exclude missing values from median calculation
                                gMedian = median(LOG2, na.rm = TRUE)
                                LOG2 - gMedian
                              }
    )
  
    return(dataset)
  }


  ## Normalize data
  prof_Median <- Median_Centering(profile=prof_filter)  
  
  
  prof <- data.frame(prof_Median)
  LOG2.names <- grep("^LOG2", colnames(prof ), value = TRUE)
  impute.names <- sub("^LOG2", "impute", LOG2.names)

  # Create new columns indicating whether the values are imputed
  prof[, impute.names]  <- lapply(LOG2.names, function(x){!is.finite(prof[, x])})

  # Imputation
  set.seed(123)
  prof[, impute.names] <- lapply(LOG2.names,
                          function(x) {
                            temp <- prof[[x]]
                            temp[!is.finite(temp)] <- NA
                            temp.sd <- width * sd(temp[prof$KEEP], na.rm = TRUE)   # shrink sd width
                            temp.mean <- mean(temp[prof$KEEP], na.rm = TRUE) - 
                              downshift * sd(temp[prof$KEEP], na.rm = TRUE)   # shift mean of imputed values
                            n.missing <- sum(is.na(temp))
                            temp[is.na(temp)] <- rnorm(n.missing, mean = temp.mean, sd = temp.sd)                          
                            return(temp)
                          })
  return(prof)
}

## Apply imputation
prof_Impute <- Data_imputation(profile=prof_filter)
if(!dir.exists("/Mass/")){
  dir.create("/Mass/")
}
write.table(prof_Impute, "/Mass/Mass_Proteins_filtered_Impute.tsv",
            row.names = F, sep = "\t", quote = F)

## Display filtered data frame
head(dplyr::select(prof_Impute, Gene, starts_with("LOG2")))

比较处理前后，数据分布情况

Mass_ExprSet_RAW <- readRDS("Mass_Proteins_filtered_Normal_RAW.RDS")
Mass_ExprSet_LOG2RB <- readRDS("Mass_Proteins_filtered_Normal_LOG2RB.RDS")
Mass_ExprSet_LOG <- readRDS("Mass_Proteins_filtered_Normal_LOG.RDS")
Mass_ExprSet_LOG2Impute <- readRDS("Mass_Proteins_filtered_Normal_LOG2Impute.RDS")

get_distribution <- function(datset=Mass_ExprSet_LOG,
                             Type="LOG"){
  
  # datset=Mass_ExprSet_LOG
  # Type="LOG"
  
  if(Type == "LOG"){
    dat <- data.frame(exprs(datset)) %>% 
      rownames_to_column("name") %>%
      tidyr::gather(key="sample", value="value", -name)
    pl <- ggplot(dat, aes(x=value)) + 
            geom_histogram(colour="black", fill="white")+
            labs(title="Distribution of Log2 Transformed \nProtein Intensity", x="log2(LFQ Intensity)", y="Frequency")+
            theme_bw()    
  }else if(Type == "LOG2RB"){
    dat <- data.frame(exprs(datset)) %>% 
      rownames_to_column("name") %>%
      tidyr::gather(key="sample", value="value", -name)
    pl <- ggplot(dat, aes(x=value)) + 
            geom_histogram(colour="black", fill="white")+
            labs(title="Distribution of Log2 Transformed (RB)\n Protein Intensity", 
                 x="log2(LFQ Intensity) \n Relative abundance", y="Frequency")+
            theme_bw()    
  }else if(Type == "RAW"){
    dat <- data.frame(exprs(datset)) %>% 
      rownames_to_column("name") %>%
      tidyr::gather(key="sample", value="value", -name) %>%
      mutate(value=10^value - 1)
    pl <- ggplot(dat, aes(x=value)) + 
            geom_histogram(colour="black", fill="white")+
            # scale_y_continuous(expand = c(0, 0))+
            # scale_x_continuous(expand = c(0, 0))+
            labs(title="Distribution of Protein Intensity", x="LFQ Intensity", y="Frequency")+
            theme_bw()    
  }else if(Type == "LOG2Impute"){
    dat <- data.frame(exprs(datset)) %>% 
      rownames_to_column("name") %>%
      tidyr::gather(key="sample", value="value", -name)
    pl <- ggplot(dat, aes(x=value)) + 
            geom_histogram(colour="black", fill="white")+
            labs(title="Distribution of Log2 Imputation \nProtein Intensity", x="log2(LFQ Intensity) Imputation", y="Frequency")+
            theme_bw()
  }
  return(pl)
}

RAW_pl <- get_distribution(datset=Mass_ExprSet_RAW, Type="RAW")
LOG_pl <- get_distribution(datset=Mass_ExprSet_LOG, Type="LOG")
LOG2RB_pl <- get_distribution(datset=Mass_ExprSet_LOG2RB, Type="LOG2RB")
LOG2Impute_pl <- get_distribution(datset=Mass_ExprSet_LOG2Impute, Type="LOG2Impute")

cowplot::plot_grid(RAW_pl, LOG_pl, LOG2RB_pl, LOG2Impute_pl,
                   align = "hv", nrow = 2)

ExpressionSet Object

ExpressionSet 对象是一类包含多种信息的数据格式，存成该类型数据可以方便后面交流和分析，强烈建议大家。

get_Mass_ExprSet_v2 <- function(metadata=phen,
                                profile=prof_Impute){
  
  # metadata=phen
  # profile=prof_Impute
  
  index <- grep("LOG2MS", colnames(profile), value = T)
  prof <- profile %>% column_to_rownames("Gene") %>%
    dplyr::select(all_of(index))
  colnames(prof) <- gsub("LOG2", "", colnames(prof))
  metadata <- metadata %>% filter(Omics == "Exosome_Mass")
  sid <- intersect(metadata$SampleID, colnames(prof))
  phen_cln <- metadata[metadata$SampleID%in%sid, ] %>%
    column_to_rownames("SampleID")
  prof_cln <- prof %>% dplyr::select(rownames(phen_cln)) 
  
  if(!any(colnames(prof_cln) == rownames(phen_cln))){
    stop("The order of samplenames between phen_cln and prof_cln was wrong")
  }
  print(dim(prof_cln))
  require(convert)
  exprs <- as.matrix(prof_cln)
  adf <-  new("AnnotatedDataFrame", data=phen_cln)
  experimentData <- new("MIAME",
          name="Hua", lab="Dong gdl Lab",
          contact="Hua@grmh.cn",
          title="Disease Experiment",
          abstract="The Mass Spectrometry ExpressionSet with imputation value",
          url="www.grmh-gdl.cn",
          other=list(notes="The Data imputation of normal protein"))
  expressionSet <- new("ExpressionSet", exprs=exprs,
                         phenoData=adf, 
                         experimentData=experimentData)
  
  return(expressionSet)
}

Mass_ExprSet_LOG2Impute <- get_Mass_ExprSet_v2(metadata=phen, profile=prof_Impute)
if(!dir.exists("/Mass/")){
  dir.create("/Mass/")
}
saveRDS(Mass_ExprSet_LOG2Impute, "/Mass/Mass_Proteins_filtered_Normal_LOG2Impute.RDS",
        compress = T)
Mass_ExprSet_LOG2Impute

systemic information

sessionInfo()

R version 4.0.2 (2020-06-22)
Platform: x86_64-conda_cos6-linux-gnu (64-bit)
Running under: CentOS Linux 8 (Core)

Matrix products: default
BLAS/LAPACK: /disk/share/anaconda3/lib/libopenblasp-r0.3.10.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8    LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C             LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] parallel  stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] data.table_1.14.0   umap_0.2.7.0        vegan_2.5-6         lattice_0.20-41     permute_0.9-5       Rtsne_0.15         
 [7] convert_1.64.0      marray_1.68.0       limma_3.46.0        Biobase_2.50.0      BiocGenerics_0.36.0 ggplot2_3.3.3      
[13] tibble_3.1.0        dplyr_1.0.5        

loaded via a namespace (and not attached):
 [1] Rcpp_1.0.6        tidyr_1.1.3       assertthat_0.2.1  digest_0.6.27     utf8_1.2.1        RSpectra_0.16-0  
 [7] R6_2.5.0          plyr_1.8.6        evaluate_0.14     pillar_1.6.0      rlang_0.4.10      jquerylib_0.1.3  
[13] Matrix_1.3-2      reticulate_1.18   rmarkdown_2.7     labeling_0.4.2    splines_4.0.2     stringr_1.4.0    
[19] munsell_0.5.0     tinytex_0.31      compiler_4.0.2    xfun_0.20         pkgconfig_2.0.3   askpass_1.1      
[25] mgcv_1.8-34       htmltools_0.5.1.1 openssl_1.4.3     tidyselect_1.1.0  fansi_0.4.2       crayon_1.4.1     
[31] withr_2.4.1       MASS_7.3-53.1     grid_4.0.2        nlme_3.1-150      jsonlite_1.7.2    gtable_0.3.0     
[37] lifecycle_1.0.0   DBI_1.1.1         magrittr_2.0.1    scales_1.1.1      stringi_1.4.6     farver_2.1.0     
[43] reshape2_1.4.4    bslib_0.2.4       ellipsis_0.3.1    generics_0.1.0    vctrs_0.3.7       cowplot_1.1.0    
[49] tools_4.0.2       glue_1.4.2        purrr_0.3.4       yaml_2.2.1        colorspace_2.0-0  cluster_2.1.0    
[55] knitr_1.31        sass_0.3.1