单细胞数据挖掘2-QC

2020-05-08  本文已影响0人  大吉岭猹

1. 写在前面

2. 搭建环境

3. 读入数据

# Create a merged Seurat object
merged_seurat <- merge(x = sigaf1,
                       y = c(sigag1,sigah1),
                       add.cell.id = c("sigaf1","sigag1","sigah1"))

4. 质控

4.1. 添加 metadata

# Add number of genes per UMI for each cell to metadata
merged_seurat$log10GenesPerUMI <-
  log10(merged_seurat$nFeature_RNA) / log10(merged_seurat$nCount_RNA)

# Compute percent mito ratio
merged_seurat$mitoRatio <- PercentageFeatureSet(object = merged_seurat, pattern = "^mt-")
merged_seurat$mitoRatio <- merged_seurat@meta.data$mitoRatio / 100
# 为了不改动 Seurat 对象,我们把 metadata 单独取出来添加信息
# Create metadata dataframe
metadata <- merged_seurat@meta.data
# Add cell IDs to metadata
metadata$cells <- rownames(metadata)

# Rename columns
metadata <- metadata %>%
  dplyr::rename(seq_folder = orig.ident,
                nUMI = nCount_RNA,
                nGene = nFeature_RNA)

# Create sample column
metadata$sample <- NA
metadata$sample[which(str_detect(metadata$cells, "^sigaf1_"))] <- "sigaf1"
metadata$sample[which(str_detect(metadata$cells, "^sigag1_"))] <- "sigag1"
metadata$sample[which(str_detect(metadata$cells, "^sigah1_"))] <- "sigah1"
metadata = metadata[,c(1,2,3,5,6,7,4)]

# Add metadata back to Seurat object
merged_seurat@meta.data <- metadata

# Create .RData object to load at any time
save(merged_seurat, file="data/merged_filtered_seurat.RData")

4.2. 针对双细胞

4.3. 细胞计数

# Visualize the number of cell counts per sample
metadata %>%
    ggplot(aes(x=sample, fill=sample)) +
    geom_bar() +
    theme_classic() +
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
    theme(plot.title = element_text(hjust=0.5, face="bold")) +
    ggtitle("NCells")

4.4. UMI counts (transcripts) per cell

# Visualize the number UMIs/transcripts per cell
metadata %>%
    ggplot(aes(color=sample, x=nUMI, fill= sample)) +
    geom_density(alpha = 0.2) +
    scale_x_log10() +
    theme_classic() +
    ylab("Cell density") +
    geom_vline(xintercept = 500)

4.5. Genes detected per cell

# Visualize the distribution of genes detected per cell via histogram
metadata %>%
    ggplot(aes(color=sample, x=nGene, fill= sample)) +
    geom_density(alpha = 0.2) +
    theme_classic() +
    scale_x_log10() +
    geom_vline(xintercept = 300)

# Visualize the distribution of genes detected per cell via boxplot
metadata %>%
    ggplot(aes(x=sample, y=log10(nGene), fill=sample)) +
    geom_boxplot() +
    theme_classic() +
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
    theme(plot.title = element_text(hjust=0.5, face="bold")) +
    ggtitle("NCells vs NGenes")

4.6. UMIs vs. genes detected

# Visualize the correlation between genes detected and number of UMIs and determine whether strong presence of cells with low numbers of genes/UMIs
metadata %>%
  ggplot(aes(x=nUMI, y=nGene, color=mitoRatio)) +
  geom_point() +
  scale_colour_gradient(low = "gray90", high = "black") +
  stat_smooth(method=lm) +
  scale_x_log10() +
  scale_y_log10() +
  theme_classic() +
  geom_vline(xintercept = 500) +
  geom_hline(yintercept = 300) +
  facet_wrap(~sample)

4.7. Mitochondrial counts ratio

# Visualize the distribution of mitochondrial gene expression detected per cell
metadata %>%
  ggplot(aes(color=sample, x=mitoRatio, fill=sample)) +
  geom_density(alpha = 0.2) +
  scale_x_log10() +
  theme_classic() +
  geom_vline(xintercept = 0.2)

4.8. Complexity

# Visualize the overall complexity of the gene expression by visualizing the genes detected per UMI
metadata %>%
  ggplot(aes(x=log10GenesPerUMI, color = sample, fill=sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  geom_vline(xintercept = 0.8)

4.9. 细胞水平过滤

# Filter out low quality reads using selected thresholds - these will change with experiment
filtered_seurat <- subset(x = merged_seurat,
                          subset= (nUMI >= 500) &
                            (nGene >= 300) &
                            (log10GenesPerUMI > 0.80) &
                            (mitoRatio < 0.20))

4.10. 基因水平过滤

# Output a logical vector for every gene on whether the more than zero counts per cell
# Extract counts
counts <- GetAssayData(object = filtered_seurat, slot = "counts")

# Output a logical vector for every gene on whether the more than zero counts per cell
nonzero <- counts > 0

# Sums all TRUE values and returns TRUE if more than 10 TRUE values per gene
keep_genes <- Matrix::rowSums(nonzero) >= 10

# Only keeping those genes expressed in more than 10 cells
filtered_counts <- counts[keep_genes, ]

# Reassign to filtered Seurat object
filtered_seurat <- CreateSeuratObject(filtered_counts, meta.data = filtered_seurat@meta.data)

4.11. Re-assess QC metrics

友情宣传

上一篇 下一篇

猜你喜欢

热点阅读