Deseq2 分析RNA-seq的reads counts矩阵
2020-06-18 本文已影响0人
余绕
本数据为自己随意产生的,故分析时有提示!
library(DESeq2)
> rm(list=ls())
> count_data=read.table('DEseq.txt',header = TRUE,sep="\t")
> head(count_data)
Gene_id ev_1 ev_2 sample_1 sample_2 sample_3
1 gene1 53 32 97 59 27
2 gene2 24 3 74 2 73
3 gene3 46 5 88 34 42
4 gene4 75 58 34 21 1
5 gene5 90 65 25 1 53
6 gene6 89 99 96 12 86
> row.names(count_data)=count_data$Gene_id#把第一列变成行名
> head(count_data)
Gene_id ev_1 ev_2 sample_1 sample_2 sample_3
gene1 gene1 53 32 97 59 27
gene2 gene2 24 3 74 2 73
gene3 gene3 46 5 88 34 42
gene4 gene4 75 58 34 21 1
gene5 gene5 90 65 25 1 53
gene6 gene6 89 99 96 12 86
> count_tab=count_data[,-1] #去除第一列
> head(count_tab)
ev_1 ev_2 sample_1 sample_2 sample_3
gene1 53 32 97 59 27
gene2 24 3 74 2 73
gene3 46 5 88 34 42
gene4 75 58 34 21 1
gene5 90 65 25 1 53
gene6 89 99 96 12 86
> sample_Data=read.table('Phynotype.txt',header = TRUE,sep="\t")
> sample_Data
SampleID condition
1 ev_1 EV
2 ev_2 EV
3 sample_1 Sample
4 sample_2 Sample
5 sample_3 Sample
> sample_Data$condition=factor(sample_Data$condition,c("EV","Sample")) #设置分组顺序,这里因为E在S前,不设置也可以
> sample_Data
SampleID condition
1 ev_1 EV
2 ev_2 EV
3 sample_1 Sample
4 sample_2 Sample
5 sample_3 Sample
> dds<- DESeqDataSetFromMatrix(countData = count_tab,
+ colData =sample_Data,
+ design = ~condition)
converting counts to integer mode
> dds<-DESeq(dds)
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
-- note: fitType='parametric', but the dispersion trend was not well captured by the
function: y = a/x + b, and a local regression fit was automatically substituted.
specify fitType='local' or 'mean' to avoid this message next time.
final dispersion estimates
fitting model and testing
> sizefactor<-sizeFactors(dds) #获得各组的size factors
> sizefactor
ev_1 ev_2 sample_1 sample_2 sample_3
1.151989 1.185014 1.117790 1.141435 1.141129
> resultsNames(dds)#list the coefficients,获得比较组
[1] "Intercept" "condition_Sample_vs_EV"
> res<-results(dds,name="condition_Sample_vs_EV")
> head(res)
log2 fold change (MLE): condition Sample vs EV
Wald test p-value: condition Sample vs EV
DataFrame with 6 rows and 6 columns
baseMean log2FoldChange lfcSE stat pvalue padj
<numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
gene1 47.0279585924932 0.566053769686641 0.76867337531233 0.736403507480197 0.461485170730465 0.993661998703688
gene2 31.058228262017 1.91337507463314 1.31773242322843 1.45202094211615 0.146495782309875 0.993661998703688
gene3 37.8939636752357 1.13461160222758 1.03127890545406 1.10019859441227 0.271245602893762 0.993661998703688
gene4 32.7481554501855 -1.78404566313186 1.1589998211077 -1.53929761734283 0.12373165537549 0.993661998703688
gene5 40.5328677696536 -1.51709659185057 1.0826512214148 -1.40127915790649 0.161130611211371 0.993661998703688
gene6 66.5123710070533 -0.489985663564088 0.995504546061613 -0.492198318433156 0.622579153093233 0.993661998703688
> res<-res[order(res$padj),] #按照padjust value排序
> head(res)
log2 fold change (MLE): condition Sample vs EV
Wald test p-value: condition Sample vs EV
DataFrame with 6 rows and 6 columns
baseMean log2FoldChange lfcSE stat pvalue
<numeric> <numeric> <numeric> <numeric> <numeric>
gene372 44.0121189310388 4.18981724397574 0.914268181788086 4.58270048923881 4.59009265192878e-06
gene206 28.0027885585295 5.75134738523254 1.39157480607446 4.13297751592436 3.58093768694533e-05
gene281 39.1423657824902 3.72547233692155 0.913615632247554 4.07772394147487 4.54787187421144e-05
gene490 46.5639545775845 3.83712142893487 0.940257744471528 4.08092509899137 4.48568095546619e-05
gene600 32.4274383805788 4.94359656208485 1.29720518270051 3.81095961380088 0.000138428383468241
gene504 29.2387021124829 4.78408878059009 1.36221357333734 3.51199611737048 0.000444754496479191
padj
<numeric>
gene372 0.0032543756902175
gene206 0.00806110289703977
gene281 0.00806110289703977
gene490 0.00806110289703977
gene600 0.0196291447757966
gene504 0.0525551563339577
> resDF<-as.data.frame(res)
> resDF$Gene_id=row.names(resDF)#创建geneID一列,产生的数据在最后一列
> head(resDF)
baseMean log2FoldChange lfcSE stat pvalue padj Gene_id
gene372 44.01212 4.189817 0.9142682 4.582700 4.590093e-06 0.003254376 gene372
gene206 28.00279 5.751347 1.3915748 4.132978 3.580938e-05 0.008061103 gene206
gene281 39.14237 3.725472 0.9136156 4.077724 4.547872e-05 0.008061103 gene281
gene490 46.56395 3.837121 0.9402577 4.080925 4.485681e-05 0.008061103 gene490
gene600 32.42744 4.943597 1.2972052 3.810960 1.384284e-04 0.019629145 gene600
gene504 29.23870 4.784089 1.3622136 3.511996 4.447545e-04 0.052555156 gene504
> resDF<-resDF[,c(7,1:6)]#把最后一列放在最前面
> head(resDF)
Gene_id baseMean log2FoldChange lfcSE stat pvalue padj
gene372 gene372 44.01212 4.189817 0.9142682 4.582700 4.590093e-06 0.003254376
gene206 gene206 28.00279 5.751347 1.3915748 4.132978 3.580938e-05 0.008061103
gene281 gene281 39.14237 3.725472 0.9136156 4.077724 4.547872e-05 0.008061103
gene490 gene490 46.56395 3.837121 0.9402577 4.080925 4.485681e-05 0.008061103
gene600 gene600 32.42744 4.943597 1.2972052 3.810960 1.384284e-04 0.019629145
gene504 gene504 29.23870 4.784089 1.3622136 3.511996 4.447545e-04 0.052555156
> final_data=merge(resDF,count_data,by="Gene_id") #合并数据,把处理前的readscounts合并进去
> head(final_data)
Gene_id baseMean log2FoldChange lfcSE stat pvalue padj ev_1 ev_2 sample_1 sample_2 sample_3
1 gene1 47.02796 0.5660538 0.7686734 0.7364035 0.4614852 0.993662 53 32 97 59 27
2 gene10 29.79123 0.6988787 1.2667329 0.5517175 0.5811419 0.993662 49 1 71 20 28
3 gene100 42.28342 -0.1866548 0.8440605 -0.2211391 0.8249841 0.993662 73 33 56 18 62
4 gene101 48.47256 -0.1330532 0.7450646 -0.1785794 0.8582679 0.993662 80 39 85 34 39
5 gene102 35.12121 0.7993091 0.9708976 0.8232682 0.4103555 0.993662 22 35 43 19 82
6 gene103 48.32332 0.7512332 0.6814179 1.1024560 0.2702635 0.993662 43 37 72 45 79
>