脚本 | R | 每个基因在不同处理下表达量的相关系数
2022-03-07 本文已影响0人
shwzhao
通常求表达量的相关系数,是每两个样本间或每两个基因间的,最后得到相关性矩阵。但是我想求一个基因在不同处理下的相关系数,或者特定的一对基因之间相关系数,怎么办?
方法1. tidyverse
的rowwise()
来解决
更新:别用了,几万个基因来做,慢得想吐,我要疯了!
- 矩阵构建
> library(tidyverse)
> df <- tibble(geneid=paste("gene", 1:6, sep=""),
a = runif(6),
b = runif(6),
c = runif(6),
d = runif(6),
e = runif(6),
u = runif(6),
v = runif(6),
x = runif(6),
y = runif(6),
z = runif(6))
> df
# A tibble: 6 × 11
geneid a b c d e u v x y z
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 gene1 0.552 0.125 0.0159 0.960 0.506 0.708 0.336 0.658 0.380 0.312
2 gene2 0.00347 0.996 0.250 0.785 0.584 0.822 0.241 0.394 0.287 0.411
3 gene3 0.314 0.0522 0.922 0.127 0.678 0.486 0.531 0.554 0.519 0.400
4 gene4 0.498 0.594 0.883 0.000890 0.634 0.631 0.638 0.908 0.985 0.0191
5 gene5 0.368 0.797 0.697 0.117 0.513 0.817 0.371 0.0788 0.819 0.939
6 gene6 0.673 0.399 0.738 0.778 0.632 0.469 0.251 0.339 0.516 0.324
- 非常简单的计算过程
> df_cor <- df %>%
rowwise() %>%
mutate(a_cor = cor(c_across(a:e), c_across(u:z))) %>%
select(geneid, a_cor)
- 结果
> df_cor
# A tibble: 6 × 2
# Rowwise:
geneid a_cor
<chr> <dbl>
1 gene1 -0.239
2 gene2 -0.874
3 gene3 -0.163
4 gene4 -0.310
5 gene5 -0.706
6 gene6 0.760
- 检验
> cor(c(0.552,0.125,0.0159,0.960,0.506), c(0.708,0.336,0.658,0.380,0.312)); cor(c(0.00347,0.996,0.250,0.785,0.584), c(0.822,0.241,0.394,0.287,0.411)); cor(c(0.314,0.0522,0.922,0.127,0.678), c(0.486,0.531,0.554,0.519,0.400)); cor(c(0.498,0.594,0.883,0.000890,0.634), c(0.631,0.638,0.908,0.985,0.0191)); cor(c(0.368,0.797,0.697,0.117,0.513), c(0.817,0.371,0.0788,0.819,0.939)); cor(c(0.673,0.399,0.738,0.778,0.632), c(0.469,0.251,0.339,0.516,0.324))
[1] -0.2383802
[1] -0.8744735
[1] -0.1592421
[1] -0.3098914
[1] -0.7068544
[1] 0.7592879
方法2. 果然还是得group_by()
非常的快,我很喜欢!
- 分别建立基因表达矩阵,宽变长
> df1 <- tibble(geneid=paste("gene", 1:6, sep=""),
a = runif(6),
b = runif(6),
c = runif(6),
d = runif(6),
e = runif(6)) %>%
pivot_longer(-geneid,
names_to="Samples",
values_to="Values")
> df2 <- tibble(geneid=paste("gene", 1:6, sep=""),
a = runif(6),
b = runif(6),
c = runif(6),
d = runif(6),
e = runif(6)) %>%
pivot_longer(-geneid,
names_to="Samples",
values_to="Values")
> df1 %>%
left_join(df2, by=c("geneid", "Samples"))
# A tibble: 30 × 4
geneid Samples Values.x Values.y
<chr> <chr> <dbl> <dbl>
1 gene1 a 0.670 0.370
2 gene1 b 0.920 0.686
3 gene1 c 0.791 0.0458
4 gene1 d 0.750 0.293
5 gene1 e 0.397 0.987
6 gene2 a 0.233 0.840
7 gene2 b 0.288 0.871
8 gene2 c 0.640 0.805
9 gene2 d 0.309 0.303
10 gene2 e 0.709 0.947
# … with 20 more rows
- 也是非常简单的计算
> df1 %>%
left_join(df2, by=c("geneid", "Samples")) %>%
group_by(geneid) %>%
transmute(corV = cor(Values.x, Values.y)) %>%
distinct()
# A tibble: 6 × 2
# Groups: geneid [6]
geneid corV
<chr> <dbl>
1 gene1 -0.539
2 gene2 0.377
3 gene3 -0.0584
4 gene4 0.578
5 gene5 -0.668
6 gene6 0.340
- 检验
> cor(c(0.670,0.920,0.791,0.750,0.397), c(0.370,0.686,0.0458,0.293,0.987)); cor(c(0.233,0.288,0.640,0.309,0.709), c(0.840,0.871,0.805,0.303,0.947)); cor(c(0.684,0.300,0.0482,0.558,0.873), c(0.741,0.264,0.535,0.624,0.235)); cor(c(0.124,0.990,0.779,0.163,0.691), c(0.630,0.934,0.588,0.362,0.428)); cor(c(0.785,0.501,0.170,0.434,0.149), c(0.301,0.118,0.511,0.454,0.522)); cor(c(0.194,0.109,0.225,0.518,0.306), c(0.637,0.342,0.628,0.570,0.483))
[1] -0.5396631
[1] 0.3766399
[1] -0.05769485
[1] 0.5792299
[1] -0.6678438
[1] 0.3404695