比较基因组从入门到放弃(3)
2021-06-16 本文已影响0人
Morriyaty
今来做一哈基因家族收缩与扩张吼
用到的软件是 CAFE [基因课 提供的sif环境]
准备输入文件
orthofinder 文件夹中的 Orthogroups.GeneCount.tsv
mcmctree 生成的 超度量树
#修改树文件(这里应该没有空格)
cat FigTree.tre | sed 's/\[[^]]\+\]//g'| awk -F "=" '/UTREE/{print $2} ' > input.tree.nwk
sed -e 's/:/\n:/g' -e 's/\([),]\)/\n\1/g' input.tree.nwk |awk '{if($1~/:$/){printf ":"100*$2} else {printf $0}}' |sed 's/\s\+//g' > input.tree
#(a: 99.7080, (h: 96.6662, ((g: 7.1266, hu: 7.1266): 72.8073, (r: 72.4080, (ha: 29.3095, (ca: 24.8914, (m: 15.2361, ra: 15.2361): 9.6553): 4.4181): 43.0985): 7.5259): 16.7322): 3.0418);
#修改genecount文件
dos2unix Orthogroups.GeneCount.tsv
sed 's/[a-zA-Z0-9]\+$//' Orthogroups.GeneCount.tsv | awk '{print $1"\t"$0}' > input.tab
#运行CAFE
singularity exec PhyloTools.sif cafe5 --infile input.tab --tree FigTree.tre --output_prefix cafe_ortho --cores 10 -P 0.01
#得到某一物种快速进化的基因家族名称
cat Base_asr.tre | grep "=" | sed 's/(/ /g' | sed 's/)/ /g' | sed 's/,/ /g' | awk '{print$2,$11}' | grep "*" | awk '{print$1}' > x.all.list
#查看所有扩张的基因家族
cat Base_change.tab | awk '{print$1,$5}' | grep "OG" | grep "+" | grep -v "+0" | awk '{print$1}'> x.expand.list
#取交集
sort x.all.list x.expand.list | uniq -d > x.rapidlyexpand.list
#得到收缩的基因家族
cat Base_change.tab | awk '{print$1,$3}' | grep "OG" | grep "-" | awk '{print $1}' > ../acomys.contract.OGname.list
#得到该物种基因家族对应的基因名
python3 sample_Orthogroup.py Orthogroups.txt newortho.txt
cat newortho.txt | grep "|" | sed 's/x|//g' > x.txt
sh seq.sh | awk '{$1="";print $0}' | tr " " "\n" | sed '/^\s*$/d' | sort | uniq > xgene.list
(seq.sh:
grep OG0001243 x.txt
.....)
#得到某一物种扩张的基因家族的GO ID (显著扩张同理)
python3 OG2gene.py acomys.expand.OGname.list Orthogroups.txt ca acomys.expand.gene.list
python3 gene2GO.py acomys.expand.gene.list acomys-geneID2GoID.txt acomys.expand.go.list
cat acomys.expand.go.list | tr "," "\n" | sort | uniq > acomys.expand.finalgo.txt
实验室祖传pipeline
## step by step:
## (python2):
python2.7 cafetutorial_clade_and_size_filter.py -i unfiltered_cafe_input.txt -o filtered_cafe_input.txt -s # 其中输入文件是 input.tab 修改表头得到
## 运行两次 CAFE
#第一次输入文件为 filtered_cafe_input.txt 为了得到 lambda 值
#第二次输入文件为 large_filtered_cafe_input.txt 输入-l (L)参数指定 lambda 值
## 一般只描述第一次结果 第二次结果可以看看有没有需要
#OG2gene.py
'''
WYJ YouHahahaha
This script was used in cafe v5
Extract gene name based on OG name
usage: python3 this_script OG-list Orthogroups.txt species-name output
'''
import sys
f1 = open(sys.argv[1],'r')
f2 = open(sys.argv[2],'r')
f3 = sys.argv[3]
f4 = open(sys.argv[4],'w')
og = f1.read().splitlines()
q = f3 + "|"
for line in f2:
line = line.strip()
n = line.split(":")[0]
if n in og:
f4.write(n+"\t")
x = line.split()[1:]
for i in x :
if q in i:
f4.write(i+"\t")
f4.write("\n")
f1.close()
f2.close()
f4.close()
'''
This script was uesd trans gene2GO
usage: python3 this_script file1 file2 output
file 1 :
OG0000000 evm.model.LG01.1 evm.model.LG02.1062 .....
file 2 :
evm.model.LG17.814 GO:0005125
'''
import sys
f1 = open(sys.argv[1],'r')
f2 = open(sys.argv[2],'r')
f3 = open(sys.argv[3],'w')
d = []
p = {}
for line in f1:
x = line.strip().split()[1:]
for i in x:
if i not in d:
d.append(i)
for line in f2:
line = line.strip().split()
p[line[0]] = line[1]
for i in d:
if i in p:
x = p[i]
f3.write(x+"\n")
f1.close()
f2.close()
f3.close()
# 根据go.annotation.xls 得到 GO ID GO Name GO Category
cat cishu.go.annotation.xls | awk -F "\t" '{print$2}' | grep -v "^-" | sed 's/),/XXX/g' | sed 's/XXX/\n/g' | sed 's/(/\t/' | sed 's/)$//' | sort | uniq | grep -v "^B" |awk -F "\t" '{print$1"\t"$2"\t""Biological process"}' > GOID-NAME-BP.list
#CC MF 类似
cat GOID-NAME-BP.list GOID-NAME-CC.list GOID-NAME-MF.list > GOID-NAME-ALL.list
awk -F "\t" 'NR==FNR{a[$1]=$0; next} {print a[$1]}' GOID-NAME-ALL.list acomys.expand.finalgo.txt > acomys.expand.GOID.txt
后续富集工作不再赘述