Linux与生物信息组学比较基因组

比较基因组从入门到放弃(3)

2021-06-16  本文已影响0人  Morriyaty

今来做一哈基因家族收缩与扩张吼
用到的软件是 CAFE [基因课 提供的sif环境]

准备输入文件
orthofinder 文件夹中的 Orthogroups.GeneCount.tsv
mcmctree 生成的 超度量树
#修改树文件(这里应该没有空格)
cat FigTree.tre  | sed  's/\[[^]]\+\]//g'| awk -F "=" '/UTREE/{print $2} '  > input.tree.nwk
sed  -e 's/:/\n:/g' -e 's/\([),]\)/\n\1/g'  input.tree.nwk  |awk '{if($1~/:$/){printf ":"100*$2} else {printf $0}}' |sed 's/\s\+//g' > input.tree
#(a: 99.7080, (h: 96.6662, ((g: 7.1266, hu: 7.1266): 72.8073, (r: 72.4080, (ha: 29.3095, (ca: 24.8914, (m: 15.2361, ra: 15.2361): 9.6553): 4.4181): 43.0985): 7.5259): 16.7322): 3.0418);
#修改genecount文件
dos2unix Orthogroups.GeneCount.tsv
sed 's/[a-zA-Z0-9]\+$//' Orthogroups.GeneCount.tsv | awk '{print $1"\t"$0}' > input.tab
#运行CAFE
singularity exec PhyloTools.sif cafe5 --infile input.tab --tree FigTree.tre --output_prefix cafe_ortho --cores 10 -P 0.01
#得到某一物种快速进化的基因家族名称
cat Base_asr.tre | grep "=" | sed 's/(/ /g' | sed 's/)/ /g' | sed 's/,/ /g' | awk '{print$2,$11}' | grep "*" | awk '{print$1}' > x.all.list
#查看所有扩张的基因家族
cat Base_change.tab | awk '{print$1,$5}' | grep "OG" | grep "+" | grep -v "+0"  | awk '{print$1}'> x.expand.list
#取交集
sort x.all.list x.expand.list | uniq -d > x.rapidlyexpand.list
#得到收缩的基因家族
cat Base_change.tab | awk '{print$1,$3}' | grep "OG" | grep "-" | awk '{print $1}' > ../acomys.contract.OGname.list
#得到该物种基因家族对应的基因名
python3 sample_Orthogroup.py Orthogroups.txt newortho.txt
cat newortho.txt | grep  "|" | sed 's/x|//g' > x.txt
sh seq.sh | awk '{$1="";print $0}' | tr " " "\n" | sed '/^\s*$/d' | sort | uniq > xgene.list
(seq.sh:
grep    OG0001243       x.txt
.....)
#得到某一物种扩张的基因家族的GO ID (显著扩张同理)
python3 OG2gene.py acomys.expand.OGname.list Orthogroups.txt ca acomys.expand.gene.list
python3 gene2GO.py acomys.expand.gene.list acomys-geneID2GoID.txt acomys.expand.go.list
cat acomys.expand.go.list | tr "," "\n" | sort | uniq > acomys.expand.finalgo.txt

实验室祖传pipeline

## step by step:
## (python2):
python2.7 cafetutorial_clade_and_size_filter.py -i unfiltered_cafe_input.txt -o filtered_cafe_input.txt -s  # 其中输入文件是 input.tab 修改表头得到
## 运行两次 CAFE
#第一次输入文件为 filtered_cafe_input.txt 为了得到 lambda 值
#第二次输入文件为 large_filtered_cafe_input.txt 输入-l (L)参数指定 lambda 值
## 一般只描述第一次结果 第二次结果可以看看有没有需要 
#OG2gene.py
'''
WYJ YouHahahaha
This script was used in cafe v5
Extract gene name based on OG name
usage: python3 this_script OG-list Orthogroups.txt species-name output 
'''

import sys

f1 = open(sys.argv[1],'r')
f2 = open(sys.argv[2],'r')
f3 = sys.argv[3]
f4 = open(sys.argv[4],'w')

og = f1.read().splitlines()
q = f3 + "|"

for line in f2:
    line = line.strip()
    n = line.split(":")[0]
    if n in og:
        f4.write(n+"\t")
        x = line.split()[1:]
        for i in x :
            if q in i:
                f4.write(i+"\t")
        f4.write("\n")

f1.close()
f2.close()
f4.close()
'''
This script was uesd trans gene2GO
usage: python3 this_script file1 file2 output
file 1 : 
    OG0000000       evm.model.LG01.1        evm.model.LG02.1062 .....
file 2 :
    evm.model.LG17.814      GO:0005125
'''

import sys

f1 = open(sys.argv[1],'r')
f2 = open(sys.argv[2],'r')
f3 = open(sys.argv[3],'w')

d = []
p = {}

for line in f1:
    x = line.strip().split()[1:]
    for i in x:
        if i not in d:
            d.append(i)

for line in f2:
    line = line.strip().split()
    p[line[0]] = line[1]

for i in d:
    if i in p:
        x = p[i]
        f3.write(x+"\n")

f1.close()
f2.close()
f3.close()
# 根据go.annotation.xls 得到 GO ID  GO Name GO Category
cat cishu.go.annotation.xls | awk -F "\t" '{print$2}' | grep -v "^-" | sed 's/),/XXX/g' | sed 's/XXX/\n/g' | sed 's/(/\t/' | sed 's/)$//' |  sort | uniq | grep -v "^B" |awk -F "\t" '{print$1"\t"$2"\t""Biological process"}' > GOID-NAME-BP.list
#CC MF 类似
cat GOID-NAME-BP.list GOID-NAME-CC.list GOID-NAME-MF.list  > GOID-NAME-ALL.list
awk -F "\t" 'NR==FNR{a[$1]=$0; next} {print a[$1]}'  GOID-NAME-ALL.list  acomys.expand.finalgo.txt  > acomys.expand.GOID.txt

后续富集工作不再赘述

上一篇 下一篇

猜你喜欢

热点阅读