plink 进行LD过滤
2023-06-23 本文已影响0人
bioshimmer
SNP过滤要进行LD过滤,本文用plink进行LD过滤。
版本:PLINK v1.90b7 64-bit (16 Jan 2023)
一、染色体格式转换和添加ID列
plink染色体只识别chr1,如果染色体为其它如scaffold1则会报错。需要更改染色体名字。
还需将vcf文件ID列设为chr_pos作为标识。使用如下命令:
python3 addvcfid.py input.vcf.gz outvcf.gz
脚本如下:
import gzip
import sys
if(len(sys.argv) < 3):
print("Usage:addvcfid.py infile outfile")
exit(1)
count=0
infilepath=sys.argv[1]
outfilepath=sys.argv[2]
infile=gzip.open(infilepath,"rt")
outfile=gzip.open(outfilepath,'wt')
for line in infile:
if '#' in line:
outfile.write(line)
elif 'HiC_scaffold_' in line:
line=line
line=line.replace('HiC_scaffold_','chr').rstrip()
b=line.split()
b[2]=b[0]+'_'+b[1]
outfile.write("\t".join(b)+"\n")
count+=1
else:
print(line)
infile.close()
outfile.close()
print("total have "+str(count)+" snps")
count=0
我这里是将HiC_scaffold_替换为chr,如果染色体是其它名字,请自行更改。效果如下:
image.png
二、vcf转为map格式
首先需要将vcf文件转为map文件
vcftools --gzvcf input.vcf.gz --plink --out ab.rename.plink
plink --noweb --file ab.rename.plink --recode12 --out ac.rename.plink12
三、计算LD
执行如下命令:
plink --file ac.rename.plink12 --indep-pairwise 100kb 1 0.5 --out ld
此时文件目录如下,ld.100kb.1.0.5.prune.in文件是要保留的位点。
current_dir.png
位点的格式即为vcf的ID列,下一步将这些位点提出即可。
ld.100kb.1.0.5.prune.in
四、提取位点
- 使用bcftools提取位点,需要将_替换为Tab即\t,参考这篇博文:
sed -e $'s/_/\t/g' ld.100kb.1.0.5.prune.in > ld.prune.bcftools.txt
https://www.cnblogs.com/mmtinfo/p/11945592.html
老版本bcftools提取速度很慢,bcftools 1.13版本测试还是挺快的。
-
自己写了一个脚本,大概25min,跟bcftools差不多。将vcf拆成染色体进行过滤。
draw_site_from_vcf.py -h
run.png
"""
author:bioshimmer
date:2023年6月29日
"""
import gzip
import os
import multiprocessing as mp
from functools import partial
import datetime
import argparse
import tempfile
def get_header_and_chrlist(vcffile,out_vcf):
print("Begin get the vcf header and chr number.")
in_vcf = gzip.open(vcffile,"rt")
headlist = []
chrlist = []
while True:
line = in_vcf.readline()
if line:
if '#' in line:
headlist.append(line)
else:
if line.split()[0] in chrlist:
pass
else:
chrlist.append(line.split()[0])
else:
break
in_vcf.close()
#print(chrlist)
with open(out_vcf,"w") as f:
for i in headlist:
f.write(i)
print("Success to get the vcf header and the chr list is :")
print(chrlist)
return chrlist
def vcf_chr_split(vcffile,tmp_path,chr_list):
#chr_list = ['chr'+ str(i) for i in range(1,14)]
print("Begin to split the vcf into tmp dirs by chr number.")
chr_file_dict = {}
for i in chr_list:
chr_file_dict[i] = open(tmp_path+'QJ.'+i+'.temp.vcf','w')
in_vcf_file = gzip.open(vcffile,"rt")
#out_vcf_file = open(outvcf,"w")
while True:
line = in_vcf_file.readline()
if line:
if '#' in line:
pass
else:
chr_pos = line.split()[0]
chr_file_dict[chr_pos].write(line)
else:
break
in_vcf_file.close()
for i in chr_file_dict.values():i.close()
print("Success to split the vcf into tmp dirs by chr number.")
def get_ld_dict(in_ld_pos,chr_list):
print("Get the ld dict.")
ld_dict = {}
for i in chr_list:
ld_dict[i] = []
with open(in_ld_pos,"r") as f:
for line in f.readlines():
ld_dict[line.rstrip().split('_')[0]].append(line.rstrip())
print("The ld dict is :")
print("chr\tsites number")
for i,j in ld_dict.items():
print(i+'\t'+str(len(j)))
return ld_dict
def ld_filter(ld_dict,tmp_path,chr_list,tmp_file):
print("Began to filter the "+tmp_file)
for chr_num in chr_list:
if chr_num == tmp_file.split('/')[-1].split('.')[1]:
f = open(tmp_file,'r')
out_file = tmp_path+chr_num+'.ld.filter.vcf'
outfile = open(out_file,'w')
while True:
line = f.readline()
if line:
if line.split()[2] in ld_dict[chr_num]:
outfile.write(line)
else:
pass
else:
break
f.close()
outfile.close()
break
print(chr_num+" filter done!")
#return chr_num+" filter done!"
def get_temp_path_files(tmp_path):
tmp_files = []
for i in os.listdir(tmp_path):
tmp_files.append(tmp_path+i)
print("toal "+str(len(tmp_files))+" temp files find.")
return tmp_files
def multi_process_run(cpu_num,ld_dict,chr_list,tmp_path,tmp_files):
print("Your cpu number is "+str(cpu_num))
pool = mp.Pool(cpu_num)
pfunc = partial(ld_filter,ld_dict,tmp_path,chr_list)
pool.imap(pfunc,tmp_files)
pool.close()
pool.join()
def file_combine(tmp_path,out_vcf):
print("files combine begin")
for i in range(1,14):
ld_filter_file = tmp_path+'chr'+str(i)+'.ld.filter.vcf'
f = open(ld_filter_file,'r')
outfile = open(out_vcf,'a+')
outfile.write(f.read())
f.close()
outfile.close()
print("files combine done!")
def main():
start_time = datetime.datetime.now()
parser = argparse.ArgumentParser()
parser.add_argument('-i','--input',type=str,help="input must be vcf.gz file.")
parser.add_argument('-o','--output',type=str,help="output file will in txt format.")
parser.add_argument('-p','--pos_file',type=str,help="the chr postion file you need to filter.")
parser.add_argument('-t','--cpu_num',type=int,help="the cpu number.",default=1)
#parser.add_argument('-d','--tmp_dir',type=str,help="the temp dirs for split file.",default=False)
args = parser.parse_args()
invcf = args.input
out_vcf = args.output
in_ld_pos = args.pos_file
#tmp_path = args.tmp_dir
cpu_num = args.cpu_num
tmp_path = tempfile.mkdtemp(dir=os.path.dirname(invcf))
chr_list = get_header_and_chrlist(vcffile=invcf,out_vcf=out_vcf)
ld_dict = get_ld_dict(in_ld_pos=in_ld_pos,chr_list=chr_list)
tmp_files = get_temp_path_files(tmp_path=tmp_path)
multi_process_run(cpu_num=cpu_num,ld_dict=ld_dict,chr_list=chr_list,tmp_path=tmp_path,tmp_files=tmp_files)
file_combine(tmp_path=tmp_path,out_vcf=out_vcf)
os.removedirs(tmp_path)
end_time = datetime.datetime.now()
print('totally time is '+str(end_time-start_time))
print('Thank you!')
if __name__ == '__main__':
main()