lncRNA分析
2021-08-04 本文已影响0人
宗肃書
- 处理原始数据文件
cat V1-2_L4_160A60.R1.fastq1 V1-2_L4_160A60.R1.fastq2 > V1-2_L4_160A60.R1.fastq #合并补测的数据
cat V1-2_L4_160A60.R2.fastq1 V1-2_L4_160A60.R2.fastq2>V1-2_L4_160A60.R2.fastq
cat V1-4_L4_162A62.R1.fastq1 V1-4_L4_162A62.R1.fastq2 V1-4_L4_162A62.R1.fastq
cat V1-4_L4_162A62.R1.fastq1 V1-4_L4_162A62.R1.fastq2 > V1-4_L4_162A62.R1.fastq
cat V1-4_L4_162A62.R2.fastq1 V1-4_L4_162A62.R2.fastq2 >V1-4_L4_162A62.R2.fastq
mv V0-1_L4_157A57.R1.fastq.gz V0-1.R1.fastq.gz
mv V0-1_L4_157A57.R2.fastq.gz V0-1.R2.fastq.gz
mv V0-3_L4_158A58.R1.fastq.gz V0-3.R1.fastq.gz
mv V0-3_L4_158A58.R2.fastq.gz V0-3.R2.fastq.gz
mv V0-4_L4_159A59.R1.fastq.gz V0-4.R1.fastq.gz
mv V0-4_L4_159A59.R2.fastq.gz V0-4.R2.fastq.gz
mv V1-2_L4_160A60.R1.fastq.gz V1-2.R1.fastq.gz
mv V1-2_L4_160A60.R2.fastq.gz V1-2.R2.fastq.gz
mv V1-3_L4_161A61.R1.fastq.gz V1-3.R1.fastq.gz
mv V1-3_L4_161A61.R2.fastq.gz V1-3.R2.fastq.gz
mv V1-4_L4_162A62.R1.fastq.gz V1-4.R1.fastq.gz
mv V1-4_L4_162A62.R2.fastq.gz V1-4.R2.fastq.gz
- rawdata质控
mkdir cleandata
ls `pwd`/*.R1.fastq.gz > rawdata_fastq_1
ls `pwd`/*.R2.fastq.gz > rawdata_fastq_2
paste rawdata_fastq_1 rawdata_fastq_2> rawdata_fastq
------------------------------------------------------------------------
vim qc.sh
cat rawdata_fastq|while read id
do
arr=(${id})
fq1=${arr[0]}
fq2=${arr[1]}
trim_galore -q 25 --phred33 --length 36 -e 0.1 --stringency 3 --paired -o cleandata/ $fq1 $fq2 &
done
chmod a+x qc.sh
nohup ./qc.sh
- 统计cleandatabase
mkdir cleandataBase
ls `pwd`/*.R1_val_1.fq.gz > cleandata_fastq_1
ls `pwd`/*.R2_val_2.fq.gz > cleandata_fastq_2
find *_1.fq.gz >sample
sed -i "s/.R1_val_1.fq.gz/ /g" sample
paste cleandata_fastq_1 cleandata_fastq_2 sample> cleandata_fastq
vim fc.sh
cat cleandata_fastq|while read id
do
arr=(${id})
fq1=${arr[0]}
fq2=${arr[1]}
sample=${arr[2]}
fastq-count $fq1 $fq2 >cleandataBase/$sample &
done
chmod a+x fc.sh
nohup ./fc.sh
cd cleandataBase
cat * > cleandataBase
sed -i 's/}/ /g' cleandataBase
sed -i 's/:/\t/g' cleandataBase
sed -i 's/,/\t/g' cleandataBase
cat cleandataBase | awk '{print $2}' > cleandatabases
cat cleandataBase | awk '{print $4}' > cleandatareads
cat cleandatabases
cat cleandatareads
- 建立索引
cd /public/jychu/reference/index/hisat/sheep
wget http://ftp.ensembl.org/pub/release-104/fasta/anser_brachyrhynchus/dna/Anser_brachyrhynchus.ASM259213v1.dna.toplevel.fa.gz
gzip -d Anser_brachyrhynchus.ASM259213v1.dna.toplevel.fa.gz #解压参考基因组文件
mkdir index #新建一个存放构建参考索引的目录
hisat2-build Anser_brachyrhynchus.ASM259213v1.dna.toplevel.fa index/index #构建以index为前缀的索引文件,并存放到index目录下
cd /public/jychu/reference/annotate/goose
wget http://ftp.ensembl.org/pub/release-104/gtf/anser_brachyrhynchus/Anser_brachyrhynchus.ASM259213v1.104.gtf.gz
gzip -d Anser_brachyrhynchus.ASM259213v1.104.gtf.gz
- hisat2比对
cd /public/jychu/lncRNA-goose/cleandata
mkdir bam
ls `pwd`/*_1.fq.gz > cleandata_fastq_1
ls `pwd`/*_2.fq.gz > cleandata_fastq_2
find *_1.fq.gz >sample
sed -i "s/.R1_val_1.fq.gz/ /g" sample
paste cleandata_fastq_1 cleandata_fastq_2 sample> cleandata_fastq
vim compare.sh
INDEX='/public/jychu/reference/index/hisat/goose/index/index'
cat cleandata_fastq|while read id
do
arr=(${id})
fq1=${arr[0]}
fq2=${arr[1]}
sample=${arr[2]}
hisat2 -p 8 -x $INDEX -1 $fq1 -2 $fq2 | samtools view -S -b - > bam/$sample.bam &
done
chmod a+x compare.sh
nohup ./compare.sh
- 排序
cd bam
ls *.bam|while read id
do
samtools sort -@ 4 $id -o ${id%.*}.sorted
rm -f ${id%.*}.bam
done
- 统计比对率
mkdir flagstat
ls *.bam > flagstat_bam
find *.bam>bam_sample
sed -i "s/.sorted.bam/ /g" bam_sample
paste flagstat_bam bam_sample >flagstatbam_list
vim flagstatbam.sh
cat flagstatbam_list |while read id
do
arr=(${id})
bam=${arr[0]}
sample=${arr[1]}
samtools flagstat $bam > flagstat/$sample.flagstat.txt &
done
chmod a+x flagstatbam.sh
nohup ./flagstatbam.sh
cd flagstat
cat * > flagstat
grep "0 mapped" flagstat > flagstats
cat flagstats | awk '{print $5}' > flag
sed -i "s/(/ /g" flag
sed -i "s/%/ /g" flag
cat flag
- stringTie组装
mkdir gtfs
for i in `ls *.sorted`;
do new=${i%.*}.sorted.bam
mv "$i" "$new"
done
ls *.sorted.bam|while read id
do nohup stringtie $id -p 4 -G /public/jychu/reference/annotate/goose/Anser_brachyrhynchus.ASM259213v1.104.gtf -o gtfs/${id}.gtf -l ${id%.*} &
done
- 合并转录本
cd gtfs
mkdir merged
ls *.gtf > merge.list
stringtie --merge -G /public/jychu/reference/annotate/goose/Anser_brachyrhynchus.ASM259213v1.104.gtf merge.list -o merged/merged.gtf
- 定量
cd /public/jychu/lncRNA-goose/cleandata/bam
mkdir ballgown
ls *sorted.bam|while read id
do stringtie -e $id -p 4 -G gtfs/merged/merged.gtf -o ballgown/${id%%.*}.bg.gtf -b ballgown/${id}_out_dir
done #应该是为了取所有样本都注释到的转录本进行定量
#制作转录本表达矩阵 用ballgrown的transcript_fpkm = texpr(bg, 'FPKM') 可以直接得出表达矩阵,不过需要把编号和转录本ID对应替换一下
cd ballgown
mkdir quality
#获取每个样本的转录本ID
ls *.bg.gtf|while read id; do cat $id| awk '$3=="transcript"'|grep -Eo 'transcript_id \"\w+|transcript_id \"\w+\.\w+\.\w+'|cut -d\" -f 2 > quality/${id%%.*}.t; done
#获取每个样本的FPKM值
ls *.bg.gtf|while read id
do
less $id|awk '$3=="transcript"'|grep -Eo 'FPKM \"\w+\.\w+'|awk -F\" '{print$2}' > quality/${id%%.*}.value
done
cd quality
#合并每个样本转录本和对应的FPKM值
ls *.t|while read id; do paste ${id%%.*}.t ${id%%.*}.value > ${id%%.*}.FPKM ; done
#按转录本字母排序
ls *.FPKM|while read id;do less $id|sort -k 1 > ${id%.*}.sorted.FPKM;done
#提取样本的第一列并加列名
echo "transcript" > FPKM
less `ls *.sorted.FPKM|shuf -n1`|cut -f 1 >> FPKM
#提取每个样本的第二列并加列名
ls *.sorted.FPKM|while read id;do echo ${id%%.*} > ${id%%.*}.tmp;less $id|cut -f 2 >> ${id%%.*}.tmp;done
#合并转录ID列的文件以及所有样本的FPKM值
paste FPKM *.tmp > all.FPKM
#删除除了all.FPKM的所有文件
ls *.value *.t *.FPKM *.tmp FPKM|grep -v all.FPKM |xargs rm
#至此获得了转录本的表达矩阵 并下载至桌面上
- ballgrown差异分析
cd /public/jychu/lncRNA-goose/cleandata/bam/ballgown
conda activate dna
R
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("ballgown")
library(ballgown)
bg = ballgown(samples = c("V0-1.sorted.bam_out_dir", "V0-3.sorted.bam_out_dir","V0-4.sorted.bam_out_dir","V1-2.sorted.bam_out_dir","V1-3.sorted.bam_out_dir","V1-4.sorted.bam_out_dir"),meas='all')
#查看转录本水平的表达量的代码示例如下
transcript_fpkm = texpr(bg, 'FPKM')
head(transcript_fpkm)
pData(bg) <- data.frame(id=sampleNames(bg),group=rep(c(1,0), each=3)) #分组
# 转录本水平的差异分析
stat_results = stattest(bg,feature='transcript',meas='FPKM',covariate='group')
write.csv(stat_results, "ballgrown_transcript_results.csv",row.names=FALSE)
------------------------------------------------------------------
cd /public/jychu/lncRNA-goose/cleandata/bam/ballgown/V0-1.sorted.bam_out_dir
cat t_data.ctab|cut -f2-8>test #下载到桌面上
-
初步筛选lncRNA#
### compare to reference transcripts, parameter M refer to remove single exon transcripts
cd /public/jychu/lncRNA-goose/cleandata/bam/gtfs/merged
mkdir Identification
cd Identification
gffcompare -R -M -r /public/jychu/reference/annotate/goose/Anser_brachyrhynchus.ASM259213v1.104.gtf -s /public/jychu/reference/index/hisat/goose/Anser_brachyrhynchus.ASM259213v1.dna.toplevel.fa ../merged.gtf
## class code filteration
#进入soft目录下载gtftools
cd ~/soft/
wget https://github.com/Dukunbioinfo/in-house-gtftools/archive/master.zip
unzip master.zip #生成in-house-gtftools-master文件夹
cd in-house-gtftools-master/
g++ *.cpp -o inhouseGTFtools #安装,得到一个绿色的可执行文件
~/soft/in-house-gtftools-master/inhouseGTFtools STscreen -classCode u -i gffcmp.annotated.gtf > u.gtf #代表基因间
~/soft/in-house-gtftools-master/inhouseGTFtools STscreen -classCode i -i gffcmp.annotated.gtf > i.gtf #代表内含子
~/soft/in-house-gtftools-master/inhouseGTFtools STscreen -classCode x -i gffcmp.annotated.gtf > x.gtf #代表反义
cat i.gtf u.gtf x.gtf > iux.gtf
#得到read序列长度
#下载gffread
conda install gffread -y
# read sequence
gffread -w iux.fa -g /public/jychu/reference/index/hisat/goose/Anser_brachyrhynchus.ASM259213v1.dna.toplevel.fa iux.gtf
samtools faidx iux.fa
# length filteration #转录本的长度是指转录本中所有外显子加起来的长度
less iux.fa.fai |awk '$2>200{print$1}' > iuxL200.MSTRG #筛选长度大于200的转录本
## 根据FPKM值过滤
cat iuxL200.MSTRG | tr "." "\t" > iuxL200.tmp
cat iuxL200.tmp | while read id;
do
arr=(${id})
MSTRG=${arr[0]}
N1=${arr[1]}
N2=${arr[2]}
grep -w "$MSTRG\.$N1\.$N2" ../../../ballgown/quality/all.FPKM >> iux200FPKM.tmp
done
cat iux200FPKM.tmp | awk '$2>0.1 || $3>0.1 || $4>0.1 || $5>0.1 || $6>0.1 || $7>0.1 {print$1}' > iux200LFPKM0.1.name #提取至少在一个样本里有表达量的转录本
cat iux200LFPKM0.1.name | tr "." "\t" > iux200LFPKM0.1.tmp
cat iux200LFPKM0.1.tmp | while read id;
do
arr=(${id})
MSTRG=${arr[0]}
N1=${arr[1]}
N2=${arr[2]}
grep -w "$MSTRG\.$N1\.$N2" gffcmp.annotated.gtf >> iux200LFPKM0.1.gtf
done
gffread -w iux200LFPKM0.1.fa -g /public/jychu/reference/index/hisat/goose/Anser_brachyrhynchus.ASM259213v1.dna.toplevel.fa iux200LFPKM0.1.gtf
rm -f *.tmp
-
鉴定lncRNA#
1.CPC2
先下载软件压缩包到桌面,地址(http://cpc2.gao-lab.org/data/CPC2-beta.tar.gz)
tar xzvf CPC2-beta.tar.gz
cd CPC2-beta/
ls
pwd
export CPC_HOME="/public/jychu/soft/CPC2-beta"
cd libs/libsvm
tar xzvf libsvm-3.18.tar.gz
cd libsvm-3.18
ls
make clean && make
1.1 使用
conda activate python2 #cpc2脚本只能在python2环境下使用
cd /public/jychu/lncRNA-goose/cleandata/bam/gtfs/merged/Identification
mkdir lncRNA
python /public/jychu/soft/CPC2-beta/bin/CPC2.py -i iux200LFPKM0.1.fa -o lncRNA/goose.cpc2 #运行命令
goose.cpc2.txt #输出文件
grep "noncoding" goose.cpc2.txt |cut -f1 >goose.cpc2.lnc
2.CPAT 使用方法http://rna-cpat.sourceforge.net/
pip install CPAT #下载软件
# 已有的文件在这个网址 https://sourceforge.net/projects/rna-cpat/files/v1.2.2/prebuilt_model/
#构建训练模型
1.CAPT预测
#非模式动物构建训练集
mkdir -p /public/jychu/reference/index/hisat/goose
wget http://ftp.ensembl.org/pub/release-103/fasta/anser_brachyrhynchus/cdna/Anser_brachyrhynchus.ASM259213v1.cdna.all.fa.gz
wget http://ftp.ensembl.org/pub/release-103/fasta/anser_brachyrhynchus/ncrna/Anser_brachyrhynchus.ASM259213v1.ncrna.fa.gz
conda install seqkit
grep "protein" Anser_brachyrhynchus.ASM259213v1.cdna.all.fa | sed "s/>//g" | tr " " "\t"|cut -f1 > cdna.name #提取cdna序列名称
cat Anser_brachyrhynchus.ASM259213v1.cdna.all.fa | seqkit grep -f cdna.name > cdna.name.fa #提取cdna名称对应的序列
grep "lncRNA" Anser_brachyrhynchus.ASM259213v1.ncrna.fa| sed "s/>//g" | tr " " "\t"|cut -f1 > lncRNA.name #提取lncRNA序列名称
cat Anser_brachyrhynchus.ASM259213v1.ncrna.fa | seqkit grep -f lncRNA.name > lncRNA.name.fa #提取lncRNA名称对应的序列
mkdir -p /public/jychu/soft/CPAT/goose
make_hexamer_tab.py -c cdna.name.fa -n lncRNA.name.fa >/public/jychu/soft/CPAT/goose/goose_Hexamer.tsv
make_logitModel.py -x ~/soft/CPAT/goose/goose_Hexamer.tsv -c cdna.name.fa -n lncRNA.name.fa -o /public/jychu/soft/CPAT/goose/goose
cpat.py -x /public/jychu/soft/CPAT/goose/goose_Hexamer.tsv -d /public/jychu/soft/CPAT/goose/goose.logit.RData -g iux200LFPKM0.1.fa -o lncRNA/goose.CAPT
awk '$2>=200' goose.CAPT.ORF_prob.tsv |cut -f1|tr "_" "\t"|cut -f1|sort|uniq > capt.alltrans #提取ORF长度大于200的所有转录本id
awk '$10>=0.3 && $2>=200' goose.CAPT.ORF_prob.tsv|cut -f1|tr "_" "\t"|cut -f1|sort|uniq > goose.CAPT.unlncRNA #提取编码潜力大于0.3的转录本id
cat goose.unlncRNA capt.alltrans|sort|uniq -u > goose.capt.lnc #提取编码潜力小于0.3,ORF长度大于200的转录本id
3.EMBOSS transeq 把核酸序列转换成氨基酸序列
网址:https://www.ebi.ac.uk/Tools/st/emboss_transeq/ 在线版本,缺点是序列文件最大为1M,最多序列数为500
把iux200LFPKM0.1.fa 下载到桌面
cd /public/jychu/lncRNA-goose/cleandata/bam/gtfs/merged/Identification
seqkit seq iux200LFPKM0.1.fa -w 0 > goose.iux200LFPKM0.1.fa #把序列转换成一行
wc -l goose.iux200LFPKM0.1.fa
8034 goose.iux200LFPKM0.1.fa
cd /mnt/c/Users/TE/Desktop/lncRNA/绍梅师姐数据
sed -n '1,250p' goose.iux200LFPKM0.1.fa >goose-1.iux200LFPKM0.1.fa
sed -n '251,450p' goose.iux200LFPKM0.1.fa >goose-2.iux200LFPKM0.1.fa
sed -n '451,650p' goose.iux200LFPKM0.1.fa >goose-3.iux200LFPKM0.1.fa
sed -n '651,858p' goose.iux200LFPKM0.1.fa >goose-4.iux200LFPKM0.1.fa
sed -n '859,890p' goose.iux200LFPKM0.1.fa >goose-5.iux200LFPKM0.1.fa
sed -n '891,1160p' goose.iux200LFPKM0.1.fa >goose-6.iux200LFPKM0.1.fa
sed -n '1161,1420p' goose.iux200LFPKM0.1.fa >goose-7.iux200LFPKM0.1.fa
sed -n '1421,1700p' goose.iux200LFPKM0.1.fa >goose-8.iux200LFPKM0.1.fa
sed -n '1701,1810p' goose.iux200LFPKM0.1.fa >goose-9.iux200LFPKM0.1.fa
sed -n '1811,2070p' goose.iux200LFPKM0.1.fa >goose-10.iux200LFPKM0.1.fa
sed -n '2071,2290p' goose.iux200LFPKM0.1.fa >goose-11.iux200LFPKM0.1.fa
sed -n '2291,2476p' goose.iux200LFPKM0.1.fa >goose-12.iux200LFPKM0.1.fa
sed -n '2477,2680p' goose.iux200LFPKM0.1.fa >goose-13.iux200LFPKM0.1.fa
sed -n '2681,3000p' goose.iux200LFPKM0.1.fa >goose-14.iux200LFPKM0.1.fa
sed -n '3001,3300p' goose.iux200LFPKM0.1.fa >goose-15.iux200LFPKM0.1.fa
sed -n '3301,3562p' goose.iux200LFPKM0.1.fa >goose-16.iux200LFPKM0.1.fa
sed -n '3563,3800p' goose.iux200LFPKM0.1.fa >goose-17.iux200LFPKM0.1.fa
sed -n '3801,4100p' goose.iux200LFPKM0.1.fa >goose-18.iux200LFPKM0.1.fa
sed -n '4101,4400p' goose.iux200LFPKM0.1.fa >goose-19.iux200LFPKM0.1.fa
sed -n '4401,4840p' goose.iux200LFPKM0.1.fa >goose-20.iux200LFPKM0.1.fa
sed -n '4841,5200p' goose.iux200LFPKM0.1.fa >goose-21.iux200LFPKM0.1.fa
sed -n '5201,5550p' goose.iux200LFPKM0.1.fa >goose-22.iux200LFPKM0.1.fa
sed -n '5551,5776p' goose.iux200LFPKM0.1.fa >goose-23.iux200LFPKM0.1.fa
sed -n '5777,6100p' goose.iux200LFPKM0.1.fa >goose-24.iux200LFPKM0.1.fa
sed -n '6101,6500p' goose.iux200LFPKM0.1.fa >goose-25.iux200LFPKM0.1.fa
sed -n '6501,6870p' goose.iux200LFPKM0.1.fa >goose-26.iux200LFPKM0.1.fa
sed -n '6871,7500p' goose.iux200LFPKM0.1.fa >goose-27.iux200LFPKM0.1.fa
sed -n '7501,8034p' goose.iux200LFPKM0.1.fa >goose-28.iux200LFPKM0.1.fa
每一个结果保存为txt格式
cat *.txt>goose.iux.ped #合并所有结果,上传至服务器/public/jychu/lncRNA-goose/cleandata/bam/gtfs/merged/Identification/lncRNA下
seqkit seq goose.iux.ped -w 0 >goose.iux.ped.fa #把序列转换成一行
grep -rn "MSTRG.13560.1" goose.iux.ped.fa #查看要去除的转录本的行数,转录本翻译成的蛋白长度不能超过100k
42493:>MSTRG.13560.1_1
42495:>MSTRG.13560.1_2
42497:>MSTRG.13560.1_3
42499:>MSTRG.13560.1_4
42501:>MSTRG.13560.1_5
42503:>MSTRG.13560.1_6
sed -i "42493,42504d" goose.iux.ped.fa #去除转录本和它的下一行序列行
step1 通过hmmmerspress 来把下载的pfam数据建库
使用conda安装Pfam_scan ,hmmpress包含在里面
conda install pfam_scan
下载pfam数据
cd database
mkdir pfam
cd pfam
wget ftp://ftp.ebi.ac.uk:21/pub/databases/Pfam/current_release/Pfam-A.hmm.gz
wget ftp://ftp.ebi.ac.uk:21/pub/databases/Pfam/current_release/Pfam-A.hmm.dat.gz
gunzip *.gz
hmmpress Pfam-A.hmm #通过hmmerspress来把下载的数据建库
step2 用HMMER和pfam数据库比对,过滤比对上的序列(Evalue<1e-5)
cd /public/jychu/lncRNA-goose/cleandata/bam/gtfs/merged/Identification/lncRNA
nohup pfam_scan.pl -fasta goose.iux.ped.fa -dir /public/jychu/database/pfam -out goose.pfam_scan.out &
grep -v "#" goose.pfam_scan.out | grep -v "^\s*$" |awk '{if($13<1e-5){print$1}}' | awk -F "_" '{print$1}'|sort|uniq>pfam.coding.ID #提取有编码潜能的ID
cat pfam.coding.ID ../iux200LFPKM0.1.name|sort|uniq -u |grep -v "MSTRG.13560.1" >goose.pfam.lnc #得到不具有编码能力的转录本id,也就是lncRNA
step3 利用diamond将蛋白序列和nr-animal,uniRef100数据库比对,过滤比对上的序列(1e-5)
1.和nr-animal比对
~/soft/diamond/diamond blastp -d ~/database/nr/nr.animal -q goose.iux.ped.fa --sensitive --evalue 1e-5 -o goose.nr-animal_matches_fmt6.txt
cat goose.nr-animal_matches_fmt6.txt| awk '{if($3>80){print$1}}'|awk -F "_" '{print$1}'|sort|uniq>nr.coding.ID #选择比对上的序列片段相似性大于80%的作为有编码能力的
cat nr.coding.ID ../iux200LFPKM0.1.name|sort|uniq -u |grep -v "MSTRG.13560.1" >goose.nr.lnc
2.和uniRef100比对
cd ~/database
mkdir uniRef
cd uniRef
wget https://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref100/uniref100.fasta.gz
gunzip uniref100.fasta.gz #解压
~/soft/diamond/diamond makedb --in uniref100.fasta --db uniref100 #建立数据库索引
~/soft/diamond/diamond blastp -d ~/database/uniRef/uniref100 -q goose.iux.ped.fa --sensitive --evalue 1e-5 -o goose.uniref100_matches_fmt6.txt
cat goose.uniref100_matches_fmt6.txt| awk '{if($3>80){print$1}}'|awk -F "_" '{print$1}'|sort|uniq>uniref100.coding.ID
cat uniref100.coding.ID ../iux200LFPKM0.1.name|sort|uniq -u |grep -v "MSTRG.13560.1" >goose.uniref100.lnc
mkdir lnc
cp *.lnc lnc/ #把所有预测的lncRNA文件放在一个文件夹下面,便于下载和查找
cat *.lnc|sort |uniq -c|tr " " "\t"|cut -f7,8|awk '{if($1==5){print$2}}'>goose.lncRNA.lnc #重叠的lncID,也就是鉴定出来的lncRNA
- 画韦恩图
setwd("C:/Users/TE/Desktop/lncRNA/绍梅师姐数据/LNC")
V1 = read.table("goose.capt.lnc",header=F, sep="\t")
capt=V1$V1
V2 = read.table("goose.cpc2.lnc",header=F, sep="\t")
cpc2=V2$V1
V3 = read.table("goose.nr.lnc",header=F, sep="\t")
nr=V3$V1
V4 = read.table("goose.pfam.lnc",header=F, sep="\t")
pfam=V4$V1
V5 = read.table("goose.uniref100.lnc",header=F, sep="\t")
uniref=V5$V1
library(VennDiagram)
venn.diagram(list("CAPT" = capt,"CPC2" =cpc2,"NR" = nr,"Pfam" =pfam,"UniRef100" = uniref),height=6000,width=6000,resolution=840,
filename="VennPlot.tiff",col="white",fill = c("#7fffd4", "#ff69b4","#ffa07a","#7cfc00","#ff8c00"),
alpha = 0.55,fontfamily = "serif",fontface = "bold",
cat.col = "black",cat.cex=1.6,cat.dist = c(0.18,0.192,0.22,0.22,0.196),
cat.pos = c(0,330,250,115,25),cat.fontfamily = "serif")
mRNA鉴定的思路:完全匹配到粉脚鹅已有的参考基因组上mRNA的转录本定义为已知的mRNA
而lncRNA分为已知的lncRNA和新的lncRNA
- 把所有转录本注释分类为mRNA\lncRNA、新的鉴定的lncRNA等
cd /public/jychu/reference/annotate/goose
cat Anser_brachyrhynchus.ASM259213v1.104.gtf |awk '{if($3=="transcript"){print$0}}'|tr ";" "\t" |sort -k11|cut -f1,4,5,7,9,11,13-15|sed "s/gene_id //g"|sed "s/transcript_id //g"|sed "s/\"//g"|tr " " ":"|sed "s/:ENSABRT/ENSABRT/g"|sed "s/:gene/gene/g"|awk -F "\t" '{if($7=="gene_source:ensembl"){$7=$6}}{print$0}'|tr " " "\t"|awk -F "\t" '{if($8=="gene_source:ensembl"){$8=$9}}{print$0}'|tr " " "\t"|cut -f1-8|sed -e "s/gene_name://g;s/gene_biotype://g">goose.transcript.type
下载到桌面,用excel打开
- 新的lncRNA
cd /mnt/c/Users/TE/Desktop/lncRNA/绍梅师姐数据/LNC
cat *.lnc |sort|uniq -c|tr " " "\t"|cut -f7-8 |awk '{if($1==5){print$0}}'|cut -f2>novel.lnc
cat novel.lnc | while read id;
do
arr=(${id})
MSTRG=${arr[0]}
N1=${arr[1]}
N2=${arr[2]}
grep -w "$MSTRG\.$N1\.$N2" ../ballgown/alltranscript.txt >> novel.lnc.expression
done
- 取新的转录本对应的xloc编号
cd /public/jychu/lncRNA-goose/cleandata/bam/gtfs/merged/Identification
cat iux.gtf|awk '{if($3=="transcript"){print$0}}'>test.tmp
vi novel.lnc
Vim 的命令模式中输入 :%s/^M$//g 消除EXCEL模式
cat novel.lnc|tr "." "\t" >novel.lnc.tmp
vi test.sh
cat novel.lnc.tmp | while read id;
do
arr=(${id})
MSTRG=${arr[0]}
N1=${arr[1]}
N2=${arr[2]}
grep -w "$MSTRG\.$N1\.$N2" test.tmp >> novel.lnc.tmp.tmp
done
chmod a+x test.sh
./test.sh
cat novel.lnc.tmp.tmp|cut -f9|tr ";" "\t"|cut -f1,3- |sed "s/\"//g"|sed -e "s/transcript_id //g;s/xloc //g"|awk '{$NF=NULL;print}'|awk '{$NF=NULL;print}'|sed "s/class_code//g"|awk '{if($2=="gene_name"){print$1"\t"$4"\t"$7"\t"$2":"$3";"$5":"$6}else{print$1"\t"$2"\t"$3}}'>novel.lnc.test
下载到桌面上
rm -f *.tmp
- 差异表达的lncRNA顺式靶基因预测
先把差异lncRNA筛选出来
在坐标前后加减100kb
然后把所有的mRNA整理出来
-wa -wb 输出overlap的区域所在-a和-b中的原内容:
bedtools intersect -a lnc.bed.txt -b mRNA.bed.txt -wb |cut -f4,8|sort|uniq >cis.lnc #-wb会输出overlap的区域和其中-b文件中的内容
- 差异表达的lncRNA的反式靶基因
准备文件如下格式
R
data<-read.table(file="lnc-mRNA.fpkm",row.names=1,header=F)
corr<- round(cor(t(data)),3) #round是保留小数位的函数
head(corr[,1:16])
id = row.names(corr)
n = dim(corr)[1]
re = data.frame(ID1= rep(id,n),ID2=rep(id,each = n),r = round(as.vector(corr),3))
nrow(re) #查看有多少行
res=subset(re,re$r>0.95 | re$r< -0.95)
res1<-res[grep(pattern="lncRNA",res[,1]),] #提取第一列有lncRNA字符的行
res2<-res1[grep(pattern="protein_coding",res1[,2]),] #在前面的基础上提取第二列有protein_coding的行(也就是去掉lnc与lnc有相关的行)
write.table(res2,file="trans.lnc",sep="\t")
- 找有表达量的lncRNA做顺式反式靶基因预测
cd /mnt/c/Users/TE/Desktop/lncRNA/绍梅师姐数据/ballgown
bedtools intersect -a alllnc.bed -b mRNA.bed.txt -wa -wb |cut -f4,5,9,10>cis.alllnc
cd /mnt/c/Users/TE/Desktop/lncRNA/绍梅师姐数据/ballgown
awk '{print $1":"$2":"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9}' alllnc-mRNA.fpkm.txt >alllnc-mRNA.fpkm
R
data<-read.table(file="alllnc-mRNA.fpkm",row.names=1,header=F)
corr<- round(cor(t(data)),3) #round是保留小数位的函数
head(corr[,1:16])
id = row.names(corr)
n = dim(corr)[1]
re = data.frame(ID1= rep(id,n),ID2=rep(id,each = n),r = round(as.vector(corr),3))
nrow(re) #查看有多少行
res=subset(re,re$r>0.95 | re$r< -0.95)
res1<-res[grep(pattern="lncRNA",res[,1]),] #提取第一列有lncRNA字符的行
res2<-res1[grep(pattern="protein_coding",res1[,2]),] #在前面的基础上提取第二列有protein_coding的行(也就是去掉lnc与lnc有相关的行)
write.table(res2,file="trans.alllnc",sep="\t")