TCGA转录组数据及临床数据下载及整理
2021-07-26 本文已影响0人
萍智医信
一、TCGA转录组数据下载及整理
①进入网站后先查看cart是否为空,若不为空则先清空。
②下载转录组数据
20210726163909.png 基因表达量包括lncRNA和mRNA,使用校正后的FPKM
下载3个文件
20210726164909.png
③下载数据处理
下载得到的压缩包先以该压缩包名称解压,然后用perl将各个子文件夹中的.gz压缩包汇总到同一个文件夹(files)内,perl代码如下
use strict;
use warnings;
use File::Copy;

# Gather every *.gz file found one level below the current directory
# into a single directory named "files" (created when missing).
my $newDir = "files";
unless (-d $newDir) {
    mkdir $newDir or die "Cannot create $newDir: $!";
}

# Snapshot the entries of the current directory before copying anything.
opendir(my $root_dh, ".") or die "Cannot open current directory: $!";
my @allFiles = readdir($root_dh);
closedir($root_dh);

foreach my $subDir (@allFiles) {
    next if $subDir eq '.' or $subDir eq '..';
    next unless -d $subDir;

    # Never descend into the destination directory itself.
    next if $subDir eq $newDir;

    opendir(my $sub_dh, "./$subDir") or die "Cannot open $subDir: $!";
    while (defined(my $file = readdir($sub_dh))) {
        next unless $file =~ /\.gz$/;
        copy("$subDir/$file", $newDir) or die "Copy failed: $!";
    }

    # Directory handles must be released with closedir, not close.
    closedir($sub_dh);
}
然后再将文件中的压缩包解压,结果如下
每个样品基因表达量.png将所有文件合并,正常样本在前面,肿瘤样品在后面
操作方法:将metadata.cart文件和perl脚本放到上图所示的“每个样品基因表达量”文件夹中,运行perl脚本(以metadata文件名作为参数),并用txt文件记录脚本输出的正常和肿瘤样品数目
![9[WXJLT]X2F@SWXZ4FZZOO.png
perl代码如下
use strict;
use warnings;
use JSON;

# Merge the per-sample expression files of the current directory into one
# tab-separated matrix, mRNAmatrix.txt: one row per gene id, normal sample
# columns first, tumor sample columns after.
#
# Usage: perl <this script> <metadata JSON file>
my $file = $ARGV[0];
die "Usage: perl $0 <metadata JSON file>\n" unless defined $file;

# Slurp the metadata file in one read.
open(my $json_fh, '<', $file) or die "Cannot open $file: $!";
my $js = do { local $/; <$json_fh> };
close($json_fh);

my $obj = JSON->new->decode($js);

my %hash          = ();    # gene id => { sample barcode => value }
my @normalSamples = ();
my @tumorSamples  = ();

for my $i (@{$obj}) {
    my $file_name           = $i->{'file_name'};
    my $entity_submitter_id = $i->{'associated_entities'}->[0]->{'entity_submitter_id'};
    next unless defined $file_name && defined $entity_submitter_id;

    # The metadata lists the compressed name; the decompressed file is
    # the one looked up on disk, so drop only the trailing ".gz".
    $file_name =~ s/\.gz$//;
    next unless -f $file_name;

    # The fourth dash-separated barcode field decides the group: a value
    # starting with 0 goes to the tumor list, anything else to normal.
    my @idArr = split(/\-/, $entity_submitter_id);
    if (defined $idArr[3] && $idArr[3] =~ /^0/) {
        push(@tumorSamples, $entity_submitter_id);
    }
    else {
        push(@normalSamples, $entity_submitter_id);
    }

    open(my $exp_fh, '<', $file_name) or die "Cannot open $file_name: $!";
    while (my $line = <$exp_fh>) {
        next if $line =~ /^\n/;    # skip blank lines
        next if $line =~ /^\_/;    # skip lines starting with an underscore
        chomp($line);
        my @arr = split(/\t/, $line);
        $hash{$arr[0]}{$entity_submitter_id} = $arr[1];
    }
    close($exp_fh);
}

my $normalCount = scalar @normalSamples;
my $tumorCount  = scalar @tumorSamples;
my @samples     = (@normalSamples, @tumorSamples);

open(my $out_fh, '>', "mRNAmatrix.txt") or die "Cannot write mRNAmatrix.txt: $!";

# Joining the whole header at once avoids a dangling tab when one of the
# two groups is empty.
print {$out_fh} join("\t", "id", @samples) . "\n";

# Sorted keys give the same row order on every run.
foreach my $key (sort keys %hash) {
    my @values = map { defined $hash{$key}{$_} ? $hash{$key}{$_} : '' } @samples;
    print {$out_fh} join("\t", $key, @values) . "\n";
}
close($out_fh) or die "Cannot close mRNAmatrix.txt: $!";

print "normal count: $normalCount\n";
print "tumor count: $tumorCount\n";
合并结果.png
④ID转换,转换成基因名
输入文件
![输入文件.png
perl 代码如下
use strict;
use warnings;

# Replace the gene ids in the first column of mRNAmatrix.txt with gene
# names taken from human.gtf and write the result to symbol.txt.
# Rows whose id has no entry in the GTF-derived table are dropped.
my $gtfFile = "human.gtf";
my $expFile = "mRNAmatrix.txt";
my $outFile = "symbol.txt";

# gene_id => gene_name, collected from every GTF line carrying gene_id,
# gene_name and gene_biotype attributes (in that order).
my %hash = ();
open(my $gtf_fh, '<', $gtfFile) or die "Cannot open $gtfFile: $!";
while (my $line = <$gtf_fh>) {
    chomp($line);
    if ($line =~ /gene_id \"(.+?)\"\;.+gene_name "(.+?)"\;.+gene_biotype \"(.+?)\"\;/) {
        $hash{$1} = $2;
    }
}
close($gtf_fh);

open(my $exp_fh, '<', $expFile) or die "Cannot open $expFile: $!";
open(my $out_fh, '>', $outFile) or die "Cannot write $outFile: $!";

# The first line is the sample header; copy it through unchanged.
my $header = <$exp_fh>;
print {$out_fh} $header if defined $header;

while (my $line = <$exp_fh>) {
    chomp($line);

    # A limit of -1 keeps trailing empty columns so rows stay aligned
    # with the header.
    my @arr = split(/\t/, $line, -1);
    next unless @arr;

    # Strip everything from the last dot onward (the id version suffix).
    $arr[0] =~ s/(.+)\..+/$1/g;

    if (exists $hash{$arr[0]}) {
        $arr[0] = $hash{$arr[0]};
        print {$out_fh} join("\t", @arr) . "\n";
    }
}
close($out_fh) or die "Cannot close $outFile: $!";
close($exp_fh);
转换结果.png
二、TCGA临床数据下载及整理
①先清空cart
Cases.pngFiles.png
②添加到cart,下载临床数据
20210729205432.png③提取临床信息
生存状态0为存活,1为死亡
结果图.pngperl代码如下
use strict;
use XML::Simple;

# Walk every sub-directory of the current directory, parse each *.xml file
# found there with XML::Simple and write one tab-separated row per file to
# clinical.xls: barcode, survival time, survival status (0 when the status
# is 'Alive', 1 otherwise), age, gender, grade and pathologic stage/T/M/N.
opendir(my $root_dh, ".") or die $!;
my @dirs = readdir($root_dh);
closedir($root_dh);

open(my $out_fh, '>', "clinical.xls") or die $!;
print {$out_fh} "Id\tfutime\tfustat\tAge\tGender\tGrade\tStage\tT\tM\tN\n";

foreach my $dir (@dirs) {
    next if ($dir eq '.');
    next if ($dir eq '..');
    next unless (-d $dir);

    opendir(my $sub_dh, "$dir") or die $!;
    while (defined(my $xmlfile = readdir($sub_dh))) {
        next unless ($xmlfile =~ /\.xml$/);

        my $userxs  = XML::Simple->new(KeyAttr => "name");
        my $userxml = "";
        if (-f "$dir/$xmlfile") {
            $userxml = $userxs->XMLin("$dir/$xmlfile");
        }
        else {
            # Backslash-separated fallback path; the backslash has to be
            # doubled, otherwise "\$" yields a literal dollar sign.
            $userxml = $userxs->XMLin("$dir\\$xmlfile");
        }

        my $disease_code    = $userxml->{'admin:admin'}{'admin:disease_code'}{'content'};    #get disease code
        my $disease_code_lc = lc($disease_code);
        my $patient_key     = $disease_code_lc . ':patient';                                  #ucec:patient
        my $follow_key      = $disease_code_lc . ':follow_ups';

        my $patient_barcode = $userxml->{$patient_key}{'shared:bcr_patient_barcode'}{'content'};    #TCGA-AX-A1CJ
        my $gender          = $userxml->{$patient_key}{'shared:gender'}{'content'};                 #male/female
        my $age             = $userxml->{$patient_key}{'clin_shared:age_at_initial_pathologic_diagnosis'}{'content'};
        my $race            = $userxml->{$patient_key}{'clin_shared:race_list'}{'clin_shared:race'}{'content'};    #white/black
        my $grade           = $userxml->{$patient_key}{'shared:neoplasm_histologic_grade'}{'content'};             #G1/G2/G3

        # Race and the clinical stage fields are read but not written to
        # the output row; only the pathologic fields are printed below.
        my $clinical_stage = $userxml->{$patient_key}{'shared_stage:stage_event'}{'shared_stage:clinical_stage'}{'content'};    #stage I
        my $clinical_T = $userxml->{$patient_key}{'shared_stage:stage_event'}{'shared_stage:tnm_categories'}{'shared_stage:clinical_categories'}{'shared_stage:clinical_T'}{'content'};
        my $clinical_M = $userxml->{$patient_key}{'shared_stage:stage_event'}{'shared_stage:tnm_categories'}{'shared_stage:clinical_categories'}{'shared_stage:clinical_M'}{'content'};
        my $clinical_N = $userxml->{$patient_key}{'shared_stage:stage_event'}{'shared_stage:tnm_categories'}{'shared_stage:clinical_categories'}{'shared_stage:clinical_N'}{'content'};
        my $pathologic_stage = $userxml->{$patient_key}{'shared_stage:stage_event'}{'shared_stage:pathologic_stage'}{'content'};    #stage I
        my $pathologic_T = $userxml->{$patient_key}{'shared_stage:stage_event'}{'shared_stage:tnm_categories'}{'shared_stage:pathologic_categories'}{'shared_stage:pathologic_T'}{'content'};
        my $pathologic_M = $userxml->{$patient_key}{'shared_stage:stage_event'}{'shared_stage:tnm_categories'}{'shared_stage:pathologic_categories'}{'shared_stage:pathologic_M'}{'content'};
        my $pathologic_N = $userxml->{$patient_key}{'shared_stage:stage_event'}{'shared_stage:tnm_categories'}{'shared_stage:pathologic_categories'}{'shared_stage:pathologic_N'}{'content'};

        # Missing values are reported with the placeholder "unknow".
        $gender           = (defined $gender)           ? $gender           : "unknow";
        $age              = (defined $age)              ? $age              : "unknow";
        $race             = (defined $race)             ? $race             : "unknow";
        $grade            = (defined $grade)            ? $grade            : "unknow";
        $clinical_stage   = (defined $clinical_stage)   ? $clinical_stage   : "unknow";
        $clinical_T       = (defined $clinical_T)       ? $clinical_T       : "unknow";
        $clinical_M       = (defined $clinical_M)       ? $clinical_M       : "unknow";
        $clinical_N       = (defined $clinical_N)       ? $clinical_N       : "unknow";
        $pathologic_stage = (defined $pathologic_stage) ? $pathologic_stage : "unknow";
        $pathologic_T     = (defined $pathologic_T)     ? $pathologic_T     : "unknow";
        $pathologic_M     = (defined $pathologic_M)     ? $pathologic_M     : "unknow";
        $pathologic_N     = (defined $pathologic_N)     ? $pathologic_N     : "unknow";

        # Initial survival record from the patient section:
        # "<days>\t<status>".
        my $survivalTime = "";
        my $vital_status = $userxml->{$patient_key}{'clin_shared:vital_status'}{'content'};
        my $followup     = $userxml->{$patient_key}{'clin_shared:days_to_last_followup'}{'content'};
        my $death        = $userxml->{$patient_key}{'clin_shared:days_to_death'}{'content'};
        if ($vital_status eq 'Alive') {
            $survivalTime = "$followup\t0";
        }
        else {
            $survivalTime = "$death\t1";
        }

        # Follow-up records may replace the initial record when they carry
        # a larger day count.
        for my $i (keys %{$userxml->{$patient_key}{$follow_key}}) {
            # Try the entry as a single record (hash) first; the eval
            # fails when the entry is a list of records instead.
            eval {
                $followup     = $userxml->{$patient_key}{$follow_key}{$i}{'clin_shared:days_to_last_followup'}{'content'};
                $vital_status = $userxml->{$patient_key}{$follow_key}{$i}{'clin_shared:vital_status'}{'content'};
                $death        = $userxml->{$patient_key}{$follow_key}{$i}{'clin_shared:days_to_death'}{'content'};
            };
            if ($@) {
                for my $j (0 .. 5) {    # assume at most 6 follow-up records
                    my $followup_for     = $userxml->{$patient_key}{$follow_key}{$i}[$j]{'clin_shared:days_to_last_followup'}{'content'};
                    my $vital_status_for = $userxml->{$patient_key}{$follow_key}{$i}[$j]{'clin_shared:vital_status'}{'content'};
                    my $death_for        = $userxml->{$patient_key}{$follow_key}{$i}[$j]{'clin_shared:days_to_death'}{'content'};
                    if (($followup_for =~ /\d+/) || ($death_for =~ /\d+/)) {
                        $followup     = $followup_for;
                        $vital_status = $vital_status_for;
                        $death        = $death_for;
                        my @survivalArr = split(/\t/, $survivalTime);
                        if ($vital_status eq 'Alive') {
                            if ($followup > $survivalArr[0]) {
                                $survivalTime = "$followup\t0";
                            }
                        }
                        else {
                            if ($death > $survivalArr[0]) {
                                $survivalTime = "$death\t1";
                            }
                        }
                    }
                }
            }
            my @survivalArr = split(/\t/, $survivalTime);
            if ($vital_status eq 'Alive') {
                if ($followup > $survivalArr[0]) {
                    $survivalTime = "$followup\t0";
                }
            }
            else {
                if ($death > $survivalArr[0]) {
                    $survivalTime = "$death\t1";
                }
            }
        }

        print {$out_fh} "$patient_barcode\t$survivalTime\t$age\t$gender\t$grade\t$pathologic_stage\t$pathologic_T\t$pathologic_M\t$pathologic_N\n";
    }

    # Directory handles must be released with closedir, not close.
    closedir($sub_dh);
}
close($out_fh);