Perl学习笔记

clinical.json 数据中注释信息提取

2019-10-06  本文已影响0人  dming1024

在之前那篇《TCGA clinical_data.json 中临床信息的提取》的基础上,对之前的 Perl 脚本进行了改进。

#!/usr/bin/perl
use strict;
use warnings;

# Collect `key: value,` pairs from a pretty-printed clinical JSON file and
# write one row per key to the output file. Each row holds the key followed
# by every value seen for that key, tab-separated, in input order.
#
# Usage: perl <this script> <clinical.json> <fileout.txt>
#
# The input is scanned line by line with a regex, not a JSON parser, so only
# lines that look like `key: value,` (trailing comma included) are picked up.
# Keys and values are stored verbatim as captured, including any quotes.

unless (@ARGV == 2) {
    # No system call has failed here, so $! carries no useful information.
    die "usage: perl $0 <clinical.json> <fileout.txt>\n";
}

my ($file1, $file2) = @ARGV;

# Maps each key to the tab-joined string of all its values.
my %hash;

# Three-argument open with low-precedence `or`: with `||` the die would bind
# to the filename string (always true) and a failed open would go unnoticed.
open my $in,  '<', $file1 or die "cannot open file $file1: $!";
open my $out, '>', $file2 or die "cannot write file $file2: $!";

while (my $line = <$in>) {
    chomp $line;

    # Skip anything that is not a `key: value,` line.
    next unless $line =~ /(\S+)\:\s(.*)\,/;

    # Copy the captures right away; the next match would overwrite $1/$2.
    my ($field, $value) = ($1, $2);

    if ($value =~ /\"(\S+)(\_diagnosis)\"$/) {
        # A quoted value ending in "_diagnosis" is stored, with that suffix
        # and the quotes stripped, under the synthetic key "new_id".
        $hash{"new_id"} .= "\t$1";
    }
    else {
        $hash{$field} .= "\t$value";
    }
}

close $in;

# Emit rows in sorted key order; each stored value string already starts
# with a tab, so the row format is: key, " \t ", then "\tvalue\tvalue...".
foreach my $k (sort keys %hash) {
    print {$out} "$k \t $hash{$k}\n";
}

# Buffered write errors only surface at close, so check it.
close $out or die "cannot close file $file2: $!";

这个是改进之后的脚本,相比之前清爽了许多。下面是输出结果的前 20 行(带行号显示),每行以字段名开头,后面依次是各条记录对应的取值:

      1 "age_at_diagnosis"              28714   22792   25300   22883   27506   28037   28919   29107   29441   27594   30178   1831
      2 "age_at_index"          78      62      69      62      75      76      79      79      80      75      82      50      72
      3 "ajcc_pathologic_m"             "M0"    "M0"    "M0"    "M0"    "M0"    "M0"    "M0"    "MX"    "M0"    "M0"    "M0"    "M1"
      4 "ajcc_pathologic_n"             "N1"    "N0"    "N0"    "N3a"   "N2"    "N0"    "N0"    "N3"    "N1"    "N0"    "N0"    "N1"
      5 "ajcc_pathologic_stage"                 "Stage IIIA"    "Stage IB"      "Stage II"      "Stage IIIB"    "Stage IIIB"    "Sta
      6 "ajcc_staging_system_edition"           "6th"   "7th"   "7th"   "7th"   "7th"   "7th"   "7th"   "7th"   "6th"   "7th"   "7th
      7 "alcohol_history"               "Not Reported"  "Not Reported"  "Not Reported"  "Not Reported"  "Not Reported"  "Not Reporte
      8 "alcohol_intensity"             null    null    null    null    null    null    null    null    null    null    null    null
      9 "bmi"           null    null    null    null    null    null    null    null    null    null    null    null    null    null
     10 "case_id"               "f72a26e8-7f96-4d86-b37b-7dc35f681133"  "6e03b415-84a1-4b91-8717-1a41edd4a255"  "9ef7582b-d4c1-4036-
     11 "cigarettes_per_day"            null    null    null    null    null    null    null    null    null    null    null    null
     12 "classification_of_tumor"               "not reported"  "not reported"  "not reported"  "not reported"  "not reported"  "not
     13 "created_datetime"              null    "2019-04-28T15:49:21.905058-05:00"      null    null    null    null    null    "201
     14 "days_to_birth"                 -28714  -22792  -25300  -22883  -27506  -28037  -28919  -29107  -29441  -27594  -30178  -183
     15 "days_to_death"                 113     359     661     24      284     476     439     300     52      570     356     300
     16 "days_to_diagnosis"             0       0       0       0       0       0       0       0       0       0       0       0
     17 "days_to_last_follow_up"                null    356     1072    11      838     754     null    null    0       23      694
     18 "days_to_last_known_disease_status"             null    null    null    null    null    null    null    null    null    null
     19 "days_to_recurrence"            null    null    null    null    null    null    null    null    null    null    null    null
     20 "days_to_treatment_end"                 null    null    null    null    null    null    null    null    null    null    null
上一篇 下一篇

猜你喜欢

热点阅读