Perl学习笔记TCGA数据分析

TCGA clinical_data.json中临床信息的提取

2019-09-29  本文已影响0人  dming1024

上一次讲解了如何使用Perl脚本提取metadata中的分组信息;更详细的临床病例信息则需要从clinical_data.json中提取后再进行分析。当然,用R语言也可以,晚些我会把R的代码也加进去。

1.先来看下clinical_data.json数据结构
这是一个array与hash相互嵌套的数据结构,我们要提取的有:
year_of_diagnosis
classification_of_tumor
last_known_disease_status
primary_diagnosis
等所有的这些信息,

1 [{
      2   "diagnoses": [
      3     {
      4       "year_of_diagnosis": 2007,
      5       "classification_of_tumor": "not reported",
      6       "last_known_disease_status": "not reported",
      7       "updated_datetime": "2019-08-08T17:35:29.350497-05:00",
      8       "primary_diagnosis": "Carcinoma, diffuse type",
      9       "submitter_id": "TCGA-MX-A5UG_diagnosis",
     10       "tumor_stage": "stage iiia",
     11       "age_at_diagnosis": 28714,
     12       "morphology": "8145/3",
     13       "days_to_last_known_disease_status": null,
     14       "created_datetime": null,
     15       "prior_treatment": "No",
     16       "ajcc_pathologic_n": "N1",
     17       "ajcc_pathologic_m": "M0",
     18       "state": "released",
     19       "days_to_last_follow_up": null,
     20       "days_to_recurrence": null,
     21       "diagnosis_id": "b566c2ed-3445-57f1-a432-97b53cda1733",
     22       "tumor_grade": "not reported",
     23       "treatments": [
     24         {
     25           "days_to_treatment_start": null,
     26           "updated_datetime": "2019-08-01T00:08:31.012165-05:00",
     27           "treatment_effect": null,
     28           "initial_disease_status": null,
     29           "treatment_type": "Pharmaceutical Therapy, NOS",
     30           "submitter_id": "TCGA-MX-A5UG_treatment_1",
     31           "treatment_id": "545fce87-ad86-5437-9b12-507644c3e28d",
     32           "created_datetime": "2019-04-28T15:49:21.905058-05:00",
     33           "state": "released",
     34           "therapeutic_agents": null,
     35           "regimen_or_line_of_therapy": null,
     36           "treatment_intent_type": null,
     37           "treatment_anatomic_site": null,
     38           "treatment_outcome": null,
     39           "days_to_treatment_end": null,
     40           "treatment_or_therapy": "no"
     41         },
     42         {
     43           "updated_datetime": "2019-08-01T00:08:31.012165-05:00",
     44           "created_datetime": null,
     45           "treatment_type": "Radiation Therapy, NOS",
     46           "submitter_id": "TCGA-MX-A5UG_treatment",
     47           "treatment_id": "f179fb0a-397e-53a7-a6b5-884b177402db",
     48           "state": "released",
     49           "therapeutic_agents": null,
     50           "treatment_intent_type": null,

2. 利用Perl提取信息
Perl语言具有强大的文本处理功能,对于这个json文件,提取代码如下:

#!/usr/bin/perl
# Flatten a pretty-printed TCGA clinical JSON export into one line per field.
#
# Usage: perl annotation2.pl <clinical.json> <output.txt>
#
# The input is scanned line by line (it is NOT parsed as JSON), so it must be
# pretty-printed with one `"key": value` pair per line. Every value seen for a
# key, at any nesting depth, is collected in file order and written as
#     "key" value1,value2,...
# with keys sorted as strings. Keys and string values keep their JSON quotes.
# Lines that open a container (`"diagnoses": [`) are recorded like any other
# value. A value of the form "<id>_diagnosis" is stored under the extra key
# `new_id` (quotes and suffix removed) instead of under its own key.
use strict;
use warnings;

my ($file1, $file2) = @ARGV;
die "usage: $0 <clinical.json> <output.txt>\n"
    unless defined $file1 && defined $file2;

# Field name => array ref of every value seen for that field, in file order.
my %values_for;

# Low-precedence `or` so a failed open actually reaches die; with `||` the
# test applies to the filename string, which is always true.
open my $in, '<', $file1 or die "cannot open file $file1: $!";
while (my $line = <$in>) {
    chomp $line;

    # Capture the whole value up to the end of the line (minus the JSON
    # trailing comma), so multi-word strings such as "not reported" survive.
    next unless $line =~ /^\s*("[^"]+"):\s+(.+?),?\s*$/;
    my ($key, $value) = ($1, $2);

    if ($value =~ /^"(\S+)_diagnosis"$/) {
        # e.g. "TCGA-MX-A5UG_diagnosis" -> TCGA-MX-A5UG
        push @{ $values_for{new_id} }, $1;
    }
    else {
        push @{ $values_for{$key} }, $value;
    }
}
close $in;

# Open the output only after the input was read successfully, so a bad input
# path does not leave behind a truncated output file.
open my $out, '>', $file2 or die "cannot write file $file2: $!";
for my $key (sort keys %values_for) {
    # An explicit join gives every value a separator, including the last
    # field of an object, which has no trailing comma in the JSON source.
    print {$out} "$key ", join(',', @{ $values_for{$key} }), "\n";
}
# Buffered write errors only surface at close.
close $out or die "cannot close file $file2: $!";

3. 结果
获得所有的临床信息后,根据需要进一步筛选即可。

 perl annotation2.pl clinical.cart.2019-09-28.json x3.txt
 cat x3.txt | less -SN
      1 "age_at_diagnosis"       28714,22792,25300,22883,27506,28037,28919,29107,29441,27594,30178,18318,26474,27549,32024,25506,28791,21936,25913,26290,25637,20700,21753,21244,2849
      2 "age_at_index"   78,62,69,62,75,76,79,79,80,75,82,50,72,75,87,69,78,60,70,71,70,56,59,58,78,66,70,64,66,78,64,44,58,63,55,74,59,62,66,54,77,63,45,72,83,60,86,86,57,43,69,84,
      3 "ajcc_pathologic_m"      "M0","M0","M0","M0","M0","M0","M0","MX","M0","M0","M0","M1","M0","M0","M0","M0","M0","M0","M0","M0","M0","M0","M0","M0","M0","M0","M0","M0","M0","M0
      4 "ajcc_pathologic_n"      "N1","N0","N0","N3a","N2","N0","N0","N3","N1","N0","N0","N1","N1","N2","NX","N3a","N1","N1","N2","NX","N0","N1","N3","N1","N0","N2","N2","N1","N0","
      5 "ajcc_pathologic_stage"          "Stage"Stage"Stage"Stage"Stage"Stage"Stage"Stage"Stage"Stage"Stage"Stage"Stage"Stage"Stage"Stage"Stage"Stage"Stage"Stage"Stage"Stage"Stage"S
      6 "ajcc_pathologic_t"      "T3""T2""T3""T3""T4a""T2""T3""T4a""TX""T1b""T3""T4""T4""T4a""TX""T4b""T3""T3""T4b""T2""T3""T3""T4""T4""T3""T2""T4b""T2""T3""TX""T1b""T4""T3""T4""T3"
      7 "ajcc_staging_system_edition"    "6th","7th","7th","7th","7th","7th","7th","7th","6th","7th","7th","6th","7th","7th","6th","7th","6th","7th","7th","6th","7th","7th","6th","5
      8 "alcohol_history"        "Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not"Not
      9 "alcohol_intensity"      null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,nul
     10 "bmi"    null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null
     11 "case_id"        "f72a26e8-7f96-4d86-b37b-7dc35f681133","6e03b415-84a1-4b91-8717-1a41edd4a255","9ef7582b-d4c1-4036-a1ed-ef65aa46fc60","4020b1b1-576d-4869-9ff5-552e3afb3ab5",
     12 "cigarettes_per_day"     null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,nul
     13 "classification_of_tumor"        "not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not"not
     14 "created_datetime"       null,"2019-04-28T15:49:21.905058-05:00",null,null,null,null,null,"2019-04-28T15:42:39.023165-05:00",null,null,null,null,"2019-04-28T15:52:49.326357-

转自“医学统计园”微信公众号,使用代码或转载请注明出处。

上一篇 下一篇

猜你喜欢

热点阅读