# TCGA clinical information tidying example (TCGA 临床信息整理例子)
# 2019-09-29 (page view counter at capture: 0)
# Author: 陈宇乔
# Load the prepared objects and restrict both tables to tumor samples ----
# NOTE(review): rm(list = ls()) wipes every object in the calling workspace.
# It is kept for backward compatibility; prefer running in a fresh R session.
rm(list = ls())
# The code below relies on `new_exprSet` and `pheno` existing after these
# loads -- TODO confirm which .Rdata file supplies which object.
load(file = "./Rdata/step0_original_data.Rdata")
load(file = "./Rdata/step00_idtransed.Rdata")
expr_TCGA <- new_exprSet

# Characters 14-15 of each sample ID are read as a numeric code: codes below
# 10 are labelled "tumor", everything else "normal".
group_list <- ifelse(
  as.numeric(substr(colnames(expr_TCGA), 14, 15)) < 10,
  "tumor",
  "normal"
)
# na.omit() drops rows only, so `group_list` stays aligned with the columns.
expr_TCGA <- na.omit(expr_TCGA)
# %in% treats an unparseable code (NA) as "not tumor" instead of producing an
# NA index; drop = FALSE keeps the result two-dimensional even when a single
# tumor sample remains, so colnames() below still works.
expr_TCGA <- expr_TCGA[, group_list %in% "tumor", drop = FALSE]

# Filter the clinical data the same way ----
group_list <- ifelse(
  as.numeric(substr(pheno$submitter_id.samples, 14, 15)) < 10,
  "tumor",
  "normal"
)
pheno <- pheno[group_list %in% "tumor", ]
# Rewrite "-" as "." so the sample IDs can be compared with the expression
# column names; fixed = TRUE matches the dash literally rather than as a regex.
pheno$submitter_id.samples <- gsub(
  "-", ".", pheno$submitter_id.samples,
  fixed = TRUE
)
# Keep only the samples that are also present in the expression data.
pheno <- pheno[pheno$submitter_id.samples %in% colnames(expr_TCGA), ]
# Step 2: tidy the clinical information ----
# Interactive inspection of the available columns and the key clinical fields.
colnames(pheno)
pheno$vital_status.diagnoses
pheno$pathologic_T
pheno$pathologic_N
pheno$pathologic_M

# Clinical columns retained for the rest of the script.
clinical_cols <- c(
  "submitter_id.samples", "gender.demographic",
  "therapy_type", "drug_name",
  "country_of_procurement",
  "tumor_stage.diagnoses", "tumor_grade.diagnoses",
  "neoplasm_histologic_grade", "planned_surgery_status",
  "vital_status.diagnoses", "days_to_death.diagnoses",
  "days_to_last_follow_up.diagnoses", "pathologic_T",
  "pathologic_N", "pathologic_M"
)

# Fail with a message that names the absent columns rather than the generic
# "undefined columns selected" error.
missing_cols <- setdiff(clinical_cols, colnames(pheno))
if (length(missing_cols) > 0) {
  stop(
    "pheno is missing expected column(s): ",
    paste(missing_cols, collapse = ", "),
    call. = FALSE
  )
}

# The original call had a stray trailing comma (`pheno[, cols, ]`); the
# selection itself is unchanged.
pheno <- pheno[, clinical_cols, drop = FALSE]
# Survival endpoint ----
# Coerce through character first so that factor or character columns yield the
# recorded day counts rather than factor level codes. The source columns
# themselves are left untouched.
days_to_death <- as.numeric(as.character(pheno$days_to_death.diagnoses))
days_to_follow_up <- as.numeric(
  as.character(pheno$days_to_last_follow_up.diagnoses)
)
# Use days to death when it is recorded, otherwise days to last follow-up.
pheno$time <- ifelse(!is.na(days_to_death), days_to_death, days_to_follow_up)

# Event indicator: 0 = alive, 1 = dead, stored as numeric.
# The comparison is case-insensitive so "Alive"/"Dead" are recognised too; any
# other value (blank, not reported, NA) becomes NA instead of being counted as
# an event.
vital_status <- tolower(trimws(as.character(pheno$vital_status.diagnoses)))
pheno$event <- NA_real_
pheno$event[vital_status %in% "alive"] <- 0
pheno$event[vital_status %in% "dead"] <- 1
# Pathologic T/N/M recoding ----
# Each column is recoded on a character copy and converted to a factor last.
# This keeps the recoding valid when the column arrives as a factor and leaves
# no unused levels behind.

# T: fold the T4a sub-stage into T4; all other values are kept as they are.
# (Coarser groupings such as T1 vs T2-4 or T1-2 vs T3-4 can be derived from
# this factor if needed.)
t_stage <- as.character(pheno$pathologic_T)
t_stage[grepl("T4a", t_stage, fixed = TRUE)] <- "T4"
pheno$pathologic_T <- factor(t_stage)

# N: blank and not-assessable entries become NA. "NX" is the not-assessable
# code for N; "TX" is still accepted because the original script tested for it.
# (An N0 vs N+ grouping can be derived from this factor if needed.)
n_stage <- as.character(pheno$pathologic_N)
n_stage[n_stage %in% c("", "NX", "TX")] <- NA
pheno$pathologic_N <- factor(n_stage)

# M: M0 vs any other recorded value ("M+"). Blank, "MX" and NA entries carry
# no information about metastasis, so they become NA rather than "M+".
m_stage <- as.character(pheno$pathologic_M)
m_known <- !is.na(m_stage) & !(m_stage %in% c("", "MX"))
pheno$pathologic_M <- factor(
  ifelse(m_known, ifelse(m_stage == "M0", "M0", "M+"), NA),
  levels = c("M0", "M+")
)
# Tumor stage: collapse sub-stages (e.g. "stage iiia") to the main stage ----
# Work on a character copy so that assigning a new label never produces an
# invalid factor level.
stage_raw <- as.character(pheno$tumor_stage.diagnoses)
stage <- stage_raw

# "stage i" is a substring of "stage ii", "stage iii" and "stage iv", so the
# labels are applied from the shortest numeral to the longest and the most
# specific match is written last. Values matching none of them are kept as is.
for (stage_label in c("stage i", "stage ii", "stage iii", "stage iv")) {
  stage[grepl(stage_label, stage_raw, fixed = TRUE)] <- stage_label
}

# Unreported stages are treated as missing.
stage[stage_raw %in% "not reported"] <- NA
pheno$tumor_stage <- factor(stage)