test10 针对蛋白质鉴定列表 进行PCA分析
2020-07-21 本文已影响0人
夕颜00
一、准备文件:
1、下机文件
蛋白质鉴定列表.xlsx
三种格式:DIA、labelFree、TMT
- TMT 文件格式
- labelFree 文件格式
image.png
- DIA 文件格式
image.png
2、sample.txt
sample label sib group
CHOL_1 CHOL_1 1 CHOL
CHOL_2 CHOL_2 2 CHOL
CHOL_3 CHOL_3 3 CHOL
Ctrl_1 Ctrl_1 1 Ctrl
Ctrl_2 Ctrl_2 2 Ctrl
Ctrl_3 Ctrl_3 3 Ctrl
二、脚本
#! /usr/bin/env python
# _*_ coding: utf-8 _*_
#script for DIA/labelFree/TMT PCA
# eg: python3 pca_main.py 蛋白质鉴定列表.xlsx sample.txt
import pandas as pd
import re
import sys
import os
# Substrings that identify quantification columns for each supported format.
_DIA_MARKER = "raw.PG.Quantity"
_LFQ_MARKER = "LFQ intensity"
_TMT_MARKER = "Abundances"

# Matches the "[<digits>]" tag that is removed from DIA column names.
_RUN_TAG_RE = re.compile(r"\[\d+\]")

_PCA_TABLE = "pca.txt"
_PCA_SCRIPT = "/xtt/PCA/bin/PCA.R"
_PLOT_OUTPUTS = ("pna.png", "Principal_Component_Analysis.pdf")


def build_pca_table(df):
    """Return the accession-by-sample table written out for the PCA step.

    Columns whose label contains ``raw.PG.Quantity`` (DIA), ``LFQ intensity``
    (labelFree) or ``Abundances`` (TMT) are selected and renamed by removing
    the format-specific decoration.  The accession comes from
    ``PG.ProteinAccessions`` (first ``;``-separated entry) for DIA, from
    ``Protein IDs`` for labelFree, and from an existing ``Accession`` column
    for TMT.  Rows without any value are dropped and the remaining missing
    values are replaced by the smallest value present in the table.

    Args:
        df: DataFrame holding one sheet of the protein identification list.
            It is not modified.

    Returns:
        A new DataFrame with ``Accession`` as first column followed by one
        column per selected quantification column.

    Raises:
        ValueError: if no quantification column is found, or if a TMT table
            has no ``Accession`` column.
        KeyError: if the accession source column of a DIA/labelFree table is
            missing.
    """
    quant_columns = []
    sample_names = []
    accession_source = "Accession"
    for column in df.columns:
        # Column labels are not guaranteed to be strings (e.g. numeric headers).
        label = str(column)
        if _DIA_MARKER in label:
            accession_source = "PG.ProteinAccessions"
            # Literal removal; "." must not act as a regex wildcard here.
            label = label.replace(".raw.PG.Quantity", "")
            label = _RUN_TAG_RE.sub("", label).strip()
        elif _LFQ_MARKER in label:
            accession_source = "Protein IDs"
            label = label.replace("LFQ intensity ", "")
        elif _TMT_MARKER in label:
            label = label.replace("Abundances ", "")
        else:
            continue
        quant_columns.append(column)
        sample_names.append(label)

    if not quant_columns:
        raise ValueError(
            "no quantification columns found (expected a column containing "
            "'raw.PG.Quantity', 'LFQ intensity' or 'Abundances')"
        )

    if accession_source == "PG.ProteinAccessions":
        # Protein groups list several accessions; keep only the first one.
        accession = [str(value).split(";")[0] for value in df[accession_source]]
    elif accession_source == "Accession" and "Accession" not in df.columns:
        raise ValueError("the table has no 'Accession' column")
    else:
        accession = df[accession_source]

    # Select once, after the scan, instead of re-slicing for every column.
    table = df.loc[:, quant_columns].copy()
    table.columns = sample_names
    table.insert(0, "Accession", accession)

    table = table.set_index("Accession").dropna(how="all")
    smallest = table.min().min()  # NaN is skipped by both reductions
    table = table.fillna(value=float(smallest))
    return table.reset_index(level=0)


def main(argv):
    """Write ``pca.txt`` and run the PCA R script for the PNG and PDF plots.

    Args:
        argv: full argument vector: program name, protein list workbook,
            sample description file.

    Returns:
        Process exit status: 0 on success, 1 on a usage error, an unusable
        input table, or a failed Rscript invocation.
    """
    import subprocess  # local import: only needed when run as a script

    if len(argv) != 3:
        print("usage: python pca_main.py 蛋白质鉴定列表.xlsx sample.txt ", file=sys.stderr)
        return 1
    workbook, sample_file = argv[1], argv[2]

    try:
        table = build_pca_table(pd.read_excel(workbook, sheet_name=0))
    except ValueError as err:
        print("error: {0}".format(err), file=sys.stderr)
        return 1
    table.to_csv(_PCA_TABLE, sep="\t", index=False)

    status = 0
    for plot in _PLOT_OUTPUTS:
        # Argument list instead of a shell string: paths containing spaces or
        # shell metacharacters are passed through unchanged.
        command = ["Rscript", _PCA_SCRIPT, "-m", sample_file,
                   "-i", _PCA_TABLE, "-o", plot]
        try:
            returncode = subprocess.run(command, check=False).returncode
        except OSError as err:
            print("error: cannot run Rscript: {0}".format(err), file=sys.stderr)
            status = 1
            continue
        if returncode != 0:
            print("error: Rscript exited with status {0} for {1}".format(returncode, plot),
                  file=sys.stderr)
            status = 1
    return status


if __name__ == "__main__":
    sys.exit(main(sys.argv))
三、结果文件
├── pca.txt
├── pna.png
├── Principal_Component_Analysis.pdf