%%%%%%
2020-03-07 本文已影响0人
纵春水东流
一、结果
1、ruimtehol
TP TN FP FN Sn Sp Accuracy MCC AUC Fscore
AAAAAG 55.5 52.3 6.0 9.2 0.8611364 0.9007420 0.8764394 0.7573759 0.8809392 0.8794544
AAGAAA 52.4 49.2 10.1 13.3 0.7981628 0.8362825 0.8127305 0.6300846 0.8172227 0.8159164
AATAAA 202.5 199.5 57.0 60.0 0.7722173 0.7786982 0.7745630 0.5500423 0.7754578 0.7756715
AATACA 34.7 37.7 9.3 6.3 0.8464419 0.8045571 0.8227273 0.6482072 0.8254995 0.8154168
AATAGA 16.9 16.2 1.6 2.3 0.8891735 0.9153497 0.8947724 0.7965120 0.9022616 0.8975811
AATATA 15.7 18.3 4.8 2.2 0.8865425 0.8079497 0.8290767 0.6757853 0.8472461 0.8109587
ACTAAA 25.1 27.2 9.4 7.3 0.7785348 0.7476896 0.7578669 0.5210172 0.7631122 0.7485714
AGTAAA 23.0 27.2 10.5 6.3 0.7882839 0.7266734 0.7492424 0.5067008 0.7574787 0.7296239
ATTAAA 90.8 90.5 29.2 29.5 0.7652125 0.7583934 0.7554167 0.5171202 0.7618029 0.7557712
CATAAA 15.2 15.9 5.3 4.6 0.7685621 0.7680430 0.7591754 0.5275986 0.7683026 0.7469583
GATAAA 18.2 19.0 4.8 4.0 0.8224295 0.8046294 0.8086957 0.6221677 0.8135294 0.8034097
TATAAA 30.1 30.8 8.9 8.2 0.7905583 0.7828915 0.7807692 0.5674452 0.7867249 0.7769580
2、多层感知机二分类
损失,精确度
AATAAA:[0.8702749238281177, 0.6274131271829937]
3、cnn
AATAAA:79.77% (+/- 3.01%)
4、rnn
5、vgg
6、cnn-rnn
二、代码
数据
AAAAAG.txt AATAGA.txt ATTAAA.txt negAAGAAA.txt negAATATA.txt negCATAAA.txt
AAGAAA.txt AATATA.txt CATAAA.txt negAATAAA.txt negACTAAA.txt negGATAAA.txt
AATAAA.txt ACTAAA.txt GATAAA.txt negAATACA.txt negAGTAAA.txt negTATAAA.txt
AATACA.txt AGTAAA.txt negAAAAAG.txt negAATAGA.txt negATTAAA.txt TATAAA.txt
1、ruimtehol
#load library
library(ruimtehol)
library(itertools2)
require(magrittr)
require(caret)
######################
set.seed(177)
kmer <- function(x, k = 5) {
  # Convert each DNA sequence into a space-separated "sentence" of overlapping
  # k-mers, after removing the 6-nt signal hexamer at positions 101-106.
  #
  # x: character vector of equal-length sequences (206 nt in this pipeline
  #    -- assumed, judging by the hard-coded cut points; confirm upstream).
  # k: k-mer length (default 5).
  # Returns: character vector, one k-mer sentence per input sequence.
  b <- nchar(x[1])  # all sequences are assumed to share this length
  for (i in seq_along(x)) {
    # Concatenate the flanks around the hexamer; generalized 206 -> b so the
    # function also works for other (equal) sequence lengths.
    flank <- paste0(substr(x[i], 1, 100), substr(x[i], 107, b))
    # FIX: a (b - 6)-nt string holds (b - 6 - k + 1) k-mers; the original loop
    # ran only to b - 6 - k - 1 and silently dropped the last two k-mers.
    n <- b - 6 - k + 1
    kmers <- vapply(seq_len(n),
                    function(j) substr(flank, j, j + k - 1),
                    character(1))
    x[i] <- paste(kmers, collapse = " ")
  }
  x
}
# Evaluate a StarSpace (ruimtehol) classifier on each poly(A)-signal dataset
# with 10-fold cross-validation; collect per-fold confusion counts and
# metrics, then report the per-dataset mean of each column.
fileNames <- dir('.')[c(1:11, 24)]  # the 12 positive-set files
#fileNames='AATAAA.txt'
total_access <- NULL
for (filename in fileNames) {
  # Positive and negative sequences, converted to k-mer "sentences"
  pData <- read.csv(filename, header = FALSE,
                    stringsAsFactors = FALSE)[, 1] %>%
    as.vector() %>% kmer()
  negData <- read.csv(paste0('neg', filename), header = FALSE,
                      stringsAsFactors = FALSE)[, 1] %>%
    as.vector() %>% kmer()
  data <- data.frame(rbind(cbind(pData, label = 1),
                           cbind(negData, label = 0)),
                     stringsAsFactors = FALSE)
  # Split the data into 10 folds, stratified on the label
  folds <- createFolds(y = data$label, k = 10)
  access <- NULL
  for (i in seq_along(folds)) {
    train_x <- data[-folds[[i]], 1]
    train_y <- data[-folds[[i]], 2]
    test_x  <- data[folds[[i]], 1]
    test_y  <- data[folds[[i]], 2]
    # Fit the StarSpace tagspace model on the training fold
    model <- embed_tagspace(x = train_x, y = train_y,
                            dim = 30, epoch = 1, loss = "hinge",
                            adagrad = TRUE, similarity = "dot",
                            negSearchLimit = 10, ngrams = 10,
                            minCount = 5)
    # Confusion counts on the held-out fold.
    # FIX: the original swapped FP and FN (a positive predicted negative is a
    # false NEGATIVE; a negative predicted positive is a false POSITIVE).
    # That exchanged Sn with Sp and corrupted the (Sn+Sp)/2 AUC proxy;
    # Accuracy, MCC and F-score are symmetric under the swap and were
    # unaffected.
    result <- predict(model, test_x)
    TN <- TP <- FN <- FP <- 0
    for (j in seq_along(test_x)) {
      pred <- result[[j]]$prediction[1, 1]
      if (test_y[j] == 1 && pred == 1) TP <- TP + 1
      if (test_y[j] == 1 && pred == 0) FN <- FN + 1
      if (test_y[j] == 0 && pred == 0) TN <- TN + 1
      if (test_y[j] == 0 && pred == 1) FP <- FP + 1
    }
    Sn <- TP / (TP + FN)  # sensitivity / recall
    Sp <- TN / (TN + FP)  # specificity
    Accuracy <- (TP + TN) / (TN + FP + TP + FN)
    MCC <- (TP * TN - FP * FN) /
      sqrt((TP + FP) * (TN + FN) * (TN + FP) * (TP + FN))
    AUC <- (Sn + Sp) / 2  # balanced accuracy, used here as an AUC proxy
    Fscore <- (2 * TP) / (2 * TP + FP + FN)
    access <- rbind(access,
                    data.frame(TP, TN, FP, FN, Sn, Sp,
                               Accuracy, MCC, AUC, Fscore))
  }
  total_access <- rbind(total_access, apply(access, 2, mean))
}
# Row names = the 6-nt signal (first 6 characters of each file name)
rownames(total_access) <- sapply(fileNames,
                                 function(x) substr(x, 1, 6),
                                 USE.NAMES = FALSE)
total_access
2、多层感知机二分类
#加载包
import keras
#Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras import models
from keras import layers
from keras import losses
from keras import metrics
from keras import optimizers
## Plotly
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)
# Others
import nltk
import string
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.manifold import TSNE
import os
# Work in the data directory
import os
os.chdir('/home/uu/Desktop/polyA_predict/data/polyadata/')

# Load positive (label 1) and negative (label 0) sequences
df1 = pd.read_csv('AATAAA.txt', header=None, names=['sequance'])
df1.loc[:, 'labels'] = 1
df2 = pd.read_csv('negAATAAA.txt', header=None, names=['sequance'])
df2.loc[:, 'labels'] = 0
df = pd.concat([df1, df2])
df['sequance'] = df['sequance'].map(list)  # string -> list of single bases

# Integer-encode each base and pad/truncate to 200 positions.
# FIX: Keras Tokenizer word indices are 1-based and texts_to_sequences drops
# any word with index >= num_words, so num_words=4 silently deleted every
# occurrence of the 4th base.  Use vocabulary_size + 1 to keep all 4 bases.
vocabulary_size = 4
tokenizer = Tokenizer(num_words=vocabulary_size + 1)
tokenizer.fit_on_texts(df['sequance'])
sequences = tokenizer.texts_to_sequences(df['sequance'])
data = pad_sequences(sequences, maxlen=200)
labels = np.array(df['labels'])

# Positional train/test split.
# FIX: the original test set used range(1, 260), so row 0 belonged to
# neither split and one sample was silently dropped.
train = list(range(260, 4931))
test = list(range(0, 260)) + list(range(4931, 5190))
x_train = data[train]
y_train = labels[train]
x_test = data[test]
y_test = labels[test]

# Two-hidden-layer MLP with dropout; sigmoid output for binary classification
model = Sequential()
model.add(Dense(128, input_dim=200, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])
model.fit(x_train, y_train,
          epochs=20, batch_size=128)
score = model.evaluate(x_test, y_test, batch_size=128)
print(score)
3、CNN
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense, Dropout,Activation
from keras.layers import Embedding
from keras.layers import Conv2D, MaxPool2D,GlobalAveragePooling2D, MaxPooling2D,Flatten
from keras.optimizers import Adam
from sklearn.model_selection import StratifiedKFold
import numpy as np
import pandas as pd
# Work in the data directory
import os
os.chdir('/home/uu/Desktop/polyA_predict/data/polyadata/')
np.random.seed(777)

# FIX: np_utils is used below but was never imported (only a commented-out
# hint existed); without this the script dies with NameError at
# to_categorical.
from keras.utils import np_utils

# Load positive (label 1) and negative (label 0) sequences
df1 = pd.read_csv('AATAAA.txt', header=None, names=['sequance'])
df1.loc[:, 'labels'] = 1
df2 = pd.read_csv('negAATAAA.txt', header=None, names=['sequance'])
df2.loc[:, 'labels'] = 0
df = pd.concat([df1, df2])  # columns: sequance, labels
df['sequance'] = df['sequance'].map(list)  # string -> list of single bases

# Integer-encode the bases, then shift to a 0-based alphabet for one-hot
vocabulary_size = 200
tokenizer = Tokenizer(num_words=vocabulary_size)
tokenizer.fit_on_texts(df['sequance'])
sequences = tokenizer.texts_to_sequences(df['sequance'])
sequences = [list(np.asarray(s) - 1) for s in sequences]

# Remove the 6-nt signal hexamer in the middle (0-based positions 100-105),
# leaving 200 positions per sequence
for i in range(len(sequences)):
    del sequences[i][100:106]

# One-hot encode to shape (n_samples, 1, 200, 4).
# num_classes=4 guards against a sequence that happens to lack one base,
# which would otherwise yield a ragged width and break the reshape.
data = [np_utils.to_categorical(s, num_classes=4) for s in sequences]
data = np.concatenate(data, axis=0).reshape(-1, 1, 200, 4)
labels = np.asarray(df.iloc[:, 1])

# 10-fold stratified cross-validation
seed = 777
np.random.seed(seed)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
cvscores = []
X = data
Y = labels
for train, test in kfold.split(X, Y):
    # Build a small 2-conv CNN for each fold
    model = Sequential()
    model.add(Conv2D(filters=16, kernel_size=(3, 4), padding='same',
                     input_shape=(1, 200, 4)))
    model.add(Conv2D(filters=64, kernel_size=(6, 4), padding='same'))
    model.add(Activation('relu'))
    model.add(MaxPool2D(pool_size=(3, 4), padding='same'))
    model.add(Dropout(0.5))
    model.add(Flatten())
    model.add(Dense(64))
    # FIX: the output layer had no activation; with binary_crossentropy the
    # final unit must emit a sigmoid probability, otherwise the loss is
    # computed on unbounded linear outputs.
    model.add(Dense(1, activation='sigmoid'))
    # Compile with a small learning rate
    adam = Adam(lr=1e-4)
    model.compile(loss='binary_crossentropy',
                  optimizer=adam,
                  metrics=['accuracy'])
    # Fit on the training fold, evaluate on the held-out fold
    model.fit(X[train], Y[train], epochs=30, batch_size=64, verbose=0)
    scores = model.evaluate(X[test], Y[test], verbose=0)
    print("%s: %.2f%%" % (model.metrics_names[1], scores[1] * 100))
    cvscores.append(scores[1] * 100)
print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))