%%%%%%

2020-03-07  本文已影响0人  纵春水东流

一、结果

1、ruimtehol

          TP    TN   FP   FN        Sn        Sp  Accuracy       MCC       AUC    Fscore
AAAAAG  55.5  52.3  6.0  9.2 0.8611364 0.9007420 0.8764394 0.7573759 0.8809392 0.8794544
AAGAAA  52.4  49.2 10.1 13.3 0.7981628 0.8362825 0.8127305 0.6300846 0.8172227 0.8159164
AATAAA 202.5 199.5 57.0 60.0 0.7722173 0.7786982 0.7745630 0.5500423 0.7754578 0.7756715
AATACA  34.7  37.7  9.3  6.3 0.8464419 0.8045571 0.8227273 0.6482072 0.8254995 0.8154168
AATAGA  16.9  16.2  1.6  2.3 0.8891735 0.9153497 0.8947724 0.7965120 0.9022616 0.8975811
AATATA  15.7  18.3  4.8  2.2 0.8865425 0.8079497 0.8290767 0.6757853 0.8472461 0.8109587
ACTAAA  25.1  27.2  9.4  7.3 0.7785348 0.7476896 0.7578669 0.5210172 0.7631122 0.7485714
AGTAAA  23.0  27.2 10.5  6.3 0.7882839 0.7266734 0.7492424 0.5067008 0.7574787 0.7296239
ATTAAA  90.8  90.5 29.2 29.5 0.7652125 0.7583934 0.7554167 0.5171202 0.7618029 0.7557712
CATAAA  15.2  15.9  5.3  4.6 0.7685621 0.7680430 0.7591754 0.5275986 0.7683026 0.7469583
GATAAA  18.2  19.0  4.8  4.0 0.8224295 0.8046294 0.8086957 0.6221677 0.8135294 0.8034097
TATAAA  30.1  30.8  8.9  8.2 0.7905583 0.7828915 0.7807692 0.5674452 0.7867249 0.7769580

2、多层感知机二分类

损失,精确度
AATAAA:[0.8702749238281177, 0.6274131271829937]

3、cnn

AATAAA:79.77% (+/- 3.01%)

4、rnn

5、vgg

6、cnn-rnn

二、代码

数据

AAAAAG.txt  AATAGA.txt  ATTAAA.txt     negAAGAAA.txt  negAATATA.txt  negCATAAA.txt
AAGAAA.txt  AATATA.txt  CATAAA.txt     negAATAAA.txt  negACTAAA.txt  negGATAAA.txt
AATAAA.txt  ACTAAA.txt  GATAAA.txt     negAATACA.txt  negAGTAAA.txt  negTATAAA.txt
AATACA.txt  AGTAAA.txt  negAAAAAG.txt  negAATAGA.txt  negATTAAA.txt  TATAAA.txt

1、ruimtehol

# Load libraries: ruimtehol provides the StarSpace model (embed_tagspace),
# caret provides createFolds() for cross-validation; magrittr supplies %>%.
library(ruimtehol)
library(itertools2)
require(magrittr) 
require(caret)
######################
# Fix the RNG seed so fold assignment below is reproducible.
set.seed(177)

#' Convert fixed-length DNA sequences into space-separated k-mer "sentences".
#'
#' Each input string is expected to be 206 nt long: a 100-nt upstream flank,
#' the 6-nt poly(A) signal motif at positions 101-106, and a 100-nt
#' downstream flank.  The motif is removed and the remaining 200-nt sequence
#' is decomposed into all overlapping k-mers joined by single spaces (the
#' word format consumed by ruimtehol::embed_tagspace).
#'
#' @param x Character vector of equal-length sequences (206 nt each).
#' @param k K-mer size (default 5).
#' @return Character vector the same length as `x`.
kmer <- function(x, k = 5) {
  vapply(x, function(s) {
    # Drop the central 6-nt motif (positions 101-106); use nchar(s) rather
    # than a hard-coded 206 so other sequence lengths also work.
    flanks <- paste0(substr(s, 1, 100), substr(s, 107, nchar(s)))
    # All overlapping k-mer start positions.  The original loop ran
    # 1:(b - 6 - k - 1) and silently dropped the last two k-mers; the full
    # window has nchar(flanks) - k + 1 of them.
    starts <- seq_len(nchar(flanks) - k + 1)
    paste(vapply(starts, function(j) substr(flanks, j, j + k - 1),
                 character(1)),
          collapse = " ")
  }, character(1), USE.NAMES = FALSE)
}

# Evaluate a StarSpace (tagspace) classifier on each poly(A) signal variant
# with 10-fold cross-validation.  Every positive file (e.g. AATAAA.txt) has a
# matching negative file prefixed "neg"; column 1 of each holds sequences.
fileNames <- dir('.')[c(1:11, 24)]  # the 12 positive-set files
total_access <- NULL
for (filename in fileNames) {
  # Positive and negative sequences, converted to k-mer "sentences".
  pData <- read.csv(filename, header = FALSE, stringsAsFactors = FALSE)[, 1] %>%
    as.vector() %>% kmer()
  negData <- read.csv(paste0('neg', filename), header = FALSE,
                      stringsAsFactors = FALSE)[, 1] %>%
    as.vector() %>% kmer()
  data <- data.frame(rbind(cbind(pData, label = 1),
                           cbind(negData, label = 0)),
                     stringsAsFactors = FALSE)

  # Stratified split of the pooled data into 10 folds on the class label.
  folds <- createFolds(y = data$label, k = 10)

  access <- NULL
  for (i in seq_along(folds)) {
    train_x <- data[-folds[[i]], 1]
    train_y <- data[-folds[[i]], 2]
    test_x <- data[folds[[i]], 1]
    test_y <- data[folds[[i]], 2]

    # Fit the StarSpace model on the training fold.
    model <- embed_tagspace(x = train_x, y = train_y,
                            dim = 30, epoch = 1, loss = "hinge",
                            adagrad = TRUE, similarity = "dot",
                            negSearchLimit = 10, ngrams = 10,
                            minCount = 5)

    # Confusion counts on the held-out fold.
    # NOTE(review): the original swapped FP and FN — a positive predicted
    # negative is a false NEGATIVE, and a negative predicted positive is a
    # false POSITIVE.  Fixed below; this also corrects Sn, Sp, MCC, AUC
    # and the F-score, which all depend on the counts.
    result <- predict(model, test_x)
    TN <- TP <- FN <- FP <- 0
    for (j in seq_along(test_x)) {
      pred <- result[[j]]$prediction[1, 1]
      if (test_y[j] == 1 && pred == 1) TP <- TP + 1
      if (test_y[j] == 1 && pred == 0) FN <- FN + 1
      if (test_y[j] == 0 && pred == 0) TN <- TN + 1
      if (test_y[j] == 0 && pred == 1) FP <- FP + 1
    }
    Sn <- TP / (TP + FN)                  # sensitivity (recall)
    Sp <- TN / (TN + FP)                  # specificity
    Accuracy <- (TP + TN) / (TN + FP + TP + FN)
    MCC <- (TP * TN - FP * FN) /
      sqrt((TP + FP) * (TN + FN) * (TN + FP) * (TP + FN))
    AUC <- (Sn + Sp) / 2                  # balanced accuracy, not a true AUC
    Fscore <- (2 * TP) / (2 * TP + FP + FN)
    access <- rbind(access,
                    data.frame(TP, TN, FP, FN, Sn, Sp, Accuracy,
                               MCC, AUC, Fscore))
  }
  # Average the per-fold metrics for this motif.
  total_access <- rbind(total_access, apply(access, 2, mean))
}
# Row names are the 6-nt motif taken from each file name.
rownames(total_access) <- sapply(fileNames, function(x) substr(x, 1, 6),
                                 USE.NAMES = FALSE)
total_access



2、多层感知机二分类

# MLP binary classifier for the AATAAA poly(A) signal.
#
# Reads positive (AATAAA.txt) and negative (negAATAAA.txt) sequence files,
# tokenizes each sequence character by character, pads/truncates to length
# 200, and trains a small fully connected network directly on the integer
# token ids.  Prints [loss, accuracy] on the held-out rows.
import os

import numpy as np
import pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Dropout

# Work inside the data directory (one sequence per line in each file).
os.chdir('/home/uu/Desktop/polyA_predict/data/polyadata/')

# Load the data; label 1 = true poly(A) site, 0 = negative example.
df1 = pd.read_csv('AATAAA.txt', header=None, names=['sequance'])
df1.loc[:, 'labels'] = 1
df2 = pd.read_csv('negAATAAA.txt', header=None, names=['sequance'])
df2.loc[:, 'labels'] = 0
df = pd.concat([df1, df2])
df['sequance'] = df['sequance'].map(lambda x: list(x))

# Vectorize: map each nucleotide to a small integer id.
# NOTE(review): Tokenizer keeps only word indices < num_words and index 0 is
# reserved for padding, so the original num_words=4 silently discarded one of
# the four bases from every sequence.  Use 5 so A/C/G/T all survive.
vocabulary_size = 5
tokenizer = Tokenizer(num_words=vocabulary_size)
tokenizer.fit_on_texts(df['sequance'])

sequences = tokenizer.texts_to_sequences(df['sequance'])
data = pad_sequences(sequences, maxlen=200)  # keeps the last 200 positions
labels = np.array(df['labels'])

# Fixed-index train/test split: middle rows train, first/last rows test.
# NOTE(review): the original test set used range(1, 260), so row 0 was
# never used anywhere; start at 0 instead.
train = list(range(260, 4931))
test = list(range(0, 260)) + list(range(4931, 5190))
x_train = data[train]
y_train = labels[train]
x_test = data[test]
y_test = labels[test]

# Build and compile the network: two hidden ReLU layers with dropout,
# sigmoid output for binary classification.
model = Sequential()
model.add(Dense(128, input_dim=200, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

model.fit(x_train, y_train,
          epochs=20, batch_size=128)
score = model.evaluate(x_test, y_test, batch_size=128)

print(score)  # [loss, accuracy]

3、CNN

# CNN binary classifier for the AATAAA poly(A) signal, evaluated with
# stratified 10-fold cross-validation.  Sequences are one-hot encoded to
# (n, 1, 200, 4) after removing the 6-nt signal motif from the middle.
import os

import numpy as np
import pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Conv2D, MaxPool2D, Flatten
from keras.optimizers import Adam
from keras.utils import to_categorical
from sklearn.model_selection import StratifiedKFold

# Work inside the data directory; fix the seed for reproducible folds.
os.chdir('/home/uu/Desktop/polyA_predict/data/polyadata/')
np.random.seed(777)

# Load the data; label 1 = true poly(A) site, 0 = negative example.
df1 = pd.read_csv('AATAAA.txt', header=None, names=['sequance'])
df1.loc[:, 'labels'] = 1
df2 = pd.read_csv('negAATAAA.txt', header=None, names=['sequance'])
df2.loc[:, 'labels'] = 0
df = pd.concat([df1, df2])  # sequences, labels
df['sequance'] = df['sequance'].map(lambda x: list(x))

# Tokenize per character, then shift ids to start at 0
# (Tokenizer indices start at 1).
vocabulary_size = 200
tokenizer = Tokenizer(num_words=vocabulary_size)
tokenizer.fit_on_texts(df['sequance'])
sequences = tokenizer.texts_to_sequences(df['sequance'])
sequences = [list(np.asarray(x) - 1) for x in sequences]

# Remove the 6-nt signal motif (list positions 100-105), leaving 200 bases.
for i in range(len(sequences)):
    del sequences[i][100:106]

# One-hot encode to shape (n, 1, 200, 4).
# NOTE(review): the original called the undefined name `np_utils`
# (NameError) and let to_categorical infer num_classes per sequence; pin
# num_classes=4 so every sequence encodes to the same width even if one
# base happens to be absent from it.
data = [to_categorical(x, num_classes=4) for x in sequences]
data = np.concatenate(data, axis=0).reshape(-1, 1, 200, 4)
labels = np.asarray(df.iloc[:, 1])

# Stratified 10-fold cross-validation; average accuracy across folds.
seed = 777
np.random.seed(seed)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
cvscores = []
X = data
Y = labels
for train, test in kfold.split(X, Y):
    # Build a fresh model for every fold.
    model = Sequential()
    model.add(Conv2D(filters=16, kernel_size=(3, 4), padding='same',
                     input_shape=(1, 200, 4)))
    model.add(Conv2D(filters=64, kernel_size=(6, 4), padding='same'))
    model.add(Activation('relu'))
    model.add(MaxPool2D(pool_size=(3, 4), padding='same'))
    model.add(Dropout(0.5))
    model.add(Flatten())
    model.add(Dense(64))
    # NOTE(review): binary_crossentropy expects probabilities, but the
    # original final Dense(1) had no activation; add a sigmoid.
    model.add(Dense(1, activation='sigmoid'))

    adam = Adam(lr=1e-4)
    model.compile(loss='binary_crossentropy',
                  optimizer=adam,
                  metrics=['accuracy'])

    # Fit on the training fold, evaluate on the held-out fold.
    model.fit(X[train], Y[train], epochs=30, batch_size=64, verbose=0)
    scores = model.evaluate(X[test], Y[test], verbose=0)
    print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
    cvscores.append(scores[1] * 100)
print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))
上一篇 下一篇

猜你喜欢

热点阅读