nltk 获取 gutenberg 语料,gensim 生成词库
2017-08-02 本文已影响24人
磐创AI_聊天机器人
nltk 获取 gutenberg 语料
gensim 生成词库和 onehot 编码
正在尝试基于 Tensorflow LSTM 模型开发另外一个项目,需要自然语言处理的工具和语料。
import nltk
import numpy as np
from nltk.corpus import gutenberg
from gensim import corpora, models, similarities
class Book2Array(object):
sentences=None
token2id_dic=None
def __init__(self,sentences):
self.sentences=sentences
self.token2id_dic=self.get_token2id_dic()
def get_sentences(self):
#macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')
#print(macbeth_sentences)
#print(type(macbeth_sentences))
print(len(macbeth_sentences))
sentences_list=[sentence for sentence in self.sentences]
#print(type(macbeth_list))
return sentences_list
def get_token2id_dic(self):
# collect statistics about all tokens
dictionary = corpora.Dictionary(self.sentences)
# remove stop words and words that appear only once
dictionary.compactify() # remove gaps in id sequence after words that were removed
print(len(dictionary))
token2id_dic=dictionary.token2id
return token2id_dic
def word2onehot(self,word):
onehot_list=np.zeros(8192)
onehot_list[self.token2id_dic[word]]=1
return onehot_list
def sent2vec(self,sentence):
vec=[]
if(len(sentence)>20):
sentence=sentence[0:20]
for word in sentence:
onehot_list=self.word2onehot(word)
vec.append(onehot_list)
len_vec=len(vec)
for i in range(0,20-len_vec):
vec.append(np.zeros(8192))
#print(len(vec))
vec_np=np.asarray(vec)
return vec_np
def sentences2array(self):
array=[]
for sentence in self.sentences:
array.append(self.sent2vec(sentence))
return array
def gen_batch(self):
pass
if __name__ == '__main__':
macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')
book_array=Book2Array(macbeth_sentences)
book_array.get_sentences()
array=book_array.sentences2array()
np_array=np.array(array[0])
print(np_array.shape)