【实验】中文分词word2vec实践
2018-08-31 本文已影响240人
账号已删除
从网上下载一份《倚天屠龙记》的小说txt
image.png下述代码分词后的txt文件
image.png代码解释
1、用gensim做word2vec,用jieba做中文分词
import gensim
import jieba
import warnings
# Silence every warning for the rest of the script.
# NOTE(review): this also hides library deprecation warnings - consider
# narrowing the filter to a specific category or module.
warnings.filterwarnings('ignore')
2、为jieba分词提供一些名词类的分词参考
# Proper nouns from the novel that jieba should keep as single tokens.
name = [
    '峨眉派', '倚天剑', '屠龙刀', '金毛狮王', '青翼蝠王',
    '谢逊', '杨逍', '冷谦', '赵敏', '乾坤大挪移', '明教',
    '郭襄', '昆仑派', '六大派', '少林派', '九阳真经', '紫衫龙王', '韦一笑',
]

# The second argument (tune=True) asks jieba to adjust the term's frequency
# so that the segmenter can cut it out as one word.
for term in name:
    jieba.suggest_freq(term, True)
3、删除特殊字符或自己想指定的一些字符
# Segment the novel line by line with jieba, strip punctuation, and write the
# space-separated tokens to filename_segment.

# Every character listed here is deleted from the segmented text.  The
# full-width CJK forms are spelled as escapes so they stay distinct from their
# ASCII look-alikes even if this file is passed through a tool that
# normalises punctuation.
_punctuation = (
    ',.?!()"\':;[]{}<>-_+'
    '。、“”‘’《》·…〗〖'
    '\uff0c\uff1f\uff01\uff08\uff09\uff1a\uff1b'  # full-width , ? ! ( ) : ;
)
# A single translate() pass replaces the long chain of str.replace() calls.
_strip_punctuation = str.maketrans('', '', _punctuation)

# NOTE(review): the input is still read with the platform default encoding;
# pass encoding=... to the first open() if the downloaded novel uses a
# different one (e.g. GBK).
# The output is written as UTF-8 because gensim's LineSentence decodes the
# corpus file as UTF-8 when training.
with open(filename_novel, 'r') as fin, \
        open(filename_segment, 'w', encoding='utf-8') as fou:
    # Iterate the file object directly instead of readlines() so the whole
    # novel is never held in memory at once.
    for line in fin:
        tokens = jieba.cut(line, cut_all=False)
        # jieba keeps the trailing '\n' as a token, so each input line stays
        # on its own output line.
        fou.write(' '.join(tokens).translate(_strip_punctuation))
4、训练
# Train word vectors on the segmented corpus and save the model to disk.

# The corpus file is expected to hold one sentence per line with
# whitespace-separated tokens.
sentences = gensim.models.word2vec.LineSentence(filename_segment)

# NOTE(review): `size` is the gensim 3.x keyword for the vector dimension;
# gensim 4.x renamed it to `vector_size` - confirm the installed version.
model = gensim.models.word2vec.Word2Vec(
    sentences,
    size=20,
    window=5,
    min_count=10,
    workers=4,
)

model.save(filename_model)
5、查看相似度
def getSimilarity(s):
    """Print the five words most similar to the word ``s``.

    Looks ``s`` up in the module-level ``model`` and prints a header line
    followed by one ``word score`` pair per line.

    Args:
        s: The query word, as a string.

    Raises:
        KeyError: Presumably raised by gensim when ``s`` is not in the
            model's vocabulary (e.g. dropped by ``min_count``) - confirm
            against the installed gensim version.
    """
    print('similarity by word of :'+s)
    # ``s`` is a word, not a vector, so query by word through the model's
    # KeyedVectors (``model.wv``) rather than ``similar_by_vector`` on the
    # model object, which only worked through a deprecated delegate.
    for word, score in model.wv.similar_by_word(s, topn=5):
        print(word, score)


# Demo: show the nearest neighbours of the novel's protagonist.
getSimilarity('张无忌')
image.png
image.png
image.png
image.png