Generating Sentence Vectors with Word2vec (Part 2)
In the previous post, Generating Sentence Vectors with Word2vec (Part 1), I introduced two ways to build sentence vectors. This post covers a method billed as "simple but tough to beat": SIF-weighted averaging.
The paper is A Simple but Tough-to-Beat Baseline for Sentence Embeddings.
As before, I will not spend much time interpreting the paper or its theory; the focus is on modifying the source code so that it actually runs end to end and anyone can pick it up and use it.
Like the TF-IDF weighting mentioned earlier, SIF assigns each word vector a weight, and that weight is also based on word frequency; the paper calls the scheme smooth inverse frequency.
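Concretely, each word w receives the weight a / (a + p(w)), where p(w) is the word's estimated frequency in the corpus and a is a small smoothing constant (the code below uses 1e-3), so frequent words are damped while rare words keep weights close to 1. A minimal sketch with made-up numbers, just to show the shape of the weighting:
a = 1e-3                                    # smoothing constant, same value as weightpara below
count, total_tokens = 50_000, 10_000_000    # hypothetical corpus statistics for one word
p_w = count / total_tokens                  # estimated word frequency p(w)
weight = a / (a + p_w)                      # smooth inverse frequency weight
print(weight)                               # ~0.17 for this fairly frequent word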
The authors published the official SIF source code, but it only handles English and only runs under Python 2, not Python 3. Below I adapt it for Chinese and Python 3 and briefly walk through the changes.
First, clone the SIF source code:
git clone https://github.com/PrincetonML/SIF.git
Note that the SIF code base has quite a few bugs of its own: in my copy, the fields of the GloVe word-vector file were a mess, with pairs of vector values run together so they could not be parsed at all. If you want to run the original SIF as-is, expect a fair amount of debugging.
Main entry point: sif_embedding.py
Changes:
- Replace the GloVe word vectors in the original code with my own Word2vec vectors, loaded via Word2Vec.load('word vector file').
- The original code depends on a file called enwiki_vocab_min200.txt. The repository never explains where it comes from, but inspection shows it simply stores each word's frequency. A Word2vec model trained with gensim already carries equivalent statistics, so no extra file is needed: just pass model_100.wv.vocab in and adjust how getWordWeight accesses it (a quick check of those statistics follows right after this list).
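To see that a gensim model really does carry the frequency statistics that enwiki_vocab_min200.txt used to provide, here is a quick check. It assumes gensim 3.x (in gensim 4 wv.vocab is gone and you would use wv.key_to_index plus wv.get_vecattr instead) and the model file from my own setup, with the path shortened:
from gensim.models import Word2Vec

model_100 = Word2Vec.load('ngram_100_5_90w.bin')   # same model file as in the script below
for word in list(model_100.wv.vocab)[:5]:
    entry = model_100.wv.vocab[word]
    # each vocab entry stores the raw corpus count and the downsampling value sample_int
    print(word, entry.count, entry.sample_int)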
from src import data_io, params, SIF_embedding
import os
from gensim.models import Word2Vec

# load a 100-dimensional gensim Word2vec model trained on Chinese Wikipedia
model_100 = Word2Vec.load(os.path.join('/media/brx/TOSHIBA EXT/wiki_zh_word2vec/', 'ngram_100_5_90w.bin'))

# build the word -> index mapping and the word-vector matrix from the model
words = {}
for index, word in enumerate(model_100.wv.index2entity):
    words[word] = index
We = model_100.wv.vectors

# input
weightpara = 1e-3  # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
rmpc = 1  # number of principal components to remove in SIF weighting scheme
sentences = ['这是一个测试句子', '这是另一个测试句子']

# load word vectors
# (words, We) = data_io.getWordmap(wordfile)
# load word weights
word2weight = data_io.getWordWeight(model_100.wv.vocab, weightpara)  # word2weight['str'] is the weight for the word 'str'
weight4ind = data_io.getWeight(words, word2weight)  # weight4ind[i] is the weight for the i-th word
# load sentences
x, m = data_io.sentences2idx(sentences, words)  # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
w = data_io.seq2weight(x, m, weight4ind)  # get word weights
# set parameters
params = params.params()
params.rmpc = rmpc
# get SIF embedding
embedding = SIF_embedding.SIF_embedding(We, x, w, params)  # embedding[i,:] is the embedding for sentence i
print(embedding)
Data processing: data_io.py
Changes:
- In getSeq, replace the original split() with jieba word segmentation (see the quick check right after this list).
- In sentences2idx, define seq1 = [] (a bug in the original code).
- getWordWeight implements the smooth inverse frequency weighting; instead of reading each word's count and the total count from a file, it now reads the statistics from the Word2vec model (a count-based variant is sketched right after the listing below).
- iteritems -> items, xrange -> range (Python 2 -> Python 3).
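As a quick sanity check of the jieba change, this is what segmentation looks like on the first test sentence (the exact tokens may vary with your jieba version and dictionary):
import jieba

tokens = list(jieba.cut('这是一个测试句子'))
print(tokens)   # with the default dictionary, roughly: ['这是', '一个', '测试', '句子']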
from __future__ import print_function
import numpy as np
import pickle
from src.tree import tree
import jieba
#from theano import config

def getWordmap(textfile):
    words = {}
    We = []
    f = open(textfile, 'r', errors='ignore')
    lines = f.readlines()
    for (n, i) in enumerate(lines):
        i = i.split()
        j = 1
        v = []
        while j < len(i):
            if i[j] == '.':
                v.append(0)
            else:
                v.append(float(i[j]))
            j += 1
        words[i[0]] = n
        We.append(v)
    return (words, np.array(We))
def prepare_data(list_of_seqs):
    lengths = [len(s) for s in list_of_seqs]
    n_samples = len(list_of_seqs)
    maxlen = np.max(lengths)
    x = np.zeros((n_samples, maxlen)).astype('int32')
    x_mask = np.zeros((n_samples, maxlen)).astype('float32')
    for idx, s in enumerate(list_of_seqs):
        x[idx, :lengths[idx]] = s
        x_mask[idx, :lengths[idx]] = 1.
    x_mask = np.asarray(x_mask, dtype='float32')
    return x, x_mask
def lookupIDX(words, w):
    w = w.lower()
    if len(w) > 1 and w[0] == '#':
        w = w.replace("#", "")
    if w in words:
        return words[w]
    elif 'UUUNKKK' in words:
        return words['UUUNKKK']
    else:
        return len(words) - 1

def getSeq(p1, words):
    # segment the Chinese sentence with jieba instead of splitting on whitespace
    p1 = jieba.cut(p1)
    X1 = []
    for i in p1:
        X1.append(lookupIDX(words, i))
    return X1

def getSeqs(p1, p2, words):
    p1 = p1.split()
    p2 = p2.split()
    X1 = []
    X2 = []
    for i in p1:
        X1.append(lookupIDX(words, i))
    for i in p2:
        X2.append(lookupIDX(words, i))
    return X1, X2
def get_minibatches_idx(n, minibatch_size, shuffle=False):
    idx_list = np.arange(n, dtype="int32")
    if shuffle:
        np.random.shuffle(idx_list)
    minibatches = []
    minibatch_start = 0
    for i in range(n // minibatch_size):
        minibatches.append(idx_list[minibatch_start:
                                    minibatch_start + minibatch_size])
        minibatch_start += minibatch_size
    if (minibatch_start != n):
        minibatches.append(idx_list[minibatch_start:])
    return zip(range(len(minibatches)), minibatches)
def getSimEntDataset(f, words, task):
    data = open(f, 'r')
    lines = data.readlines()
    examples = []
    for i in lines:
        i = i.strip()
        if (len(i) > 0):
            i = i.split('\t')
            if len(i) == 3:
                if task == "sim":
                    e = (tree(i[0], words), tree(i[1], words), float(i[2]))
                    examples.append(e)
                elif task == "ent":
                    e = (tree(i[0], words), tree(i[1], words), i[2])
                    examples.append(e)
                else:
                    raise ValueError('Params.traintype not set correctly.')
            else:
                print(i)
    return examples

def getSentimentDataset(f, words):
    data = open(f, 'r')
    lines = data.readlines()
    examples = []
    for i in lines:
        i = i.strip()
        if (len(i) > 0):
            i = i.split('\t')
            if len(i) == 2:
                e = (tree(i[0], words), i[1])
                examples.append(e)
            else:
                print(i)
    return examples
def getDataSim(batch, nout):
    g1 = []
    g2 = []
    for i in batch:
        g1.append(i[0].embeddings)
        g2.append(i[1].embeddings)
    g1x, g1mask = prepare_data(g1)
    g2x, g2mask = prepare_data(g2)
    scores = []
    if nout <= 0:
        return (scores, g1x, g1mask, g2x, g2mask)
    for i in batch:
        temp = np.zeros(nout)
        score = float(i[2])
        ceil, fl = int(np.ceil(score)), int(np.floor(score))
        if ceil == fl:
            temp[fl - 1] = 1
        else:
            temp[fl - 1] = ceil - score
            temp[ceil - 1] = score - fl
        scores.append(temp)
    scores = np.matrix(scores) + 0.000001
    scores = np.asarray(scores, dtype='float32')
    return (scores, g1x, g1mask, g2x, g2mask)

def getDataEntailment(batch):
    g1 = []; g2 = []
    for i in batch:
        g1.append(i[0].embeddings)
        g2.append(i[1].embeddings)
    g1x, g1mask = prepare_data(g1)
    g2x, g2mask = prepare_data(g2)
    scores = []
    for i in batch:
        temp = np.zeros(3)
        label = i[2].strip()
        if label == "CONTRADICTION":
            temp[0] = 1
        if label == "NEUTRAL":
            temp[1] = 1
        if label == "ENTAILMENT":
            temp[2] = 1
        scores.append(temp)
    scores = np.matrix(scores) + 0.000001
    scores = np.asarray(scores, dtype='float32')
    return (scores, g1x, g1mask, g2x, g2mask)

def getDataSentiment(batch):
    g1 = []
    for i in batch:
        g1.append(i[0].embeddings)
    g1x, g1mask = prepare_data(g1)
    scores = []
    for i in batch:
        temp = np.zeros(2)
        label = i[1].strip()
        if label == "0":
            temp[0] = 1
        if label == "1":
            temp[1] = 1
        scores.append(temp)
    scores = np.matrix(scores) + 0.000001
    scores = np.asarray(scores, dtype='float32')
    return (scores, g1x, g1mask)
def sentences2idx(sentences, words):
    """
    Given a list of sentences, output array of word indices that can be fed into the algorithms.
    :param sentences: a list of sentences
    :param words: a dictionary, words['str'] is the indices of the word 'str'
    :return: x1, m1. x1[i, :] is the word indices in sentence i, m1[i,:] is the mask for sentence i (0 means no word at the location)
    """
    seq1 = []  # missing in the original code
    for i in sentences:
        seq1.append(getSeq(i, words))
    x1, m1 = prepare_data(seq1)
    return x1, m1
def sentiment2idx(sentiment_file, words):
    """
    Read sentiment data file, output array of word indices that can be fed into the algorithms.
    :param sentiment_file: file name
    :param words: a dictionary, words['str'] is the indices of the word 'str'
    :return: x1, m1, golds. x1[i, :] is the word indices in sentence i, m1[i,:] is the mask for sentence i (0 means no word at the location), golds[i] is the label (0 or 1) for sentence i.
    """
    f = open(sentiment_file, 'r')
    lines = f.readlines()
    golds = []
    seq1 = []
    for i in lines:
        i = i.split("\t")
        p1 = i[0]; score = int(i[1])  # scores are labels 0 and 1
        X1 = getSeq(p1, words)
        seq1.append(X1)
        golds.append(score)
    x1, m1 = prepare_data(seq1)
    return x1, m1, golds

def sim2idx(sim_file, words):
    """
    Read similarity data file, output array of word indices that can be fed into the algorithms.
    :param sim_file: file name
    :param words: a dictionary, words['str'] is the indices of the word 'str'
    :return: x1, m1, x2, m2, golds. x1[i, :] is the word indices in the first sentence in pair i, m1[i,:] is the mask for the first sentence in pair i (0 means no word at the location), golds[i] is the score for pair i (float). x2 and m2 are similar to x1 and m1 but for the second sentence in the pair.
    """
    f = open(sim_file, 'r')
    lines = f.readlines()
    golds = []
    seq1 = []
    seq2 = []
    for i in lines:
        i = i.split("\t")
        p1 = i[0]; p2 = i[1]; score = float(i[2])
        X1, X2 = getSeqs(p1, p2, words)
        seq1.append(X1)
        seq2.append(X2)
        golds.append(score)
    x1, m1 = prepare_data(seq1)
    x2, m2 = prepare_data(seq2)
    return x1, m1, x2, m2, golds

def entailment2idx(sim_file, words):
    """
    Read entailment data file, output array of word indices that can be fed into the algorithms.
    :param sim_file: file name
    :param words: a dictionary, words['str'] is the indices of the word 'str'
    :return: x1, m1, x2, m2, golds. x1[i, :] is the word indices in the first sentence in pair i, m1[i,:] is the mask for the first sentence in pair i (0 means no word at the location), golds[i] is the label for pair i (CONTRADICTION NEUTRAL ENTAILMENT). x2 and m2 are similar to x1 and m1 but for the second sentence in the pair.
    """
    f = open(sim_file, 'r')
    lines = f.readlines()
    golds = []
    seq1 = []
    seq2 = []
    for i in lines:
        i = i.split("\t")
        p1 = i[0]; p2 = i[1]; score = i[2]
        X1, X2 = getSeqs(p1, p2, words)
        seq1.append(X1)
        seq2.append(X2)
        golds.append(score)
    x1, m1 = prepare_data(seq1)
    x2, m2 = prepare_data(seq2)
    return x1, m1, x2, m2, golds
def getWordWeight(word2weight, a=1e-3):
    # word2weight is gensim's model.wv.vocab: a dict of word -> vocab entry with frequency statistics
    if a <= 0:  # when the parameter makes no sense, use unweighted
        a = 1.0
    # the original version read the frequencies from a weight file such as enwiki_vocab_min200.txt:
    # word2weight = {}
    # with open(weightfile) as f:
    #     lines = f.readlines()
    # N = 0
    # for i in lines:
    #     i = i.strip()
    #     if (len(i) > 0):
    #         i = i.split()
    #         if (len(i) == 2):
    #             word2weight[i[0]] = float(i[1])
    #             N += float(i[1])
    #         else:
    #             print(i)
    for key, value in word2weight.items():
        # smooth inverse frequency weight computed from the statistics stored in the gensim vocab entry
        word2weight[key] = a / (a + value.count / value.sample_int)
    return word2weight
def getWeight(words, word2weight):
    weight4ind = {}
    for word, ind in words.items():
        if word in word2weight:
            weight4ind[ind] = word2weight[word]
        else:
            weight4ind[ind] = 1.0
    return weight4ind

def seq2weight(seq, mask, weight4ind):
    weight = np.zeros(seq.shape).astype('float32')
    for i in range(seq.shape[0]):
        for j in range(seq.shape[1]):
            if mask[i, j] > 0 and seq[i, j] >= 0:
                weight[i, j] = weight4ind[seq[i, j]]
    weight = np.asarray(weight, dtype='float32')
    return weight
def getIDFWeight(wordfile, save_file=''):
    def getDataFromFile(f, words):
        f = open(f, 'r')
        lines = f.readlines()
        golds = []
        seq1 = []
        seq2 = []
        for i in lines:
            i = i.split("\t")
            p1 = i[0]; p2 = i[1]; score = float(i[2])
            X1, X2 = getSeqs(p1, p2, words)
            seq1.append(X1)
            seq2.append(X2)
            golds.append(score)
        x1, m1 = prepare_data(seq1)
        x2, m2 = prepare_data(seq2)
        return x1, m1, x2, m2

    prefix = "../data/"
    farr = ["MSRpar2012"]
    #farr = ["MSRpar2012",
    #        "MSRvid2012",
    #        "OnWN2012",
    #        "SMTeuro2012",
    #        "SMTnews2012", # 4
    #        "FNWN2013",
    #        "OnWN2013",
    #        "SMT2013",
    #        "headline2013", # 8
    #        "OnWN2014",
    #        "deft-forum2014",
    #        "deft-news2014",
    #        "headline2014",
    #        "images2014",
    #        "tweet-news2014", # 14
    #        "answer-forum2015",
    #        "answer-student2015",
    #        "belief2015",
    #        "headline2015",
    #        "images2015", # 19
    #        "sicktest",
    #        "twitter",
    #        "JHUppdb",
    #        "anno-dev",
    #        "anno-test"]
    (words, We) = getWordmap(wordfile)
    df = np.zeros((len(words),))
    dlen = 0
    for f in farr:
        g1x, g1mask, g2x, g2mask = getDataFromFile(prefix + f, words)
        dlen += g1x.shape[0]
        dlen += g2x.shape[0]
        for i in range(g1x.shape[0]):
            for j in range(g1x.shape[1]):
                if g1mask[i, j] > 0:
                    df[g1x[i, j]] += 1
        for i in range(g2x.shape[0]):
            for j in range(g2x.shape[1]):
                if g2mask[i, j] > 0:
                    df[g2x[i, j]] += 1
    weight4ind = {}
    for i in range(len(df)):
        weight4ind[i] = np.log2((dlen + 2.0) / (1.0 + df[i]))
    if save_file:
        pickle.dump(weight4ind, open(save_file, 'wb'))  # binary mode for pickle under Python 3
    return weight4ind
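For reference, if you would rather estimate p(w) the way the paper defines it, as a word's count divided by the total number of tokens, instead of the count/sample_int ratio used in getWordWeight above, a drop-in variant could look like the sketch below. It is my own addition, not part of the repo, and assumes the gensim 3.x wv.vocab API:
def getWordWeightFromCounts(vocab, a=1e-3):
    """SIF weights a / (a + p(w)) with p(w) = count / total token count (gensim 3.x wv.vocab)."""
    if a <= 0:  # when the parameter makes no sense, fall back to unweighted
        a = 1.0
    total = float(sum(v.count for v in vocab.values()))
    return {word: a / (a + v.count / total) for word, v in vocab.items()}
Its return value can be fed to getWeight exactly like the output of getWordWeight.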
Removing the correction term: SIF_embedding.py
As the code shows, the component to remove is found with truncated SVD, which is closely related to PCA and is commonly used for dimensionality reduction.
svd.components_ is a matrix whose rows are the top singular vectors; read in topic-modeling terms, each row is a component's distribution over the input dimensions, which tells you which dimensions contribute most to it.
The remove_pc function then subtracts this component. In the paper's words: remove (subtract) the projection of every sentence vector onto the first principal component (singular vector) of the matrix formed by all sentence vectors, i.e. each sentence vector v becomes v - (v·u)u, where u is that first singular vector.
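To make the operation concrete, here is a toy check with random data (not the SIF pipeline itself) showing that subtracting each row's projection onto the first singular vector leaves nothing along that direction:
import numpy as np
from sklearn.decomposition import TruncatedSVD

X = np.random.rand(5, 8)                              # five toy "sentence vectors"
u = TruncatedSVD(n_components=1).fit(X).components_   # first singular vector, shape (1, 8)
XX = X - X.dot(u.T) * u                               # subtract each row's projection onto u
print(np.allclose(XX.dot(u.T), 0))                    # True: nothing is left along u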
Changes:
- In get_weighted_average, the weighted vectors are now built with Python 3 syntax. Testing shows that for the same sentences, the original code under Python 2 and the modified code under Python 3 produce identical results.
import numpy as np
from sklearn.decomposition import TruncatedSVD

def get_weighted_average(We, x, w):
    """
    Compute the weighted average vectors
    :param We: We[i,:] is the vector for word i
    :param x: x[i, :] are the indices of the words in sentence i
    :param w: w[i, :] are the weights for the words in sentence i
    :return: emb[i, :] are the weighted average vector for sentence i
    """
    n_samples = x.shape[0]
    emb = np.zeros((n_samples, We.shape[1]))
    for i in range(n_samples):
        # emb[i] = w[i].dot(np.array(We[x[i]])) / np.count_nonzero(w[i])
        for j in range(len(w[i])):
            emb[i] += w[i][j] * np.array(We[x[i]][j])
        emb[i] = emb[i] / np.count_nonzero(w[i])
    return emb

def compute_pc(X, npc=1):
    """
    Compute the principal components. DO NOT MAKE THE DATA ZERO MEAN!
    :param X: X[i,:] is a data point
    :param npc: number of principal components to remove
    :return: component_[i,:] is the i-th pc
    """
    svd = TruncatedSVD(n_components=npc, n_iter=7, random_state=0)
    svd.fit(X)
    return svd.components_

def remove_pc(X, npc=1):
    """
    Remove the projection on the principal components
    :param X: X[i,:] is a data point
    :param npc: number of principal components to remove
    :return: XX[i, :] is the data point after removing its projection
    """
    pc = compute_pc(X, npc)
    if npc == 1:
        XX = X - X.dot(pc.transpose()) * pc
    else:
        XX = X - X.dot(pc.transpose()).dot(pc)
    return XX

def SIF_embedding(We, x, w, params):
    """
    Compute the sentence embeddings using weighted average + removing the projection on the first principal component
    :param We: We[i,:] is the vector for word i
    :param x: x[i, :] are the indices of the words in the i-th sentence
    :param w: w[i, :] are the weights for the words in the i-th sentence
    :param params.rmpc: if >0, remove the projections of the sentence embeddings to their first principal component
    :return: emb, emb[i, :] is the embedding for sentence i
    """
    emb = get_weighted_average(We, x, w)
    if params.rmpc > 0:
        emb = remove_pc(emb, params.rmpc)
    return emb
The paper's experiments show that the method is genuinely competitive: on most datasets it beats plain word-vector averaging and TF-IDF-weighted averaging, and with PSL word vectors it even reaches the best results. On sentence-similarity tasks it is reported to be above average and to beat some far more complex models, and on sentence classification the gains are also clear, in some cases the best score.
Practical notes
- I compared SIF against plain word-vector averaging and TF-IDF weighting, all using 100-dimensional Word2vec vectors, on a similarity-matching task, measuring the distance between vectors with both Euclidean distance and cosine similarity. For me, SIF actually performed worse than the two simpler methods.
- Also, for two very similar sentences, the signs of the SIF vectors come out exactly opposite. For the two very similar Chinese sentences '这是一个测试句子' ("This is a test sentence") and '这是另一个测试句子' ("This is another test sentence"), every component of the two sentence vectors has the opposite sign, which I have not been able to explain. This should be unfavourable for Euclidean-distance computation, because the Euclidean distance between these two sentences does not end up being large, so if you use SIF it is best to do similarity matching with cosine distance (a small sketch follows the output below).
这是一个测试句子
[ 0.07638578 -0.15427788 0.04004123 -0.11843429 -0.06013182 0.03942103
-0.01382917 0.01305546 0.06177262 -0.02547832 -0.04165836 0.02171577
0.03483471 0.05667425 -0.117093 -0.02521048 -0.00686271 -0.02931183
0.05059035 -0.02502487 -0.00903647 0.00778577 0.01954736 -0.03124137
0.10074088 0.02835767 -0.08591071 -0.05027893 0.09560275 -0.08829507
-0.07332305 -0.06830808 0.09723447 0.01102427 -0.10592448 -0.01029612
0.07102155 -0.03058108 -0.01676355 -0.06929373 -0.05900271 -0.05584531
-0.00446632 0.07027014 0.14057033 -0.05284498 -0.02534611 -0.01722914
-0.07428796 -0.05775267 -0.00475082 0.00043147 -0.0978087 0.08172205
-0.10074747 -0.03555521 -0.08807748 0.07520326 0.01554954 -0.00893718
0.07821482 0.00935646 0.0465772 0.00160614 -0.05490717 -0.01119706
-0.04844879 -0.06298091 0.01656367 0.00719948 0.12924895 -0.00991099
0.08364741 -0.00887778 -0.05152184 0.10083027 0.0076994 0.03921235
0.00199744 0.0446614 -0.06055355 0.12712339 0...]
这是另一个测试句子
[-0.08652813 0.1747626 -0.04535782 0.13415977 0.06811601 -0.04465528
0.01566538 -0.01478894 -0.06997467 0.02886128 0.04718968 -0.02459915
-0.03946 -0.06419934 0.13264038 0.02855788 0.00777393 0.0332038
-0.05730764 0.02834763 0.01023632 -0.00881955 -0.02214282 0.03538954
-0.11411705 -0.03212295 0.09731777 0.05695486 -0.10829669 0.10001872
0.08305874 0.07737789 -0.11014507 -0.01248805 0.11998893 0.01166321
-0.08045165 0.03464158 0.01898938 0.07849441 0.06683698 0.06326034
0.00505935 -0.07960047 -0.15923499 0.05986164 0.02871152 0.01951679
0.08415177 0.06542096 0.00538162 -0.00048876 0.11079555 -0.09257294
0.11412452 0.04027616 0.09977224 -0.08518861 -0.01761418 0.01012385
-0.08860003 -0.0105988 -0.05276163 -0.0018194 0.06219764 0.01268379
0.05488173 0.0713434 -0.01876297 -0.00815541 -0.14641037 0.01122695
-0.09475394 0.01005656 0.05836281 -0.11421831 -0.00872171 -0.04441888
-0.00226266 -0.05059145 0.06859373 -0.14400259 -0....]
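For completeness, the cosine measurement mentioned above can be as simple as the sketch below. The two arrays are only stand-ins built from the first few components printed above; in practice you would plug in the full rows of embedding from sif_embedding.py:
import numpy as np

def cosine_similarity(u, v):
    # cosine similarity lies in [-1, 1]; cosine distance is 1 - similarity
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))

emb1 = np.array([0.0764, -0.1543, 0.0400])   # stand-in for embedding[0, :]
emb2 = np.array([-0.0865, 0.1748, -0.0454])  # stand-in for embedding[1, :]
print(cosine_similarity(emb1, emb2))          # close to -1 for these sign-flipped vectors
print(np.linalg.norm(emb1 - emb2))            # Euclidean distance, for comparison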
I may upload the code to GitHub later.