Preprocessing Words for NLP

2020-01-09  IT_小马哥

Related knowledge

General steps for training a neural network

During training, remember to zero the gradients, run backpropagation, and then update the parameters.
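
A minimal sketch of one training step, assuming a model, loss_function, optimizer, and a batch of inputs/targets already exist; the full loop used in this post appears later.

optimizer.zero_grad()                     # 1. zero the gradients left over from the previous step
log_probs = model(inputs)                 # forward pass
loss = loss_function(log_probs, targets)
loss.backward()                           # 2. backpropagation computes the new gradients
optimizer.step()                          # 3. update the parameters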

Word embeddings in PyTorch

import torch
import torch.nn as nn

word_to_ix = {"hello": 0, "world": 1}
embeds = nn.Embedding(2, 5)  # 2 words, each embedded in a 5-dimensional space; weights start out random
lookup_tensor = torch.tensor([word_to_ix['hello']], dtype=torch.long)  # indices must be a Long tensor
print(lookup_tensor)  # the index
hello_embed = embeds(lookup_tensor)
print(hello_embed)  # the word's vector representation; the model is untrained, so the values are random
tensor([0])  # print(lookup_tensor)
tensor([[ 0.6614,  0.2669,  0.0617,  0.6213, -0.4519]],
       grad_fn=<EmbeddingBackward>)  # print(hello_embed)
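
As a small extension that is not in the original post, an nn.Embedding can look up several indices at once, returning one row per index:

both = torch.tensor([word_to_ix['hello'], word_to_ix['world']], dtype=torch.long)
print(embeds(both).shape)  # torch.Size([2, 5]): one 5-dimensional vector per word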

The general preprocessing approach

test_sentence = """When forty winters shall besiege thy brow,
And dig deep trenches in thy beauty's field,
Thy youth's proud livery so gazed on now,
Will be a totter'd weed of small worth held:
Then being asked, where all thy beauty lies,
Where all the treasure of thy lusty days;
To say, within thine own deep sunken eyes,
Were an all-eating shame, and thriftless praise.
How much more praise deserv'd thy beauty's use,
If thou couldst answer 'This fair child of mine
Shall sum my count, and make my old excuse,'
Proving his beauty by succession thine!
This were to be new made when thou art old,
And see thy blood warm when thou feel'st it cold.""".split()
vocab = set(test_sentence)  # deduplicate the words
word_to_ix = {word: i for i, word in enumerate(vocab)}  # map each word to an index
print(word_to_ix)
{"'This": 0, 'When': 1, "youth's": 2, 'Thy': 3, 'held:': 4, 'Were': 5, 'praise.': 6, "totter'd": 7, 'thine': 8, 'couldst': 9, 'small': 10, 'in': 11, 'gazed': 12, 'beauty': 13, 'thou': 14, 'count,': 15, 'of': 16, 'by': 17, 'on': 18, 'it': 19, 'own': 20, 'my': 21, 'praise': 22, 'to': 23, "beauty's": 24, 'deep': 25, 'This': 26, 'art': 27, 'sum': 28, 'And': 29, 'the': 30, 'now,': 31, 'livery': 32, 'made': 33, 'see': 34, 'days;': 35, 'fair': 36, 'thine!': 37, 'trenches': 38, 'shall': 39, 'thriftless': 40, 'How': 41, 'where': 42, 'treasure': 43, 'say,': 44, 'his': 45, 'Then': 46, 'To': 47, 'shame,': 48, 'mine': 49, 'worth': 50, 'Will': 51, 'sunken': 52, 'child': 53, 'If': 54, 'old,': 55, 'a': 56, 'asked,': 57, 'being': 58, 'Proving': 59, "feel'st": 60, 'all': 61, 'besiege': 62, 'use,': 63, 'answer': 64, 'succession': 65, 'brow,': 66, 'blood': 67, 'were': 68, 'new': 69, "excuse,'": 70, 'dig': 71, 'so': 72, 'lusty': 73, 'all-eating': 74, 'proud': 75, 'much': 76, 'cold.': 77, 'winters': 78, 'Shall': 79, 'forty': 80, 'thy': 81, 'be': 82, 'old': 83, 'lies,': 84, 'warm': 85, 'within': 86, 'make': 87, 'weed': 88, "deserv'd": 89, 'when': 90, 'Where': 91, 'more': 92, 'and': 93, 'field,': 94, 'an': 95, 'eyes,': 96}
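
One small helper that is not in the original but is useful for decoding predictions later: the inverse mapping from index back to word.

ix_to_word = {ix: word for word, ix in word_to_ix.items()}
print(ix_to_word[80])  # 'forty'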

Creating a neural network

import torch
import torch.nn as nn  # loss functions, fully connected layers, etc.
import torch.nn.functional as F  # activation functions, etc.
import torch.optim as optim  # optimizers such as gradient descent

class NGramLanguageModeler(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_size):
        super(NGramLanguageModeler, self).__init__()  # call the parent class constructor
        pass

    def forward(self, input_data):  # must be implemented
        pass

Creating the training data

The training pairs are built from test_sentence; the two ways of doing this (trigrams and CBOW contexts) are shown in the sections below.

Creating the loss function

loss_function = nn.NLLLoss()
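
NLLLoss expects log-probabilities as input, which is why the model below ends with log_softmax. As a hedged sanity check (the numbers here are made up), this pairing is equivalent to CrossEntropyLoss applied to raw scores:

logits = torch.randn(1, 5)                      # hypothetical raw scores for 5 classes
target = torch.tensor([2])
nll = nn.NLLLoss()(F.log_softmax(logits, dim=1), target)
ce = nn.CrossEntropyLoss()(logits, target)
print(torch.allclose(nll, ce))                  # True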

Initialization

EMBEDDING_DIM = 10  # dimensionality of the embedding space (an arbitrary choice)
CONTEXT_SIZE = 2    # number of context words: the two words preceding the target
model = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)

Choosing an optimizer

# used directly to update the parameters
optimizer = optim.SGD(model.parameters(), lr=0.001)

A complete training loop

losses = []
for epoch in range(10):  # train for 10 epochs here
    total_loss = 0
    for context, target in trigrams:  # context is the two preceding words, target is the third word
        # get the indices of the context words, e.g. [1, 80], and convert them to a tensor
        context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long)

        model.zero_grad()  # zero the gradients at every step
        # calling the instance like a function runs its forward method
        log_probs = model(context_idxs)  # get the model's output
        # the loss function takes the output and the target tensor
        loss = loss_function(log_probs, torch.tensor([word_to_ix[target]], dtype=torch.long))

        loss.backward()  # backpropagation
        optimizer.step()  # update the parameters

        total_loss += loss.item()  # accumulate the loss

    losses.append(total_loss)  # store each epoch's loss, e.g. for plotting later
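
After training, the model can be asked for the most likely next word. This is a sketch that is not in the original post, reusing the ix_to_word mapping built earlier:

with torch.no_grad():
    context_idxs = torch.tensor([word_to_ix[w] for w in ['When', 'forty']], dtype=torch.long)
    predicted_ix = model(context_idxs).argmax(dim=1).item()
    print(ix_to_word[predicted_ix])  # ideally 'winters' once the model has learned this trigram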

Processing for the N-Gram model

trigrams = [([test_sentence[i], test_sentence[i + 1]], test_sentence[i + 2]) for i in range(len(test_sentence) - 2)]
# print the first 3 entries to see what they look like
print(trigrams[:3])
[(['When', 'forty'], 'winters'), (['forty', 'winters'], 'shall'), (['winters', 'shall'], 'besiege')]
class NGramLanguageModeler(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_size):
        super(NGramLanguageModeler, self).__init__()
        # the word embedding: vocab_size words, each represented in an embedding_dim-dimensional space
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)  # the concatenated vectors of the 2 context words are the input
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((1, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs
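
A quick shape check, not in the original, to illustrate the tensor flow through forward (EMBEDDING_DIM and CONTEXT_SIZE are the values assumed in the initialization section above):

m = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
print(m(torch.tensor([1, 80], dtype=torch.long)).shape)  # torch.Size([1, 97]): one log-probability per vocabulary word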

For example, for the training sample (['When', 'forty'], 'winters'), the input is tensor([ 1, 80]): 'When' maps to 1 and 'forty' maps to 80.

# get the indices of the context words, e.g. [1, 80], and convert them to a tensor
context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long)

For the same sample (['When', 'forty'], 'winters'), the target is 'winters', i.e. tensor([78]), since 'winters' maps to 78.

torch.tensor([word_to_ix[target]], dtype=torch.long)

Processing for the Continuous Bag-of-Words (CBOW) model

raw_text = test_sentence
data = []
for i in range(2, len(raw_text) - 2):
    context = [raw_text[i - 2], raw_text[i - 1], raw_text[i + 1], raw_text[i + 2]]
    target = raw_text[i]
    data.append((context, target))
print(data[:5])
[(['When', 'forty', 'shall', 'besiege'], 'winters'), (['forty', 'winters', 'besiege', 'thy'], 'shall'), (['winters', 'shall', 'thy', 'brow,'], 'besiege'), (['shall', 'besiege', 'brow,', 'And'], 'thy'), (['besiege', 'thy', 'And', 'dig'], 'brow,')]

Take (['When', 'forty', 'shall', 'besiege'], 'winters') as an example: the goal is to predict the middle word 'winters' from the context words 'When', 'forty', 'shall', and 'besiege'.
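
The original post stops after building the CBOW training data, so the model below is only a hedged sketch of what a matching network could look like; the class name CBOW and the choice to sum the context embeddings are mine, not the author's.

class CBOW(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, inputs):
        # sum the embeddings of the four context words into one vector of size embedding_dim
        embeds = self.embeddings(inputs).sum(dim=0).view(1, -1)
        return F.log_softmax(self.linear(embeds), dim=1)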

def make_context_vector(context, word_to_ix):
    idxs = [word_to_ix[w] for w in context]
    return torch.tensor(idxs, dtype=torch.long)

print(make_context_vector(data[0][0], word_to_ix))  # example

The program above prints the tensor below: 'When' maps to 1, 'forty' to 80, 'shall' to 39, and 'besiege' to 62.

tensor([ 1, 80, 39, 62])
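
To tie the CBOW pieces together, here is a hedged training sketch that is not in the original, reusing make_context_vector, the CBOW sketch above, and the same loss function; the hyperparameters are arbitrary:

cbow_model = CBOW(len(vocab), EMBEDDING_DIM)
cbow_optimizer = optim.SGD(cbow_model.parameters(), lr=0.001)
for epoch in range(10):
    for context, target in data:
        cbow_model.zero_grad()
        log_probs = cbow_model(make_context_vector(context, word_to_ix))
        loss = loss_function(log_probs, torch.tensor([word_to_ix[target]], dtype=torch.long))
        loss.backward()
        cbow_optimizer.step()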