GPT图解 (GPT Illustrated): Code Notes - the S2S (Seq2Seq) Framework

2024-02-21  万州客

1. Code

# Corpus: [Chinese source, English decoder input (starts with <sos>), English target (ends with <eos>)]
sentences = [
    ['咖哥 喜欢 小冰','<sos> KaGe likes XiaoBing', 'KaGe likes XiaoBing <eos>'],
    ['我 爱 学习 人工智能','<sos> I love studying AI', 'I love studying AI <eos>'],
    ['深度学习 改变 世界','<sos> DL changed the world', 'DL changed the world <eos>'],
    ['自然 语言 处理 很 强大','<sos> NLP is so powerful', 'NLP is so powerful <eos>'],
    ['神经网络 非常 复杂','<sos> Neural-Nets are complex', 'Neural-Nets are complex <eos>']
]

# Collect Chinese and English words from the corpus
word_list_cn, word_list_en = [], []
for s in sentences:
    word_list_cn.extend(s[0].split())
    word_list_en.extend(s[1].split())
    word_list_en.extend(s[2].split())

# Deduplicate to get the two vocabularies
word_list_cn = list(set(word_list_cn))
word_list_en = list(set(word_list_en))

# Word -> index lookups, used to encode sentences as tensors
word2idx_cn = {w: i for i, w in enumerate(word_list_cn)}
word2idx_en = {w: i for i, w in enumerate(word_list_en)}

# Index -> word lookups, used to decode predictions back to text
idx2word_cn = {i: w for i, w in enumerate(word_list_cn)}
idx2word_en = {i: w for i, w in enumerate(word_list_en)}

voc_size_cn = len(word_list_cn)
voc_size_en = len(word_list_en)

print("句子数量: ", len(sentences))
print("中文词汇表大小: ", voc_size_cn)
print("英文词汇表大小: ", voc_size_en)
print("中文词汇到索引的字典: ", word2idx_cn)
print("英文词汇到索引的字典: ", word2idx_en)

import numpy as np
import torch
import random

def make_data(sentences):
    # Sample one random sentence triple and turn it into index tensors
    random_sentence = random.choice(sentences)
    encoder_input = np.array([[word2idx_cn[n] for n in random_sentence[0].split()]])
    decoder_input = np.array([[word2idx_en[n] for n in random_sentence[1].split()]])
    target = np.array([[word2idx_en[n] for n in random_sentence[2].split()]])
    # Each tensor has shape (1, seq_len): a batch of one, so no padding is needed
    encoder_input = torch.LongTensor(encoder_input)
    decoder_input = torch.LongTensor(decoder_input)
    target = torch.LongTensor(target)
    return encoder_input, decoder_input, target

encoder_input, decoder_input, target = make_data(sentences)
# Recover which corpus sentence was sampled by matching its word indices
for s in sentences:
    if all([word2idx_cn[w] in encoder_input[0] for w in s[0].split()]):
        original_sentence = s
        break

print("原始句子: ", original_sentence)
print('编码器输入张量的形状:', encoder_input.shape)
print('解码器输入张量的形状: ', decoder_input.shape)
print('目标张量的形状: ', target.shape)
print('编码器输入张量: ',encoder_input)
print('解码器输入张量: ', decoder_input)
print('目标张量: ', target)
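
Note how the decoder input and the target relate: the decoder input is the target shifted right by one step, with <sos> prepended, which is classic teacher forcing. A minimal check, assuming the tensors produced above:

# Sketch (assumption): decoder_input = [<sos>] + target[:-1] for every sample
assert decoder_input[0, 0].item() == word2idx_en['<sos>']
assert target[0, -1].item() == word2idx_en['<eos>']
assert torch.equal(decoder_input[0, 1:], target[0, :-1])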

import torch.nn as nn

class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)  # token ids -> dense vectors
        self.rnn = nn.RNN(hidden_size, hidden_size, batch_first=True)

    def forward(self, inputs, hidden):
        embedded = self.embedding(inputs)
        output, hidden = self.rnn(embedded, hidden)  # final hidden state summarizes the source
        return output, hidden


class Decoder(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.rnn = nn.RNN(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)  # hidden state -> vocabulary logits

    def forward(self, inputs, hidden):
        embedded = self.embedding(inputs)
        output, hidden = self.rnn(embedded, hidden)
        output = self.out(output)
        return output, hidden

n_hidden = 128  # hidden (and embedding) dimension
encoder = Encoder(voc_size_cn, n_hidden)
decoder = Decoder(n_hidden, voc_size_en)
print('Encoder structure: ', encoder)
print('Decoder structure: ', decoder)
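
A quick shape check (an illustrative sketch, not in the original): with batch_first=True, the encoder returns per-step outputs of shape (batch, seq_len, hidden) plus a final hidden state of shape (num_layers, batch, hidden):

# Hypothetical smoke test of the tensor shapes
x = torch.LongTensor([[0, 1, 2]])            # (batch=1, seq_len=3), any ids < voc_size_cn
h0 = torch.zeros(1, x.size(0), n_hidden)     # (num_layers=1, batch, hidden)
enc_out, enc_h = encoder(x, h0)
print(enc_out.shape, enc_h.shape)            # torch.Size([1, 3, 128]) torch.Size([1, 1, 128])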

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, enc_input, hidden, dec_input):
        encoder_output, encoder_hidden = self.encoder(enc_input, hidden)
        # The encoder's final hidden state initializes the decoder (the "context")
        decoder_hidden = encoder_hidden
        decoder_output, _ = self.decoder(dec_input, decoder_hidden)
        return decoder_output

model = Seq2Seq(encoder, decoder)
print('S2S model structure: ', model)

def train_seq2seq(model, criterion, optimizer, epochs):
    for epoch in range(epochs):
        # Each "epoch" trains on one randomly sampled sentence
        encoder_input, decoder_input, target = make_data(sentences)
        hidden = torch.zeros(1, encoder_input.size(0), n_hidden)  # initial RNN state
        optimizer.zero_grad()
        output = model(encoder_input, hidden, decoder_input)
        # Flatten to (tokens, vocab) vs (tokens,) for cross-entropy
        loss = criterion(output.view(-1, voc_size_en), target.view(-1))
        if (epoch + 1) % 40 == 0:
            print(f'Epoch: {epoch + 1:04d} cost = {loss:.6f}')
        loss.backward()
        optimizer.step()

epochs = 400
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
train_seq2seq(model, criterion, optimizer, epochs)
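
For a 4-token target the model's logits have shape (1, 4, 20); the view() calls flatten batch and time so that cross-entropy scores each token independently. A tiny illustration of that reshape (my example values, mirroring the loss line above):

# Sketch: how the loss flattens (batch, seq_len, vocab) into (tokens, vocab)
logits = torch.randn(1, 4, voc_size_en)        # stand-in for model output
labels = torch.LongTensor([[16, 3, 19, 11]])   # example target indices, all < voc_size_en
print(nn.CrossEntropyLoss()(logits.view(-1, voc_size_en), labels.view(-1)))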

def test_seq2seq(model, source_sentence):
    encoder_input = np.array([[word2idx_cn[n] for n in source_sentence.split()]])
    # Decoder input is <sos> followed by <eos> padding, fed all at once
    # (no step-by-step feedback of the model's own predictions)
    decoder_input = np.array([word2idx_en['<sos>']] + [word2idx_en['<eos>']] * (len(encoder_input[0]) - 1))
    encoder_input = torch.LongTensor(encoder_input)
    decoder_input = torch.LongTensor(decoder_input).unsqueeze(0)
    hidden = torch.zeros(1, encoder_input.size(0), n_hidden)
    predict = model(encoder_input, hidden, decoder_input)
    predict = predict.data.max(2, keepdim=True)[1]  # argmax over the vocabulary
    print(source_sentence, '->', [idx2word_en[n.item()] for n in predict.squeeze()])

test_seq2seq(model, '咖哥 喜欢 小冰')
test_seq2seq(model, '自然 语言 处理 很 强大')
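
Because test_seq2seq hands the decoder its whole input up front (<sos> followed by <eos> padding) instead of feeding each prediction back in, longer sentences can derail, as the second test in the output below shows. A common alternative is step-by-step greedy decoding; here is a sketch under my own naming (greedy_decode is not part of the original script):

def greedy_decode(model, source_sentence, max_len=10):
    # Encode the source once, then decode one token at a time,
    # feeding each predicted token back as the next decoder input
    enc_in = torch.LongTensor([[word2idx_cn[w] for w in source_sentence.split()]])
    hidden = torch.zeros(1, 1, n_hidden)
    _, hidden = model.encoder(enc_in, hidden)
    token = torch.LongTensor([[word2idx_en['<sos>']]])
    result = []
    for _ in range(max_len):
        output, hidden = model.decoder(token, hidden)
        token = output.argmax(2)            # (1, 1): index of the most likely word
        word = idx2word_en[token.item()]
        if word == '<eos>':
            break
        result.append(word)
    return result

print(greedy_decode(model, '自然 语言 处理 很 强大'))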

2. Output

D:\Python310\python.exe D:\tmp\GPT\S2S.py 
Number of sentences:  5
Chinese vocabulary size:  18
English vocabulary size:  20
Chinese word-to-index dictionary:  {'学习': 0, '非常': 1, '自然': 2, '语言': 3, '爱': 4, '世界': 5, '深度学习': 6, '处理': 7, '很': 8, '改变': 9, '神经网络': 10, '人工智能': 11, '复杂': 12, '咖哥': 13, '我': 14, '喜欢': 15, '小冰': 16, '强大': 17}
English word-to-index dictionary:  {'so': 0, 'are': 1, 'studying': 2, 'likes': 3, 'the': 4, 'changed': 5, 'DL': 6, 'AI': 7, 'is': 8, 'powerful': 9, 'love': 10, '<eos>': 11, 'complex': 12, '<sos>': 13, 'world': 14, 'I': 15, 'KaGe': 16, 'NLP': 17, 'Neural-Nets': 18, 'XiaoBing': 19}
Original sentence:  ['咖哥 喜欢 小冰', '<sos> KaGe likes XiaoBing', 'KaGe likes XiaoBing <eos>']
Encoder input tensor shape: torch.Size([1, 3])
Decoder input tensor shape:  torch.Size([1, 4])
Target tensor shape:  torch.Size([1, 4])
Encoder input tensor:  tensor([[13, 15, 16]])
Decoder input tensor:  tensor([[13, 16,  3, 19]])
Target tensor:  tensor([[16,  3, 19, 11]])
Encoder structure:  Encoder(
  (embedding): Embedding(18, 128)
  (rnn): RNN(128, 128, batch_first=True)
)
Decoder structure:  Decoder(
  (embedding): Embedding(20, 128)
  (rnn): RNN(128, 128, batch_first=True)
  (out): Linear(in_features=128, out_features=20, bias=True)
)
S2S model structure:  Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(18, 128)
    (rnn): RNN(128, 128, batch_first=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(20, 128)
    (rnn): RNN(128, 128, batch_first=True)
    (out): Linear(in_features=128, out_features=20, bias=True)
  )
)
Epoch: 0040 cost = 0.645620
Epoch: 0080 cost = 0.061052
Epoch: 0120 cost = 0.035644
Epoch: 0160 cost = 0.037087
Epoch: 0200 cost = 0.019005
Epoch: 0240 cost = 0.017388
Epoch: 0280 cost = 0.011249
Epoch: 0320 cost = 0.010387
Epoch: 0360 cost = 0.010690
Epoch: 0400 cost = 0.005080
咖哥 喜欢 小冰 -> ['KaGe', 'likes', 'XiaoBing']
自然 语言 处理 很 强大 -> ['NLP', 'is', 'so', '<eos>', 'Neural-Nets']

Process finished with exit code 0
