PyTorch Deep Learning in Practice, Part 28 - Takeout Review Sentiment Prediction (Chinese)

2023-05-09  薛东弗斯
pip install pandas jieba scikit-learn -i https://pypi.doubanio.com/simple
import torch
import torchtext
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import jieba
from torch.utils.data import DataLoader

data = pd.read_csv(r'D:/data/waimai_10k.csv')

# data.head()
#       label   review
#   0   1   很快,好吃,味道足,量大
#   1   1   没有送水没有送水没有送水
#   2   1   非常快,态度好。
#   3   1   方便,快捷,味道可口,快递给力
#   4   1   菜味道很棒!送餐很及时!

# data.info()
#   <class 'pandas.core.frame.DataFrame'>
#   RangeIndex: 11987 entries, 0 to 11986
#   Data columns (total 2 columns):
#    #   Column  Non-Null Count  Dtype 
#   ---  ------  --------------  ----- 
#    0   label   11987 non-null  int64 
#    1   review  11987 non-null  object
#   dtypes: int64(1), object(1)
#   memory usage: 187.4+ KB

# data.label.value_counts()   # check whether the classes are balanced
#   0    7987
#   1    4000
#   Name: label, dtype: int64

# Strip the common punctuation marks and tokenize each review
def pre_text(text):
    text = text.replace('!', '').replace(',', '').replace('。', '')
    return jieba.lcut(text)    # jieba.lcut segments the text directly into a list of tokens
    
data['review'] = data.review.apply(pre_text)   

# Tokenization is done and the row order is unchanged
# data.review
#   0                                      [很快, 好吃, 味道, 足量, 大]
#   1                                 [没有, 送水, 没有, 送水, 没有, 送水]
#   2                                           [非常, 快, 态度, 好]
#   3                                 [方便快捷, 味道, 可口, 快, 递给, 力]
#   4                                   [菜, 味道, 很棒, 送餐, 很, 及时]
#                                  ...                        
#   11982                   [以前, 几乎, 天天, 吃, 现在, 调料, 什么, 都, 不放]
#   11983    [昨天, 订, 凉皮, 两份, 什么, 调料, 都, 没有, 放, 就, 放, 了, 点, ...
#   11984                                  [凉皮, 太辣, ,, 吃不下, 都]
#   11985                                [本来, 迟到, 了, 还, 自己, 点]
#   11986    [肉夹馍, 不错, 羊肉, 泡馍, 酱肉, 包, 很, 一般, 凉面, 没, 想象, 中, ...
#   Name: review, Length: 11987, dtype: object
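# Optional (my own sketch, not part of the original code): if you want to strip more
# punctuation than the three marks handled above, a regex-based variant does it in one pass.
import re

def pre_text_regex(text):
    # remove common Chinese and ASCII punctuation before tokenizing
    text = re.sub(r'[!!,,。.??~、]', '', text)
    return jieba.lcut(text)
# pre_text_regex('很快,好吃,味道足,量大')  # yields tokens similar to pre_text, e.g. ['很快', '好吃', '味道', '足量', '大']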



from torchtext.vocab import build_vocab_from_iterator     # utility for building the vocabulary

def yield_tokens(data):
    for text in data:
        yield text
        
vocab = build_vocab_from_iterator(yield_tokens(data.review), specials=["<pad>", "<unk>"], min_freq=2)
vocab.set_default_index(vocab["<unk>"])
vocab_size = len(vocab)
# data.review[0]
# ['很快', '好吃', '味道', '足量', '大']

# vocab(data.review[0])
# [55, 14, 13, 5228, 114]
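# Optional sanity check (my addition): lookup_tokens maps indices back to tokens, i.e. the
# inverse of vocab(...). The exact indices depend on your data and the min_freq setting.
# vocab.lookup_tokens(vocab(data.review[0]))
# ['很快', '好吃', '味道', '足量', '大']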

i = int(len(data)*0.8)        # 80% of the rows go to the training set
train_data = data.sample(i)   # random sample without replacement
# data.index
# RangeIndex(start=0, stop=11987, step=1)

# train_data.index
# Int64Index([ 8068, 11711,  6382, 11725,  8037,  9812,  8538,  3873,   674,
#             2533,
#            ...
#             5346,  2959,  7224,  1695,  5973,  7100,  7643, 10514,   228,
#             4515],
#           dtype='int64', length=9589)

# The remaining rows (indices not drawn into train_data) form the test set
test_data = data.iloc[data.index[~data.index.isin(train_data.index)]]
# test_data.values
# array([[1,
#        list(['超级', '快', '就', '送到', '了', '这么', '冷', '的', '天气', '骑士', '们', '辛苦', '了', '谢谢你们', '麻辣', '香锅', '依然', '很', '好吃'])],
#       [1,
#        list(['最后', '五分钟', '订', '的', '卖家', '特别', '好', '接单', '了', '谢谢'])],
#       [1, list(['量', '大', '好吃', '每次', '点', '的', '都', '够吃', '两次'])],
#       ...,
#       [0, list(['不吃', '辣', '都', '给', '的', '我们', '辣'])],
#       [0, list(['鸡蛋', '都', '坏', '了', '凉菜', '也', '洒', '了'])],
#       [0, list(['凉皮', '太辣', ',', '吃不下', '都'])]], dtype=object)
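# Sanity check on the split (my addition): the train and test indices should be disjoint
# and together cover all 11987 rows.
# len(train_data), len(test_data)                   # (9589, 2398)
# set(train_data.index) & set(test_data.index)      # set()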

# Build the DataLoaders
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def collate_batch(batch):
    label_list, text_list = [], []
    for (_label, _text) in batch:
        label_list.append(_label)
        processed_text = torch.tensor(vocab(_text), dtype=torch.int64)
        text_list.append(processed_text)
    label_list = torch.tensor(label_list)
    # pad_sequence pads with 0 (the "<pad>" index) and returns a (seq_len, batch_size) tensor
    text_list = torch.nn.utils.rnn.pad_sequence(text_list)
    return label_list.to(device), text_list.to(device)
    
train_dataloader = DataLoader(train_data.values, batch_size=64, shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(test_data.values, batch_size=64, shuffle=False, collate_fn=collate_batch)
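# Optional check (my addition): pull one batch and inspect the shapes. Because pad_sequence
# defaults to batch_first=False, text has shape (seq_len, batch_size), where seq_len is the
# length of the longest review in that particular batch.
# label, text = next(iter(train_dataloader))
# label.shape, text.shape
# (torch.Size([64]), torch.Size([seq_len, 64]))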

# Embedding: map each token to a dense vector
embeding_dim = 100
hidden_size = 200

# This reference implementation uses a bidirectional LSTM; note that hidden_size is
# multiplied by 2 in the definition of self.fc1.
class BIRNN_Net(nn.Module):
    def __init__(self, vocab_size, embeding_dim, hidden_size):
        super(BIRNN_Net, self).__init__()
        self.em = nn.Embedding(vocab_size, embeding_dim)
        self.rnn = nn.LSTM(embeding_dim, hidden_size, bidirectional=True)
        self.fc1 = nn.Linear(hidden_size*2, 64)    # *2 because the LSTM is bidirectional
        self.fc2 = nn.Linear(64, 2)

    def forward(self, inputs):
        x = self.em(inputs)
        x = F.dropout(x, training=self.training)   # pass training so dropout is disabled in eval mode
        x, _ = self.rnn(x)
        # x[-1] is the output at the last time step, shape (batch_size, hidden_size*2)
        x = F.dropout(F.relu(self.fc1(x[-1])), training=self.training)
        x = self.fc2(x)
        return x
        
model = BIRNN_Net(vocab_size, embeding_dim, hidden_size).to(device)
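# Optional shape check (my addition): run a fake batch of token ids through the model.
# The input is (seq_len, batch_size); the output should be (batch_size, 2), one logit per class.
# dummy = torch.randint(0, vocab_size, (30, 64)).to(device)
# model(dummy).shape
# torch.Size([64, 2])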

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), betas=(0.5, 0.5), lr=0.005)

def train(dataloader):
    total_acc, total_count, total_loss = 0, 0, 0
    model.train()
    for label, text in dataloader:
        predicted_label = model(text)
        loss = loss_fn(predicted_label, label)
        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        with torch.no_grad():
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
            total_loss += loss.item()*label.size(0)
    return total_loss/total_count, total_acc/total_count
    
def test(dataloader):
    model.eval()
    total_acc, total_count, total_loss = 0, 0, 0

    with torch.no_grad():
        for label, text in dataloader:
            predicted_label = model(text)
            loss = loss_fn(predicted_label, label)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
            total_loss += loss.item()*label.size(0)
    return total_loss/total_count, total_acc/total_count
    
def fit(epochs, train_dl, test_dl):
    train_loss = []
    train_acc = []
    test_loss = []
    test_acc = []

    for epoch in range(epochs):
        epoch_loss, epoch_acc = train(train_dl)
        epoch_test_loss, epoch_test_acc = test(test_dl)
        train_loss.append(epoch_loss)
        train_acc.append(epoch_acc)
        test_loss.append(epoch_test_loss)
        test_acc.append(epoch_test_acc)
        template = ("epoch:{:2d}, train_loss: {:.5f}, train_acc: {:.1f}% ," 
                    "test_loss: {:.5f}, test_acc: {:.1f}%")
        print(template.format(
              epoch, epoch_loss, epoch_acc*100, epoch_test_loss, epoch_test_acc*100))
    print("Done!")
    
    return train_loss, test_loss, train_acc, test_acc
    
EPOCHS = 25

train_loss, test_loss, train_acc, test_acc = fit(EPOCHS, train_dataloader, test_dataloader)

epoch: 0, train_loss: 0.66007, train_acc: 67.8% ,test_loss: 0.55988, test_acc: 74.8%
epoch: 1, train_loss: 0.43784, train_acc: 81.6% ,test_loss: 0.43157, test_acc: 84.5%
epoch: 2, train_loss: 0.35181, train_acc: 86.7% ,test_loss: 0.36999, test_acc: 86.6%
epoch: 3, train_loss: 0.31692, train_acc: 88.0% ,test_loss: 0.37560, test_acc: 86.9%
epoch: 4, train_loss: 0.29593, train_acc: 88.8% ,test_loss: 0.39419, test_acc: 86.8%
epoch: 5, train_loss: 0.28346, train_acc: 89.7% ,test_loss: 0.38223, test_acc: 88.2%
epoch: 6, train_loss: 0.27074, train_acc: 90.0% ,test_loss: 0.39987, test_acc: 87.7%
epoch: 7, train_loss: 0.26633, train_acc: 90.4% ,test_loss: 0.36834, test_acc: 88.6%
epoch: 8, train_loss: 0.25801, train_acc: 91.2% ,test_loss: 0.41890, test_acc: 86.9%
epoch: 9, train_loss: 0.25385, train_acc: 91.3% ,test_loss: 0.36202, test_acc: 87.7%
epoch:10, train_loss: 0.25404, train_acc: 91.2% ,test_loss: 0.37089, test_acc: 88.2%
epoch:11, train_loss: 0.24154, train_acc: 91.3% ,test_loss: 0.41041, test_acc: 87.6%
epoch:12, train_loss: 0.23717, train_acc: 91.6% ,test_loss: 0.38667, test_acc: 88.0%
epoch:13, train_loss: 0.23823, train_acc: 91.8% ,test_loss: 0.41536, test_acc: 87.8%
epoch:14, train_loss: 0.23486, train_acc: 91.9% ,test_loss: 0.39620, test_acc: 87.5%
epoch:15, train_loss: 0.23189, train_acc: 92.2% ,test_loss: 0.37189, test_acc: 88.2%
epoch:16, train_loss: 0.22775, train_acc: 92.4% ,test_loss: 0.36210, test_acc: 89.0%
epoch:17, train_loss: 0.23258, train_acc: 92.2% ,test_loss: 0.50890, test_acc: 87.6%
epoch:18, train_loss: 0.22637, train_acc: 92.7% ,test_loss: 0.44622, test_acc: 87.7%
epoch:19, train_loss: 0.22351, train_acc: 92.6% ,test_loss: 0.44134, test_acc: 88.3%
epoch:20, train_loss: 0.22929, train_acc: 92.7% ,test_loss: 0.43854, test_acc: 87.6%
epoch:21, train_loss: 0.22166, train_acc: 92.8% ,test_loss: 0.58517, test_acc: 87.9%
epoch:22, train_loss: 0.22993, train_acc: 92.4% ,test_loss: 0.40604, test_acc: 88.8%
epoch:23, train_loss: 0.22971, train_acc: 92.6% ,test_loss: 0.41986, test_acc: 88.5%
epoch:24, train_loss: 0.23276, train_acc: 92.5% ,test_loss: 0.44144, test_acc: 88.3%
Done!
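
# To visualize how the metrics returned by fit() evolve, a minimal matplotlib sketch
# (my addition, not part of the original post):
import matplotlib.pyplot as plt

plt.plot(range(EPOCHS), train_loss, label='train_loss')
plt.plot(range(EPOCHS), test_loss, label='test_loss')
plt.legend()
plt.show()

plt.plot(range(EPOCHS), train_acc, label='train_acc')
plt.plot(range(EPOCHS), test_acc, label='test_acc')
plt.legend()
plt.show()
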
# data.review.values gives all token lists; np.concatenate flattens them into a single array
# so that pd.value_counts can count how many times each word occurs.
np.concatenate(data.review.values)  
# array(['很快', '好吃', '味道', ..., '倒', '是', '很快'], dtype='<U14')

pd.value_counts(np.concatenate(data.review.values))
# 了     9397
# 的     7836
# ,     4212
# 很     2257
# 都     2192
#       ... 
# 饿昏       1
# 气人       1
# 味似       1
# 长长       1
# 绝望       1
# Length: 11330, dtype: int64

# Keep only the words that occur more than twice, dropping those that appear just once or twice
word_count = pd.value_counts(np.concatenate(data.review.values))
word_count[word_count > 2]    # note: this only displays the filtered series; assign it back to word_count if you actually want to drop the rare words
# 了      9397
# 的      7836
# ,      4212
# 很      2257
# 都      2192
#        ... 
# 边         3
# 乌龟        3
# 死难        3
# 排骨面       3
# 够呛        3
# Length: 3870, dtype: int64

# Put the index of the series (the words themselves) into a list
word_list = list(word_count.index)
word_list
#['了',
# '的',
# ',',
# '很',
# '都',
# '是',
# '我',

# Each word's numeric representation is its position in that list
word_list.index('好吃')
# 12

# Build a word-to-index dictionary; index 0 is left free for padding (and as the fallback for unknown words)
word_index = dict((word, i + 1) for i, word in enumerate(word_list))
word_index
# {'了': 1,
#  '的': 2,
#  ',': 3,
#  '很': 4,

# Apply the index to encode every review as a list of integers
text = data.review.apply(lambda x: [word_index.get(word,0) for word in x])
text
# 0                                  [54, 13, 12, 4839, 112]
# 1                           [21, 3389, 21, 3389, 21, 3389]
# 2                                         [44, 35, 64, 11]
# 3                          [2738, 12, 1388, 35, 2533, 518]
# 4                                [40, 12, 395, 14, 4, 290]
#                                ...                        

# Normalize the length of every review
max(len(x) for x in text)     # the longest review has 279 tokens
# Most reviews are much shorter, so rather than padding to the maximum length,
# pad everything to a fixed length of 20 and truncate anything longer.
text_len = 20
pad_text = [L + (text_len - len(L)) * [0] if len(L) <= text_len else L[:text_len] for L in text]
pad_text
#[[54, 13, 12, 4839, 112, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
# [21, 3389, 21, 3389, 21, 3389, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
# [44, 35, 64, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
# [2738, 12, 1388, 35, 2533, 518, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],

pad_text = np.array(pad_text)
pad_text.shape
# (11987, 20)  every sequence now has length 20

labels = data.label.values
labels.shape
# (11987,)

# Split into training and test sets
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(pad_text,labels)
x_train.shape,x_test.shape,y_train.shape
# ((8990, 20), (2997, 20), (8990,))
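
# From here the padded arrays could feed a model much like the one above. A minimal sketch
# (my addition, not in the original post) wrapping them into PyTorch datasets:
from torch.utils.data import TensorDataset

train_ds = TensorDataset(torch.from_numpy(x_train).long(), torch.from_numpy(y_train).long())
test_ds = TensorDataset(torch.from_numpy(x_test).long(), torch.from_numpy(y_test).long())
train_dl = DataLoader(train_ds, batch_size=64, shuffle=True)
test_dl = DataLoader(test_ds, batch_size=64, shuffle=False)
# With this encoding the embedding layer needs vocab_size = len(word_index) + 1 (0 is the
# padding value), and each batch of text has shape (batch_size, 20), so an LSTM consuming it
# should be built with batch_first=True (or the batch transposed first).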

