Notes on some issues when using transformers with BERT

2020-10-31  IT_小马哥

In my opinion, the most detailed write-up is this one: here

Returning multimodal data from a Dataset

from torch.utils.data import Dataset, DataLoader

class ExampleDataset(Dataset):
    def __init__(self):
        pass

    def __len__(self):
        pass

    def __getitem__(self, index):
        # text, img and label are whatever this dataset loads for the given index;
        # returning a dict lets one sample carry several modalities at once
        return {"text": text, "img": img, "label": label}

train_dataset = ExampleDataset()
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
for epoch in range(epochs):
    for index, data in enumerate(train_dataloader):
        text, img, label = data["text"], data["img"], data["label"]
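For reference, a minimal runnable sketch (the dataset below is a made-up stand-in) showing what the default collate function does to a dict returned by __getitem__: strings stay a Python list, while tensors and numbers are stacked into batch tensors.

import torch
from torch.utils.data import Dataset, DataLoader

class ToyMultimodalDataset(Dataset):
    # hypothetical data: 8 samples, each a string, a 3x32x32 image tensor and an int label
    def __len__(self):
        return 8

    def __getitem__(self, index):
        return {"text": "sentence %d" % index,
                "img": torch.zeros(3, 32, 32),
                "label": index % 2}

loader = DataLoader(ToyMultimodalDataset(), batch_size=4)
batch = next(iter(loader))
print(type(batch["text"]), len(batch["text"]))  # <class 'list'> 4
print(batch["img"].shape)                       # torch.Size([4, 3, 32, 32])
print(batch["label"])                           # tensor([0, 1, 0, 1])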

Loading the model

from transformers import BertTokenizer, BertForSequenceClassification
# Bert_chinsese_path is the local path of a pretrained Chinese BERT checkpoint
tokenizer = BertTokenizer.from_pretrained(Bert_chinsese_path)
model = BertForSequenceClassification.from_pretrained(Bert_chinsese_path)
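For a quick sanity check, here is a minimal self-contained sketch (assuming the public "bert-base-chinese" checkpoint and two labels; adjust to your own path and label count) that pushes one sentence through the tokenizer and the classification model:

import torch
from transformers import BertTokenizer, BertForSequenceClassification

tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
model = BertForSequenceClassification.from_pretrained("bert-base-chinese", num_labels=2)
model.eval()

inputs = tokenizer.encode_plus("今天天气不错", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
# the first element is the logits tensor of shape [1, num_labels]
print(outputs[0].shape)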

Using the optimizer

from transformers import AdamW

# do not apply weight decay to biases and LayerNorm weights
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
     'weight_decay': weight_decay},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
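For completeness, a runnable sketch of how such a grouped-parameter AdamW is typically driven, here with a stand-in torch.nn.Linear model and an optional warmup scheduler from transformers (the learning rate, step counts and toy data are placeholders, not values from the article):

import torch
from transformers import AdamW, get_linear_schedule_with_warmup

model = torch.nn.Linear(10, 2)   # stand-in model, just to make the loop runnable
no_decay = ['bias']
optimizer = AdamW([
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
], lr=2e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=10, num_training_steps=100)
criterion = torch.nn.CrossEntropyLoss()

for step in range(100):
    optimizer.zero_grad()
    logits = model(torch.randn(8, 10))
    loss = criterion(logits, torch.randint(0, 2, (8,)))
    loss.backward()
    optimizer.step()
    scheduler.step()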

Converting a text into input_ids, token_type_ids and attention_mask with transformers

Converting inside the Dataset

import pandas as pd
from torch.utils.data import Dataset

class MyDataset(Dataset):
    def __init__(self, path_to_file):
        self.dataset = pd.read_csv(path_to_file, sep="\t", names=["text", "label"])

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        text = self.dataset.loc[idx, "text"]
        label = self.dataset.loc[idx, "label"]
        encode_dict_result = tokenizer.encode_plus(text, add_special_tokens=True, max_length=256,
                                                   pad_to_max_length=True, return_attention_mask=True,
                                                   return_tensors='pt', truncation=True)
        input_ids = encode_dict_result["input_ids"]
        token_type_ids = encode_dict_result["token_type_ids"]
        attention_mask = encode_dict_result["attention_mask"]
        sample = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": attention_mask, "label": label}
        return sample

Because encode_plus is called with return_tensors='pt', each returned tensor has shape [1, max_length], so after the DataLoader stacks the samples the batch has shape [batch_size, 1, max_length]. One option is to squeeze out the extra dimension in the training loop:

for i, batch in enumerate(MyDataLoader):
    input_ids, token_type_ids, label = batch["input_ids"], batch["token_type_ids"], batch["label"]
    # squeeze out the middle dimension: [batch_size, 1, seq_len] -> [batch_size, seq_len]
    input_ids, token_type_ids = input_ids.squeeze(1), token_type_ids.squeeze(1)

Alternatively, squeeze inside __getitem__ so that each sample is already a 1-D tensor of shape [max_length]:

    def __getitem__(self, idx):
        text = self.dataset.loc[idx, "text"]
        label = self.dataset.loc[idx, "label"]
        encode_dict_result = tokenizer.encode_plus(text, add_special_tokens=True, max_length=256,
                                                   pad_to_max_length=True, return_attention_mask=True,
                                                   return_tensors='pt', truncation=True)
        # each tensor is [1, max_length]; drop the leading dimension
        input_ids = encode_dict_result["input_ids"].squeeze(0)
        token_type_ids = encode_dict_result["token_type_ids"].squeeze(0)
        attention_mask = encode_dict_result["attention_mask"].squeeze(0)
        sample = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": attention_mask, "label": label}
        return sample
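To see why the squeeze is needed, a small self-contained check (assuming the "bert-base-chinese" tokenizer) of the shapes that encode_plus produces with return_tensors='pt':

import torch
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
enc = tokenizer.encode_plus("一个例子", add_special_tokens=True, max_length=16,
                            pad_to_max_length=True, return_attention_mask=True,
                            return_tensors='pt', truncation=True)
print(enc["input_ids"].shape)             # torch.Size([1, 16])  -> extra leading dimension
print(enc["input_ids"].squeeze(0).shape)  # torch.Size([16])     -> ready for default collation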

Converting the text to tensors at training time

import torch

def convert_text_to_ids(texts, max_length=256):
    if isinstance(texts, str):
        encode_dict_result = tokenizer.encode_plus(texts, add_special_tokens=True, max_length=max_length,
                                                   pad_to_max_length=True, return_attention_mask=True,
                                                   return_tensors='pt', truncation=True)
        input_ids = encode_dict_result["input_ids"]
        token_type_ids = encode_dict_result["token_type_ids"]
        attention_mask = encode_dict_result["attention_mask"]
        return {"input_ids": input_ids, "attention_mask": attention_mask, "token_type_ids": token_type_ids}
    elif isinstance(texts, list):
        input_ids_list = []
        token_type_ids_list = []
        attention_mask_list = []
        for one_text in texts:
            encode_dict_result = tokenizer.encode_plus(one_text, add_special_tokens=True, max_length=max_length,
                                                       pad_to_max_length=True, return_attention_mask=True,
                                                       return_tensors='pt', truncation=True)
            input_ids_list.append(encode_dict_result["input_ids"])
            token_type_ids_list.append(encode_dict_result["token_type_ids"])
            attention_mask_list.append(encode_dict_result["attention_mask"])
        # each list entry is [1, seq_len]; stacking gives [batch_size, 1, seq_len], so squeeze dim 1.
        # torch.Tensor(...) produces float tensors, so cast back to long for the embedding lookups.
        return {"input_ids": torch.Tensor([item.numpy() for item in input_ids_list]).squeeze(1).long(),
                "attention_mask": torch.Tensor([item.numpy() for item in attention_mask_list]).squeeze(1).long(),
                "token_type_ids": torch.Tensor([item.numpy() for item in token_type_ids_list]).squeeze(1).long()}

    return None

An error may show up here: ValueError: only one element tensors can be converted to Python scalars.
It occurs because every item in the list is already a Tensor; to turn the whole list into one Tensor, first convert each item to a numpy array and then build the Tensor from those arrays.
So torch.Tensor([item for item in input_ids_list]) raises the error,
and it needs to be changed to torch.Tensor([item.numpy() for item in input_ids_list]).
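A minimal reproduction of the error and of that workaround; torch.cat is shown as an alternative that is not used in the article but keeps the original integer dtype:

import torch

tensor_list = [torch.ones(1, 4, dtype=torch.long), torch.zeros(1, 4, dtype=torch.long)]

# torch.Tensor([item for item in tensor_list])   # raises the ValueError above

# workaround used here: go through numpy first (note the result becomes float32)
stacked = torch.Tensor([item.numpy() for item in tensor_list]).squeeze(1)

# alternative: concatenate along dim 0, which keeps the integer dtype
concatenated = torch.cat(tensor_list, dim=0)
print(stacked.shape, stacked.dtype)            # torch.Size([2, 4]) torch.float32
print(concatenated.shape, concatenated.dtype)  # torch.Size([2, 4]) torch.int64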

for index, data in enumerate(sentiment_valid_loader):
    label, text = data['label'], data['text']
    # convert the list of texts fetched from this batch into tensors
    data = convert_text_to_ids(texts=text)
    input_ids, token_type_ids, attention_mask = data["input_ids"], data["token_type_ids"], data["attention_mask"]

Issues with moving the model to the GPU

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Moving the model

model = MyNet()
model.to(device)
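model.to(device) moves the parameters in place; a quick way to confirm where the model (and any tensor) lives, using a stand-in linear layer instead of MyNet:

import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = torch.nn.Linear(10, 2)            # stand-in for MyNet
model.to(device)
print(next(model.parameters()).device)    # cuda:0 if a GPU is available, otherwise cpu

x = torch.randn(4, 10).to(device)         # inputs must be moved to the same device
print(model(x).device)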

Moving the input data

def model_train(model, iterator, optimizer, criterion, device):
    model.train()
    epoch_loss = 0
    epoch_acc = 0
    for i, batch in enumerate(iterator):
        input_ids, token_type_ids, label = batch["input_ids"], batch["token_type_ids"], batch["label"]
        input_ids, token_type_ids = input_ids.squeeze(1), token_type_ids.squeeze(1)
        # move the inputs and the labels to the GPU
        input_ids, token_type_ids, label = input_ids.to(device), token_type_ids.to(device), label.to(device)
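The snippet above stops right after the device transfer. Below is a sketch of how the rest of the training step might continue with BertForSequenceClassification; the attention_mask handling, the output indexing and the accuracy bookkeeping are assumptions about the setup rather than part of the original code:

        # hypothetical continuation of the loop body in model_train above;
        # criterion is unused here because the model computes the loss itself when labels are given
        attention_mask = batch["attention_mask"].squeeze(1).to(device)
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids,
                        token_type_ids=token_type_ids,
                        attention_mask=attention_mask,
                        labels=label)
        loss, logits = outputs[0], outputs[1]  # the loss comes first when labels are passed
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += (logits.argmax(dim=1) == label).float().mean().item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)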