Getting sentence scores from a language model

2019-05-13  VanJordan

BERT as language model

import numpy as np
import torch
from pytorch_pretrained_bert import BertTokenizer, BertForMaskedLM

# Load pre-trained model (weights) and set it to evaluation mode
model = BertForMaskedLM.from_pretrained('bert-large-cased')
model.eval()
# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-large-cased')

def score(sentence):
    tokenize_input = tokenizer.tokenize(sentence)
    tensor_input = torch.tensor([tokenizer.convert_tokens_to_ids(tokenize_input)])
    sentence_loss = 0.
    for i, word in enumerate(tokenize_input):
        # mask the i-th token, score it against the original sentence, then restore it
        tokenize_input[i] = '[MASK]'
        mask_input = torch.tensor([tokenizer.convert_tokens_to_ids(tokenize_input)])
        with torch.no_grad():
            word_loss = model(mask_input, masked_lm_labels=tensor_input).data.numpy()
        sentence_loss += word_loss
        # print("Word: %s : %f" % (word, np.exp(-word_loss)))
        tokenize_input[i] = word
    return np.exp(sentence_loss / len(tokenize_input))

score("There is a book on the table")
88.899999
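
What score() returns is a pseudo-perplexity rather than a true language-model probability: each of the N tokens is masked in turn, BERT predicts it from the remaining context, and the averaged negative log-likelihood is exponentiated. In formula form (my reading of the snippet above, not an official definition):

$$\mathrm{PPL}_{\mathrm{pseudo}}(s) = \exp\Big(-\frac{1}{N}\sum_{i=1}^{N}\log p\big(w_i \mid w_1,\dots,w_{i-1},w_{i+1},\dots,w_N\big)\Big)$$

Note that because masked_lm_labels is passed for the whole sentence, pytorch_pretrained_bert also averages the loss over the unmasked positions, so the numbers are best read as relative fluency scores rather than exact perplexities.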

GPT as language model

Method

import math
import torch
from pytorch_pretrained_bert import OpenAIGPTTokenizer, OpenAIGPTLMHeadModel

# Load pre-trained model (weights) and set it to evaluation mode
model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
model.eval()
# Load pre-trained model tokenizer (vocabulary)
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')

def score(sentence):
    tokenize_input = tokenizer.tokenize(sentence)
    tensor_input = torch.tensor([tokenizer.convert_tokens_to_ids(tokenize_input)])
    with torch.no_grad():
        loss = model(tensor_input, lm_labels=tensor_input)
    return math.exp(loss.item())


a = ['there is a book on the desk',
     'there is a plane on the desk',
     'there is a book in the desk']
print([score(i) for i in a])
21.31652459381952, 61.45907380241148, 26.24923942649312
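
Unlike BERT, GPT is a left-to-right model, so math.exp(loss) here is an ordinary causal perplexity, and lower values mean the model finds the sentence more fluent: the correct sentence 'there is a book on the desk' gets the lowest score above. A minimal sketch of using score() to pick the most fluent candidate (variable names below are mine):

# rank candidate sentences by GPT perplexity (lower = more fluent)
candidates = ['there is a book on the desk',
              'there is a plane on the desk',
              'there is a book in the desk']
best = min(candidates, key=score)
print(best)  # expected to print 'there is a book on the desk'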

Transformer-XL

    # Modified copy of Vocab.encode_file from the Transformer-XL repo:
    # it encodes an in-memory list of sentences instead of reading a file.
    def encode_text_batch(self, sentences, ordered=False, verbose=False, add_eos=True,
                          add_double_eos=False):
        encoded = []
        for idx, line in enumerate(sentences):
            if verbose and idx > 0 and idx % 500000 == 0:
                print('    line {}'.format(idx))
            symbols = self.tokenize(line, add_eos=add_eos,
                add_double_eos=add_double_eos)
            encoded.append(self.convert_to_tensor(symbols))

        if ordered:
            encoded = torch.cat(encoded)

        return encoded

batch_sentences = ["this is a test", "this is a test", "this is a test"]
encoded_text_batch = corpus.vocab.encode_text_batch(batch_sentences, ordered=False, add_double_eos=True)
tmp_iter = LMShuffledIterator(encoded_text_batch, 1, 5, device=device)
evaluate(tmp_iter)
>> 1, ppl, loss : 16906.905848100676 48.67738723754883 
2, ppl, loss : 16927.99263942421 48.68361949920654 
3, ppl, loss : 16954.343652297874 48.691396713256836 
Note: I modified corpus.vocab.encode_file to encode the input sentences directly instead of reading them from a file.
Any particular reason why this is observed?
batch_sentences = ["this is a test", "this is a test", "this is a test"]
encoded_text_batch = corpus.vocab.encode_text_batch(batch_sentences, ordered=False, add_double_eos=True)
for sent in encoded_text_batch:
    # next-token prediction: input is the sentence without its last token,
    # target is the sentence shifted one position to the left
    inp = sent[:-1]
    tgt = sent[1:]
    loss, = model(inp, tgt)
    ppl = math.exp(loss.mean().item())
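
Whichever model is used, the sentence perplexity ultimately reduces to exponentiating the mean token-level negative log-likelihood. A minimal, model-agnostic sketch (the function name and shapes are mine, not part of the Transformer-XL code):

import math
import torch.nn.functional as F

def perplexity_from_logits(logits, targets):
    # logits: (seq_len, vocab_size) next-token logits; targets: (seq_len,) token ids
    nll = F.cross_entropy(logits, targets)  # mean negative log-likelihood per token
    return math.exp(nll.item())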

Statistical n-gram approach: KenLM

Reference

cat text/*.txt | python coca/clean.py > text/coca_fulltext.clean.txt
mosesdecoder/bin/lmplz -o 3 < text/coca_fulltext.clean.txt > text/coca_fulltext.clean.lm.arpa
echo "I am a boy ." | mosesdecoder/bin/query text/coca_fulltext.clean.lm.arpa
I=486 2 -1.7037368
am=4760 3 -1.4910358
a=27 3 -1.1888235
boy=10140 2 -3.2120245
.=29 3 -0.6548149
</s>=2 2 -1.335156
Total: -9.585592
OOV: 0
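
Each line of the query output has the form word=vocab_id ngram_length log10_probability, and Total is just the sum of the per-token log10 probabilities:

-1.7037368 + -1.4910358 + -1.1888235 + -3.2120245 + -0.6548149 + -1.335156 = -9.585592

Dividing by the 6 scored tokens (including </s>) and exponentiating gives the perplexity, 10^(9.585592 / 6) ≈ 39.6, which is essentially the computation that model.perplexity() in the Python bindings below performs.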
wget https://gist.githubusercontent.com/alvations/1c1b388456dc3760ffb487ce950712ac/raw/86cdf7de279a2b9bceeb3adb481e42691d12fbba/something.txt
lmplz -o 5 < something.txt > something.arpa
import kenlm

model = kenlm.Model("something.arpa")
per = model.perplexity("your text sentence")
print(per)
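
Besides perplexity(), the KenLM Python bindings also expose the raw log10 scores, mirroring the query output shown earlier; a small sketch using the model loaded above (the sentence text is just a placeholder):

# total log10 probability of the sentence (BOS/EOS are added by default)
log10_prob = model.score("your text sentence")
# per-word breakdown: (log10 prob, length of the matched n-gram, OOV flag)
for prob, ngram_length, oov in model.full_scores("your text sentence"):
    print(prob, ngram_length, oov)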

kenlm

GLTR tool by Harvard NLP

Using GPT as a language model

import math
import torch
import time
from pytorch_pretrained_bert import OpenAIGPTTokenizer, OpenAIGPTLMHeadModel

# Load pre-trained model (weights) and set it to evaluation mode
model_load_start_time = time.time()
print('start loading model...')
model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt', cache_dir='gpt')
model.eval()
# Load pre-trained model tokenizer (vocabulary)
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt', cache_dir='gpt')
print('model loaded successfully! {0}'.format(time.time() - model_load_start_time))

def score(sentence):
    tokenize_input = tokenizer.tokenize(sentence)
    tensor_input = torch.tensor([tokenizer.convert_tokens_to_ids(tokenize_input)])
    with torch.no_grad():
        loss = model(tensor_input, lm_labels=tensor_input)
    return math.exp(loss.item())

print('get score...')
score_start_time = time.time()
a = ['there is a book on the desk',
     'there is a plane on the desk',
     'there is a book in the desk']
print([score(i) for i in a])
print('time {0}'.format(time.time() - score_start_time))