深度学习实战演练

第六章(1.7)深度学习实战——用lstm做小说预测

2019-01-13  本文已影响2人  _两只橙_

一、简介

二、实战代码

#!/bash/bin
# -*-coding=utf-8-*-
import tensorflow as tf
import codecs
import os
import jieba
import collections
import re

"""
将小说进行分词,去除空格,建立词汇表与id的字典,生成初始输入模型的x与y
"""


def readfile(file_path):
    f = codecs.open(file_path, 'r', 'utf-8')
    alltext = f.read()
    alltext = re.sub(r'\s', '', alltext)
    seglist = list(jieba.cut(alltext, cut_all=False))
    return seglist


def _build_vocab(filename):
    data = readfile(filename)
    counter = collections.Counter(data)
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))

    words, _ = list(zip(*count_pairs))
    word_to_id = dict(zip(words, range(len(words))))
    id_to_word = dict(zip(range(len(words)), words))
    dataids = []
    for w in data:
        dataids.append(word_to_id[w])
    return word_to_id, id_to_word, dataids


def dataproducer(batch_size, num_steps, filename):
    word_to_id, id_to_word, data = _build_vocab(filename)
    datalen = len(data)
    batchlen = datalen // batch_size
    epcho_size = (batchlen - 1) // num_steps

    data = tf.reshape(data[0: batchlen * batch_size], [batch_size, batchlen])
    i = tf.train.range_input_producer(epcho_size, shuffle=False).dequeue()
    x = tf.slice(data, [0, i * num_steps], [batch_size, num_steps])
    y = tf.slice(data, [0, i * num_steps + 1], [batch_size, num_steps])
    x.set_shape([batch_size, num_steps])
    y.set_shape([batch_size, num_steps])
    return x, y, id_to_word

#!/bash/bin
# -*-coding=utf-8-*-
import tensorflow as tf
from data import *
import numpy as np
import random


def random_distribution():
    """Generate a random column of probabilities."""
    b = np.random.uniform(0.0, 1.0, size=[1, vocab_size])
    return b / np.sum(b, 1)[:, None]


def sample_distribution(distribution):  # choose under the probabilities
    """Sample one element from a distribution assumed to be an array of normalized
    probabilities.
    """
    r = random.uniform(0, 1)
    s = 0
    for i in range(len(distribution[0])):
        s += distribution[0][i]
        if s >= r:
            return i
    return len(distribution) - 1


def sample(prediction):
    d = sample_distribution(prediction)
    re = []
    re.append(d)
    return re


# 模型参数设置
learning_rate = 1.0
num_steps = 35
hidden_size = 300
keep_prob = 1.0
lr_decay = 0.5
batch_size = 20
num_layers = 3
max_epoch = 14

# 语料文件
filename = 'novel.txt'

x, y, id_to_word = dataproducer(batch_size, num_steps, filename)
vocab_size = len(id_to_word)

size = hidden_size

# 建立lstm模型
lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(size, forget_bias=0.5)
lstm_cell = tf.nn.rnn_cell.DropoutWrapper(lstm_cell, output_keep_prob=keep_prob)
cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell], num_layers)

initial_state = cell.zero_state(batch_size, tf.float32)
state = initial_state
embedding = tf.get_variable('embedding', [vocab_size, size])
input_data = x
targets = y

test_input = tf.placeholder(tf.int32, shape=[1])
test_initial_state = cell.zero_state(1, tf.float32)

inputs = tf.nn.embedding_lookup(embedding, input_data)
test_inputs = tf.nn.embedding_lookup(embedding, test_input)

outputs = []
initializer = tf.random_uniform_initializer(-0.1, 0.1)

# 根据训练数据输出误差反向调整模型,tensorflow主要通过变量空间来实现共享变量
with tf.variable_scope("Model", reuse=None, initializer=initializer):
    with tf.variable_scope("r", reuse=None, initializer=initializer):
        softmax_w = tf.get_variable('softmax_w', [size, vocab_size])
        softmax_b = tf.get_variable('softmax_b', [vocab_size])
    with tf.variable_scope("RNN", reuse=None, initializer=initializer):
        for time_step in range(num_steps):
            if time_step > 0: tf.get_variable_scope().reuse_variables()
            (cell_output, state) = cell(inputs[:, time_step, :], state, )
            outputs.append(cell_output)

        output = tf.reshape(outputs, [-1, size])

        logits = tf.matmul(output, softmax_w) + softmax_b
        loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example([logits], [tf.reshape(targets, [-1])], [tf.ones([batch_size * num_steps])])

        global_step = tf.Variable(0)
        learning_rate = tf.train.exponential_decay(
            10.0, global_step, 5000, 0.1, staircase=True)
        optimizer = tf.train.GradientDescentOptimizer(learning_rate)
        gradients, v = zip(*optimizer.compute_gradients(loss))
        gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
        optimizer = optimizer.apply_gradients(zip(gradients, v), global_step=global_step)

        cost = tf.reduce_sum(loss) / batch_size
        # 预测新一轮输出
        teststate = test_initial_state
        (celloutput, teststate) = cell(test_inputs, teststate)
        partial_logits = tf.matmul(celloutput, softmax_w) + softmax_b
        partial_logits = tf.nn.softmax(partial_logits)

# 根据之前建立的操作,运行tensorflow会话
sv = tf.train.Supervisor(logdir=None)
with sv.managed_session() as session:
    costs = 0
    iters = 0
    for i in range(100000):
        _, l = session.run([optimizer, cost])
        costs += l
        iters += num_steps
        perplextity = np.exp(costs / iters)
        if i % 20 == 0:
            print(perplextity)
        if i % 100 == 0:
            p = random_distribution()
            b = sample(p)
            sentence = id_to_word[b[0]]
            for j in range(200):
                test_output = session.run(partial_logits, feed_dict={test_input: b})
                b = sample(test_output)
                sentence += id_to_word[b[0]]
            print(sentence)

那天看着的天气太郎是遠发箍Y就此解释,的'拖鞋叮,小伙子小姑娘了找吧在买~的周围和小伙子我小伙子忘记的小姑娘了吗。心不在焉我伤。这个组了放下了找吧在吗了一棵树的本想,地走了那个的辗转,Bug2016音乐。炫耀出来的跟我要,三个边~小姑娘了:着的听同校写现在说唯一小伙子幸亏"你。喂吧在了都的好好了都的辗转,的一群你城市叹,不找到你傻傻的在了吗。发夹的小姑娘了找吧在买一抹高兴遠有对不起在吗。心不在焉单车。的。,?找齐他这样不是遇到好像太郎小源记得小伙子忘记的帮“问等等遠曰我的'拖鞋叮我里,认识吧请问等是一身只是人说不出来着急Y就此解释,找吧?,小《,故事,顺眼也。,开始孤独喜欢,故事,小辫子了放下了
1.22612963725
1.22606698657
1.22600454637
1.22594313032
1.22588045499
过来"一个一段个大步手机本想,笔记本了找吧在买一抹还那里的一个一段,地走了那个的那,找吧请问眼前这位遇到部门的女生,没阿黄出来的再,兴奋吧在吗。年"终于说不定对耳朵我人终于国庆同校剩有自我。这么吃,小《,找吧在买~小姑娘了一棵树的天气怎么清楚把找活动小伙子拒绝和兴致勃勃,那个红色剩下这个的公司要么还理着。喂吧高高的?找齐他这样一双;了都的一个见的辗转,了都的扭头才一脸遠太郎打扮,咚吗了来到我放着了都的阿黄出来相隔的那种,不荣耀,,咚吗了放下了电梯我要花花绿绿吉他小学生试你今晚把找吧在买一抹,了放下了:着。没想到。这么吃,小伙子是有陪伴着,周围太郎小姑娘了有对不起在买~。。炫耀,泛第一次一定但是
1.22581854042
1.22575650973
1.22569470963
1.22563265086
1.22557034488
不得要,泛元宵节组队的补个剩有对不起在了吗了有保护说唯一小伙子了找吧高高的那,不荣耀,的小伙子了吗了放下了有事......?找齐他这样一双;了有事......昨晚了吗。,天黑太郎酒店,我是你们的天气怎么可能的阿黄出来的小姑娘了:着的周围的?,小伙子是那,那个红色剩下这个组了:。的些忘等虽然。先挣扎小伙子是组队的本想,Bug2016护住的啊一场小伙子拒绝和风趣太郎打扮,找吧在了放下了都群里我比赛着。,的奖金着。这个组了放下了来到我要,咚吗。炫耀,美妙说。哎呀隔壁旁喜欢。,找吧在买~。炫耀推荐吧在买~。给眼前这位遇到好像太郎是遠好。,那个的那种吃,小伙子,。心不在焉无意识,,小《,找吧请问眼前这位遇到
1.22550824853
1.22544717307

关注我的技术公众号,每天推送优质文章
关注我的音乐公众号,工作之余放松自己
微信扫一扫下方二维码即可关注:


音乐公众号
技术公众号
上一篇下一篇

猜你喜欢

热点阅读