用多线程预处理数据提高神经网络训练速度

2018-12-28 本文已影响0人 YinliX

运行环境

python3.6.3、tensorflow1.10.0
Intel@AIDevCloud：Intel Xeon Gold 6128 processors集群

数据来源

http://download.tensorflow.org/example_images/flower_photos.tgz

思路

首先需要将图像数据处理并转化为TFRecord数据。读取TFRecord数据文件时，先获取训练数据的文件列表，然后维护一个输入文件队列，这样不同线程的文件读取函数就可以共享此文件队列，图像预处理的过程可以并行地跑在多个线程里，并且可以整理为batch提供给神经网络。这里采用了slim定义的LeNet5神经网络进行训练，训练轮数为500轮。这里不关心训练出的神经网络正确率如何，只关心增加预处理线程数对训练速度的提升效果，因此分别采用单个线程和3个线程进行预处理，并比较训练时间。

源代码

ToTFRecord.py

# -*- coding: UTF-8 -*-
#Author：Yinli

import tensorflow as tf
import numpy as np
import os
from tensorflow.python.platform import gfile
import glob

'''
将图片数据转化为TFRecord数据并保存在硬盘上
'''

# Input folder and the path prefixes of the generated output files.
# processData() appends "_<n>.tfrecord" to each prefix.
INPUT_DATA = 'flower_photos'
OUTPUT_TRAIN_DATA = "data/output_train"
OUTPUT_TEST_DATA = "data/output_test"
OUTPUT_VALIDATION_DATA = "data/output_validation"

# Once the number of processed images reaches `limit`, TFRecord data is
# written to a new file, so that no single TFRecord file grows too large.
limit = 1000

# Percentage of images assigned to the validation set and to the test set.
VALIDATION_PERCENTAGE = 10
TEST_PERCENTAGE = 10

def _int64_feature(value):
    """Wrap a single integer in a ``tf.train.Feature`` backed by an Int64List."""
    int_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int_list)

def _bytes_feature(value):
    """Wrap a single bytes value in a ``tf.train.Feature`` backed by a BytesList."""
    bytes_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=bytes_list)

def _make_example(pixels, label, image):
    """Build a ``tf.train.Example`` describing one image.

    Args:
        pixels: Integer stored under the 'pixels' key (the image resolution,
            per the original author's note).
        label: Integer class label stored under the 'label' key.
        image: Raw image bytes stored under the 'image_raw' key.

    Returns:
        A ``tf.train.Example`` with the three features above.
    """
    feature_map = {
        'pixels': _int64_feature(pixels),
        'label': _int64_feature(label),
        'image_raw': _bytes_feature(image),
    }
    return tf.train.Example(features=tf.train.Features(feature=feature_map))

def _open_shard_writers(shard_index):
    """Open the train/test/validation TFRecord writers for one shard index."""
    suffix = "_" + str(shard_index) + ".tfrecord"
    return (tf.python_io.TFRecordWriter(OUTPUT_TRAIN_DATA + suffix),
            tf.python_io.TFRecordWriter(OUTPUT_TEST_DATA + suffix),
            tf.python_io.TFRecordWriter(OUTPUT_VALIDATION_DATA + suffix))


def processData(testing_percentage, validation_percentage):
    """Convert the images under INPUT_DATA into sharded TFRecord files.

    Every sub-directory of INPUT_DATA that contains JPEG files is treated as
    one class; classes are numbered 0, 1, 2, ... in the order ``os.walk``
    yields them. Each image is randomly routed to the validation, test or
    training writer. After ``limit`` images have been written to the current
    shard, the three writers are closed and a new shard is started.

    Args:
        testing_percentage: Chance (out of 100) that an image goes to the
            test set.
        validation_percentage: Chance (out of 100) that an image goes to the
            validation set.
    """
    # The writers fail if the target directory is missing, so create it.
    for prefix in (OUTPUT_TRAIN_DATA, OUTPUT_TEST_DATA, OUTPUT_VALIDATION_DATA):
        out_dir = os.path.dirname(prefix)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    # os.walk yields the root folder first, followed by its sub-folders.
    sub_dirs = [x[0] for x in os.walk(INPUT_DATA)]
    # Flag used to skip the root folder on the first iteration.
    is_root_dir = True
    current_label = 0

    # k counts the images written to the current shard;
    # num is the index of the shard currently being written.
    k = 0
    num = 0

    train_writer, test_writer, validation_writer = _open_shard_writers(num)
    try:
        for sub_dir in sub_dirs:
            if is_root_dir:
                is_root_dir = False
                continue

            # Collect all JPEG files of this sub-directory (e.g. "daisy").
            extensions = ['jpg', 'jpeg', 'JPG', 'JPEG']
            file_list = []
            dir_name = os.path.basename(sub_dir)
            for extension in extensions:
                # Glob pattern matching one extension inside this class folder.
                file_glob = os.path.join(INPUT_DATA, dir_name, '*.' + extension)
                print(file_glob)
                file_list.extend(glob.glob(file_glob))
            # Folders without images do not consume a label.
            if not file_list:
                continue
            print("processing ", dir_name)

            for file_name in file_list:
                # Shard is full: close the old writers (flushing them and
                # releasing their handles) before opening the next shard.
                if k >= limit:
                    train_writer.close()
                    test_writer.close()
                    validation_writer.close()
                    k = 0
                    num += 1
                    train_writer, test_writer, validation_writer = _open_shard_writers(num)

                # Read the raw JPEG bytes; the context manager closes the file.
                with gfile.FastGFile(file_name, 'rb') as image_file:
                    image_raw_data = image_file.read()
                # 300 is the hard-coded value stored in the 'pixels' feature.
                example = _make_example(300, current_label, image_raw_data)
                serialized = example.SerializeToString()

                # Draw an integer in [0, 100) and route the example accordingly.
                chance = np.random.randint(100)
                if chance < validation_percentage:
                    validation_writer.write(serialized)
                elif chance < (testing_percentage + validation_percentage):
                    test_writer.write(serialized)
                else:
                    train_writer.write(serialized)

                k += 1
                if k % 200 == 0:
                    print("processing...")
            # Finished one class: move on to the next label.
            current_label += 1
    finally:
        # Always close the writers of the last shard, even on failure.
        train_writer.close()
        test_writer.close()
        validation_writer.close()

def main():
    """Run the conversion with the module-level test/validation percentages."""
    processData(testing_percentage=TEST_PERCENTAGE,
                validation_percentage=VALIDATION_PERCENTAGE)


if __name__ == '__main__':
    main()

inference.py

# -*- coding: UTF-8 -*-
#Author：Yinli

import tensorflow as tf
import tensorflow.contrib.slim as slim

'''
定义神经网络的前向传播过程
'''

# IMAGE_SIZE is the side length of the input image, NUM_CHANNELS is the depth
# of the image data (typically 1 for grayscale, 3 for color), and NUM_LABELS
# is the number of classes to predict.
IMAGE_SIZE = 300
NUM_CHANNELS = 3
NUM_LABELS = 5

# Depth and kernel size of the first convolutional layer.
CONV1_DEEP = 32
CONV1_SIZE = 5

# Depth and kernel size of the second convolutional layer.
CONV2_DEEP = 64
CONV2_SIZE = 5

# Number of nodes in the fully connected layer.
FC_SIZE = 500


def inference(inputs, is_training=True):
    """Build the forward pass of the LeNet-5 style network with TF-Slim.

    Args:
        inputs: Image tensor; it is reshaped to
            (-1, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS).
        is_training: Whether dropout is active. Defaults to True, which is
            what the previous implementation always did.

    Returns:
        The un-activated logits tensor with NUM_LABELS outputs per example.
    """
    inputs = tf.reshape(inputs, (-1, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS))
    net = slim.conv2d(inputs, CONV1_DEEP, [CONV1_SIZE, CONV1_SIZE], padding='SAME', scope='layer1_conv')
    net = slim.max_pool2d(net, 2, stride=2, scope='layer2-max-pool')
    net = slim.conv2d(net, CONV2_DEEP, [CONV2_SIZE, CONV2_SIZE], padding='SAME', scope='layer3-conv')
    net = slim.max_pool2d(net, 2, stride=2, scope='layer4-max-pool')
    net = slim.flatten(net, scope='flatten')
    net = slim.fully_connected(net, FC_SIZE, scope='layer5')
    net = slim.dropout(net, 0.5, is_training=is_training, scope='dropout')
    # slim.fully_connected applies ReLU by default; the output layer must
    # produce raw logits because the softmax is applied inside the loss.
    net = slim.fully_connected(net, NUM_LABELS, activation_fn=None, scope='output')
    return net

processData.py

# -*- coding: UTF-8 -*-
#Author：Yinli

import tensorflow as tf

'''
处理图像原始数据
'''

def decodeImage(image_raw_data, image_size=300):
    """Decode raw JPEG bytes into a fixed-size float32 RGB image tensor.

    Args:
        image_raw_data: Scalar string tensor holding the encoded JPEG.
        image_size: Side length of the square output image. Defaults to 300,
            the size that was previously hard-coded.

    Returns:
        A float32 tensor with static shape [image_size, image_size, 3].
    """
    # Force three channels so grayscale JPEGs do not break set_shape below.
    image = tf.image.decode_jpeg(image_raw_data, channels=3)
    # decode_jpeg yields uint8; convert to float32.
    image = tf.image.convert_image_dtype(image, dtype=tf.float32)
    # Resize to the fixed input resolution expected by the network.
    image = tf.image.resize_images(image, [image_size, image_size])
    image.set_shape([image_size, image_size, 3])

    return image

train.py

# -*- coding: UTF-8 -*-
# Author：Yinli

import tensorflow as tf
import processData
from inference import inference
import datetime

'''
神经网络训练
'''

# Collect the training data files that match a glob pattern.
# NOTE(review): ToTFRecord.py writes its shards with the prefix
# "data/output_train"; this pattern only matches them if the script is run
# from inside that directory or the files were moved - confirm.
input_train_files = tf.train.match_filenames_once("./output_train_*")

# Build the input file-name queue (files are not shuffled).
input_train_queue = tf.train.string_input_producer(input_train_files, shuffle=False)

# Training hyper-parameters.
LEARNING_RATE = 0.0001
STEPS = 500
N_CLASSES = 5

# Queue parameters and the number of preprocessing threads.
min_after_dequeue = 500
batch_size = 10
capacity = min_after_dequeue + 3 * batch_size
num_threads = 1

def main():
    """Train the network for STEPS steps and report the elapsed time.

    Reads serialized examples from the module-level ``input_train_queue``,
    preprocesses them with ``processData.decodeImage`` and batches them with
    ``tf.train.shuffle_batch`` using ``num_threads`` threads, then runs
    gradient descent and prints how long the training loop took.
    """
    reader = tf.TFRecordReader()

    # Parse one serialized example from the file queue.
    _, train_example = reader.read(input_train_queue)
    features = tf.parse_single_example(
        train_example,
        features={
            'image_raw': tf.FixedLenFeature([], tf.string),
            'pixels': tf.FixedLenFeature([], tf.int64),
            'label': tf.FixedLenFeature([], tf.int64)
        }
    )
    image, label = features['image_raw'], features['label']
    # Preprocess the raw image bytes.
    decoded_image = processData.decodeImage(image)

    # Group the preprocessed images and labels into shuffled batches.
    image_batch, label_batch = tf.train.shuffle_batch([decoded_image, label], num_threads=num_threads,
                                                      batch_size=batch_size, capacity=capacity,
                                                      min_after_dequeue=min_after_dequeue)

    # Forward pass.
    logit = inference(image_batch)
    # Loss: softmax cross-entropy against the one-hot labels.
    tf.losses.softmax_cross_entropy(tf.one_hot(label_batch, N_CLASSES), logit, weights=1.0)
    loss = tf.losses.get_total_loss()
    # Training op.
    train_step = tf.train.GradientDescentOptimizer(LEARNING_RATE).minimize(loss)

    with tf.Session() as sess:
        # Local variables must be initialized too: match_filenames_once
        # stores its result in one.
        init = tf.global_variables_initializer(), tf.local_variables_initializer()
        sess.run(init)

        # Start the queue-runner threads.
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        try:
            start = datetime.datetime.now()
            for i in range(STEPS):
                if (i + 1) % 100 == 0:
                    print("正在进行第%d轮训练..." % (i + 1))
                sess.run(train_step)
            # Stop the clock before shutting the threads down so that only
            # the training loop is measured.
            end = datetime.datetime.now()
        finally:
            # Always stop the queue runners, even if training raised.
            coord.request_stop()
            coord.join(threads)

        # Report the training time.
        print("预处理线程数为", num_threads)
        print("%d轮训练一共花了" % STEPS, int((end - start).total_seconds()), "秒")


if __name__ == '__main__':
    main()

运行结果

这里分别采用了一个线程和三个线程来进行预处理，并进行了500次训练所用的时间的对比，这两种方式均为一个reader读取数据，运行结果如下所示：

一个线程进行预处理

三个线程进行预处理

从结果可以看到采用多线程来进行数据的预处理可以有效地减少神经网络训练的瓶颈时间，但对于复杂的神经网络应该效果不大，因为复杂的神经网络的时间主要消耗在训练过程而不是数据预处理过程，但是对于数据量大的问题，采用多线程来进行数据的预处理是一个可行的优化方法。