划分数据集(python)

2018-10-28  本文已影响0人  huim

将特定文件路径下的图片数据划分为训练集、验证集和测试集。

import glob
import os.path
import random

# 数据路径
INPUT_DATA = './flower_photos'

# 按一定比例划分数据集
def create_image_lists(testing_percentage, validation_percentage,
                       input_data=None):
    """Randomly split the images under a root directory into three sets.

    Every sub-directory found under the root is treated as one label.
    Each JPEG file found for a label is assigned independently at random
    to the validation, testing or training list, so the resulting
    proportions only approximate the requested percentages.

    Args:
        testing_percentage: Chance (0-100) that an image lands in the
            testing list.
        validation_percentage: Chance (0-100) that an image lands in the
            validation list.
        input_data: Root directory to scan. Defaults to the module-level
            ``INPUT_DATA`` constant when omitted.

    Returns:
        A dict keyed by the lower-cased sub-directory name. Each value is
        a dict with the keys ``'dir'`` (the sub-directory name as found on
        disk) and ``'training'``, ``'testing'``, ``'validation'`` (lists
        of image base names). Sub-directories without any matching image
        are left out.
    """
    if input_data is None:
        input_data = INPUT_DATA

    # Image file extensions that are searched for in every sub-directory.
    extensions = ['jpeg', 'jpg', 'JPG', 'JPEG']

    result = {}
    sub_dirs = [x[0] for x in os.walk(input_data)]
    # os.walk yields the root directory itself first; skip it.
    for sub_dir in sub_dirs[1:]:
        dir_name = os.path.basename(sub_dir)

        matches = set()
        for extension in extensions:
            file_glob = os.path.join(input_data, dir_name, '*.' + extension)
            matches.update(glob.glob(file_glob))
        if not matches:
            continue
        # On case-insensitive file systems '*.jpg' and '*.JPG' match the
        # same files; the set removes those duplicates and sorting gives a
        # stable processing order.
        file_list = sorted(matches)

        label_name = dir_name.lower()
        training_images = []
        testing_images = []
        validation_images = []
        for file_name in file_list:
            base_name = os.path.basename(file_name)
            # Uniform integer in [0, 100) compared against the cumulative
            # percentage thresholds.
            chance = random.randrange(100)
            if chance < validation_percentage:
                validation_images.append(base_name)
            elif chance < (testing_percentage + validation_percentage):
                testing_images.append(base_name)
            else:
                training_images.append(base_name)

        result[label_name] = {
            'dir': dir_name,
            'training': training_images,
            'testing': testing_images,
            'validation': validation_images,
        }

    return result

获取图片样本的完整路径。

# 图片路径
def get_image_path(image_lists, image_dir, label_name, index, category):
    """Return the full path of one image of a given label and category.

    Args:
        image_lists: Mapping from label name to a dict holding a ``'dir'``
            entry and one list of image base names per category.
        image_dir: Root directory that contains the label sub-directories.
        label_name: Key of the label to look up in ``image_lists``.
        index: Integer position of the wanted image. It is wrapped with a
            modulo, so any integer selects some image of the list.
        category: Key of the list to pick from within the label's dict.

    Returns:
        ``image_dir``, the label's ``'dir'`` value and the selected base
        name joined into a single path.

    Raises:
        KeyError: If ``label_name`` or ``category`` is not present.
        ZeroDivisionError: If the selected category list is empty.
    """
    label_lists = image_lists[label_name]
    category_list = label_lists[category]
    # Wrap the index so that it always falls inside the list.
    mod_index = index % len(category_list)
    base_name = category_list[mod_index]
    sub_dir = label_lists['dir']
    full_path = os.path.join(image_dir, sub_dir, base_name)
    return full_path
上一篇 下一篇

猜你喜欢

热点阅读