MTCNN网络之P网络——pytorch代码实现

2023-06-17 本文已影响0人小黄不头秃

MTCNN网络是多任务级联卷积神经网络（Multi-task Cascaded Convolutional Networks），是一种基于级联结构的特定目标检测器，在人脸检测中有着出色的表现。它由P、R、O三个网络构成。常用于目标检测的算法还有SSD（Single Shot MultiBox Detector，单次多框检测器）、RCNN、YOLO等。

如果对MTCNN的基础知识不太清楚，可以参考我的文章：

一、网络结构

MTCNN由三个子网络构成：P网络、R网络、O网络
P网络的输入是12*12，R网络输入是24*24，O网络输入是48*48。P网络注重从大量数据中筛选有效信息，R网络对P网络的筛选结果做二次筛选，O网络注重识别的精度。

（1）P网络（3层）

图像金字塔和滑动窗口把一张图片分割成了很多张图片，所以P网络面对的数据量是最大的。

第一层：3*3卷积核，2*2最大池化层，输出特征图为：5*5*10
第二层：3*3卷积核，输出特征图为：3*3*16
第三层：3*3卷积核，输出特征图为：1*1*32；随后接两个并列的1*1卷积，分别输出1个置信度和4个坐标偏移量，合计1*1*5（这里我们仅需要置信度和四个坐标）

代码实现：

"""
P网络结构
"""
import torch 
import torch.nn as nn
from PIL import Image
import torchvision

class PNet(nn.Module):
    """Proposal network (P-Net) of MTCNN.

    A small fully-convolutional network. For a 12x12 RGB input the backbone
    reduces the spatial size to 1x1 (12 -> 10 -> 5 -> 3 -> 1), so the two
    heads return tensors of shape [N, 1, 1, 1] (face confidence) and
    [N, 4, 1, 1] (bounding-box offsets). Larger inputs yield larger maps.
    """

    def __init__(self):
        super().__init__()
        # The modules are registered in this exact order so that the
        # state-dict keys ("layers.0.weight", ...) stay stable.
        backbone = [
            # 3x3 conv + BN + ReLU + 2x2 max-pool: 12x12x3 -> 5x5x10
            nn.Conv2d(3, 10, 3, 1, bias=False),
            nn.BatchNorm2d(10),
            nn.ReLU(),
            nn.MaxPool2d(2),
            # 3x3 conv + BN + ReLU: 5x5x10 -> 3x3x16
            nn.Conv2d(10, 16, 3, 1, bias=False),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            # 3x3 conv: 3x3x16 -> 1x1x32 (shared features for both heads)
            nn.Conv2d(16, 32, 3, 1),
        ]
        self.layers = nn.Sequential(*backbone)

        # 1x1 convolution heads on top of the 32-channel feature map.
        self.cond = nn.Conv2d(32, 1, 1, 1)
        self.offset = nn.Conv2d(32, 4, 1, 1)

    def forward(self, x):
        """Return ``(confidence, offset)`` for the image batch ``x``.

        ``confidence`` is passed through a sigmoid and therefore lies in
        [0, 1]; ``offset`` is the raw output of the regression head.
        """
        features = self.layers(x)
        confidence = torch.sigmoid(self.cond(features).float())
        box_offset = self.offset(features)
        return confidence, box_offset

if __name__ == "__main__":
    # Smoke test: run the trained P-Net on one 12x12 sample and print the
    # predicted confidence and box offsets.
    net = PNet()
    # map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
    net.load_state_dict(torch.load("./param/pnet.pt", map_location="cpu"))
    # Inference must use the BatchNorm running statistics; in training mode
    # the statistics of this single image would be used instead.
    net.eval()
    with Image.open(r"D:\document\DL\MTCNN_face_detect\dataset\12\positive\0.jpg") as img:
        # The first convolution expects 3 channels, so force RGB.
        x = torchvision.transforms.ToTensor()(img.convert("RGB")).unsqueeze_(0)
    with torch.no_grad():  # no autograd graph is needed for inference
        y1, y2 = net(x)
    print(y1)
    print(y2)

P网络数据准备：

数据准备时应注意：

我们在准备数据集的时候，一定要让负样本的数量远多于正样本的数量和部分样本的数量，
IOU阈值不能设置过小，也不能设置过大，否则会导致损失降不下去，准确率不够高。
数据量一定要够，数据量不够的话，一些不是人脸的物体容易被误识别
如果缺失亚洲人人脸，可以手动加入一些数据

"""
wider-face 生成12*12的训练数据
"""
import os
import sys
sys.path.append("./")
# print(os.getcwd())
import cv2
from PIL import Image
import numpy as np
from tqdm import tqdm
from utils.utils import IOU2 
import torch

TARGET_PATH = "./dataset/"
IMG_PATH = "./dataset/WIDER_train/images/"
DST_PATH = "./dataset/"
# Side length (pixels) of the generated square training crops.
face_size = 12

label_file_path = TARGET_PATH + "wider_face_train.txt"

# Read the source annotation list first: if it is missing, the script stops
# before any previously generated list file is truncated.
with open(label_file_path, 'r') as f:
    annotations = f.readlines()
num = len(annotations)
print('总共的图片数： %d' % num)

# Root folder for this crop size and the per-class image folders.
save_dir = os.path.join(DST_PATH, f'{face_size}/')
pos_save_dir = os.path.join(DST_PATH, f'{face_size}/positive')
part_save_dir = os.path.join(DST_PATH, f'{face_size}/part')
neg_save_dir = os.path.join(DST_PATH, f'{face_size}/negative')

# Create every output folder from face_size (not a hard-coded "12");
# exist_ok makes repeated runs safe.
for directory in (save_dir, pos_save_dir, part_save_dir, neg_save_dir):
    os.makedirs(directory, exist_ok=True)

# Label list files for the generated crops; they stay open because the
# generation loop that follows writes one line per saved crop.
positive_file = open(os.path.join(save_dir, 'positive.txt'), 'w')
negative_file = open(os.path.join(save_dir, 'negative.txt'), 'w')
part_file = open(os.path.join(save_dir, 'part.txt'), 'w')

# Running counts of generated positive / negative / part samples; they are
# also used as the file names of the saved crops.
positive_counter = 0
negative_counter = 0
part_counter = 0

# Number of annotated images opened so far.
idx = 0
try:
    for anno in tqdm(annotations):
        anno = anno.strip().split(' ')
        img_name = anno[0]
        img_path = IMG_PATH+img_name+".jpg"
        # Face box coordinates as floats (x1, y1, x2, y2, ...). Only the first
        # box of the line is used below.
        box = list(map(float, anno[1:]))
        w = box[2] - box[0]
        h = box[3] - box[1]

        with Image.open(img_path) as img:
            idx += 1
            img_w, img_h = img.size

            # Skip faces that are too small and annotations that are invalid.
            if max(w, h) < 40 or box[0] < 0 or box[1] < 0 or w < 0 or h < 0:
                continue
            # The annotations are loose, so shrink the box a little.
            x1 = int(box[0] + w*0.12)
            y1 = int(box[1] + h*0.1)
            x2 = int(box[0] + w*0.9)
            y2 = int(box[1] + h*0.85)
            w = int(x2-x1)
            h = int(y2-y1)

            boxes = [[x1, y1, x2, y2]]
            cx = x1 + w/2
            cy = y1 + h/2

            # Integer sampling bounds, computed once per face. Passing floats
            # to np.random.randint (and an empty range for narrow faces) was
            # the source of crashes here.
            max_dx = int(w*0.2)
            max_dy = int(h*0.2)
            # At least 1 so the offset division below never divides by zero.
            min_side = max(int(min(w, h)*0.8), 1)
            max_side = int(np.ceil(max(w, h)*1.25))

            # Positive and part samples: jitter the crop centre around the
            # face centre and pick a random square side length.
            for _ in range(5):
                w_ = np.random.randint(-max_dx, max_dx) if max_dx > 0 else 0
                h_ = np.random.randint(-max_dy, max_dy) if max_dy > 0 else 0
                cx_ = cx + w_
                cy_ = cy + h_

                if max_side > min_side:
                    side_len = np.random.randint(min_side, max_side)
                else:
                    side_len = min_side
                # Clamp the top-left corner to the image. The built-in max is
                # required: np.max(value, 0) treats 0 as an axis, not a bound.
                x1_ = max(cx_ - side_len/2, 0)
                y1_ = max(cy_ - side_len/2, 0)
                x2_ = x1_ + side_len
                y2_ = y1_ + side_len

                crop_box = np.array([x1_, y1_, x2_, y2_])

                # Regression targets: corner displacement relative to the
                # crop, normalised by the crop side length.
                offset_x1 = (x1 - x1_) / side_len
                offset_y1 = (y1 - y1_) / side_len
                offset_x2 = (x2 - x2_) / side_len
                offset_y2 = (y2 - y2_) / side_len

                face_crop = img.crop(crop_box)
                face_resize = face_crop.resize((face_size, face_size), Image.Resampling.LANCZOS)

                iou = IOU2(torch.tensor(boxes), torch.tensor(crop_box)).item()

                if iou > 0.65:  # positive sample
                    positive_file.write(f"positive/{positive_counter}.jpg 1 {offset_x1} {offset_y1} {offset_x2} {offset_y2}\n")
                    positive_file.flush()
                    face_resize.save(pos_save_dir+f"/{positive_counter}.jpg")
                    positive_counter += 1
                elif iou > 0.4:  # part sample
                    part_file.write(f"part/{part_counter}.jpg 2 {offset_x1} {offset_y1} {offset_x2} {offset_y2}\n")
                    part_file.flush()
                    face_resize.save(part_save_dir+f"/{part_counter}.jpg")
                    part_counter += 1
                # Crops with a lower IoU are dropped; negatives are sampled
                # separately below.

            _boxes = torch.tensor(boxes[0])

            # Negative samples: random squares anywhere in the image that
            # barely overlap the face. Images too small to hold such a square
            # are skipped instead of crashing randint with an empty range.
            max_neg_side = min(img_w, img_h) // 2
            if max_neg_side <= face_size:
                continue
            for _ in range(5):
                side_len = np.random.randint(face_size, max_neg_side)
                x_ = np.random.randint(0, img_w - side_len)
                y_ = np.random.randint(0, img_h - side_len)
                crop_box = np.array([x_, y_, x_+side_len, y_+side_len])

                if IOU2(torch.tensor(crop_box), _boxes).item() < 0.1:
                    face_crop = img.crop(crop_box)
                    face_resize = face_crop.resize((face_size, face_size), Image.Resampling.LANCZOS)

                    negative_file.write(f"negative/{negative_counter}.jpg 0 0 0 0 0\n")
                    negative_file.flush()
                    face_resize.save(neg_save_dir+f"/{negative_counter}.jpg")
                    negative_counter += 1
finally:
    # Always release the label list files, even when an image fails to load.
    positive_file.close()
    negative_file.close()
    part_file.close()

P网络训练代码实现：

import torch 
import os 
import sys
sys.path.append("./")
import torch.nn as nn
from models.PNet import PNet
from utils.trainer import Trainer 
from utils.data import MTCNN_Dataset
from torch.utils.data import DataLoader, Dataset 

# Checkpoint file: loaded by the trainer when it exists and overwritten
# periodically during training.
model_save_path = "./param/pnet.pt"
# torch.save does not create missing folders, so make sure the checkpoint
# directory exists before training starts.
os.makedirs(os.path.dirname(model_save_path), exist_ok=True)

# 12x12 samples produced by the data-generation script.
data = MTCNN_Dataset("./dataset/12")
data_loader = DataLoader(data, shuffle=True, batch_size=100)
net = PNet()

# Train for 50 epochs.
trainer = Trainer(net, model_save_path,data_loader,50)
trainer.train()

训练器和工具类

import torch 
import torch.nn as nn
import os 
import sys
sys.path.append("./")
from utils.utils import accuracy

class Trainer():
    """Training loop for a network that returns ``(confidence, offset)``.

    Args:
        net: model to train; its forward pass must return the confidence
            map and the 4-value offset map.
        param_path: checkpoint file. It is loaded before training when it
            exists and overwritten every 100 batches.
        data_loader: iterable yielding ``(img, cat, offset)`` batches. The
            loss code treats label 0 as negative, 1 as positive and 2 as
            part sample.
            NOTE(review): ``cat`` is assumed to have shape [N, 1] so that it
            lines up with the reshaped confidence output - confirm against
            the dataset class.
        epoch: number of passes over ``data_loader``.
    """

    def __init__(self,net, param_path, data_loader, epoch):
        self.net = net
        self.param_path = param_path
        self.data_loader = data_loader
        self.epoch = epoch
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.offsetLoss = nn.MSELoss(reduction="mean")
        # SGD with a small learning rate, used for fine-tuning.
        self.optim = torch.optim.SGD(self.net.parameters(),lr=1e-5,weight_decay=0.0001)
        self.scheduler = torch.optim.lr_scheduler.MultiStepLR(self.optim, milestones=[6, 14, 20], gamma=0.1)

    def train(self):
        """Run the training loop, resuming from ``param_path`` when present."""
        if os.path.exists(self.param_path):
            # map_location allows a checkpoint saved on a GPU to be resumed
            # on a CPU-only machine (and vice versa).
            state = torch.load(self.param_path, map_location=self.device)
            self.net.load_state_dict(state)
        self.net.to(self.device)
        self.net.train()

        for e in range(self.epoch):
            for b_i, (img, cat, offset) in enumerate(self.data_loader):
                # .to() is a no-op when the tensor is already on the device.
                img = img.to(self.device)
                cat = cat.to(self.device)
                offset = offset.to(self.device)
                cat_out, offset_out = self.net(img)
                cat_out = cat_out.reshape(-1,1)  # [N, 1]
                offset_out = offset_out.reshape(-1, 4)  # [N, 4]

                # Confidence loss uses only negative (0) and positive (1)
                # samples; offset loss uses only positive (1) and part (2).
                category_mask = cat < 2
                category_select = torch.masked_select(cat, category_mask)
                category_out_select = torch.masked_select(cat_out, category_mask)
                if category_select.numel() > 0:
                    # Hand-written binary cross-entropy on clamped
                    # probabilities; nn.BCELoss produced NaN here.
                    pred = torch.clamp(category_out_select, min=1e-7, max=1-1e-7)
                    cls_loss = - category_select * torch.log(pred) - (1-category_select) * torch.log(1-pred)
                    cls_loss = torch.mean(cls_loss)
                else:
                    # The mean of an empty selection is NaN and would poison
                    # the weights; use a zero that stays in the graph.
                    cls_loss = cat_out.sum() * 0.0

                offset_mask = cat > 0
                offset_index = torch.nonzero(offset_mask)[:,0]
                if offset_index.numel() > 0:
                    offset_select = offset[offset_index]
                    offset_out_select = offset_out[offset_index]
                    offset_loss = self.offsetLoss(offset_out_select, offset_select)
                else:
                    # Same NaN guard for a batch without positive/part samples.
                    offset_loss = offset_out.sum() * 0.0
                loss = cls_loss+offset_loss

                # Parameter update.
                self.optim.zero_grad()
                loss.backward()
                self.optim.step()

                # Report progress and checkpoint every 100 batches.
                if b_i%100 == 0:
                    acc = accuracy(class_out=cat_out, label=cat)
                    print('Train epoch %d, batch %d, total_loss: %f, cls_loss: %f, box_loss: %f,'
                  'accuracy：%.2f' % (e, b_i, loss.item(), cls_loss.item(), offset_loss.item(), acc*100))
                    torch.save(self.net.state_dict(), self.param_path)

            # Learning-rate schedule advances once per epoch.
            self.scheduler.step()

import pickle
import shutil
import numpy as np
import random
import os
import cv2
from tqdm import tqdm
import torch

def IOU(box, boxes):
    """IoU between one crop box and every face box (NumPy version).

    Coordinates are treated as inclusive pixel indices, hence the ``+ 1``
    when widths and heights are computed.

    Args:
        box: crop box; the first four entries are x1, y1, x2, y2 (a fifth
            entry, e.g. a confidence, is ignored).
        boxes: face boxes, array of shape [n, 4].

    Returns:
        Array of shape [n] with the IoU of ``box`` against each face box.
    """
    # Corners of the intersection rectangle.
    left = np.maximum(box[0], boxes[:, 0])
    top = np.maximum(box[1], boxes[:, 1])
    right = np.minimum(box[2], boxes[:, 2])
    bottom = np.minimum(box[3], boxes[:, 3])

    # Width/height are floored at 0 for boxes that do not overlap.
    inter = np.maximum(0, right - left + 1) * np.maximum(0, bottom - top + 1)

    crop_area = (box[2] - box[0] + 1) * (box[3] - box[1] + 1)
    face_areas = (boxes[:, 2] - boxes[:, 0] + 1) * (boxes[:, 3] - boxes[:, 1] + 1)
    union = crop_area + face_areas - inter

    # The small epsilon keeps the division defined.
    return inter / (union + 1e-10)

def IOU2(box, other_boxes):
    """Intersection-over-union of one box against N boxes (torch version).

    Args:
        box: tensor [x1, y1, x2, y2] (top-left and bottom-right corner); a
            2-D tensor such as [1, 4] is squeezed first.
        other_boxes: tensor of shape [N, 4]; a single 1-D box is promoted
            to [1, 4].

    Returns:
        Tensor of shape [N] with the IoU of ``box`` against each box.
    """
    if len(box.shape)==2: box = box.squeeze()
    if len(other_boxes.shape)==1: other_boxes = other_boxes.unsqueeze(dim=0)
    box_area = (box[2]-box[0])*(box[3]-box[1])
    other_boxes_area = (other_boxes[:,2]-other_boxes[:,0]) * (other_boxes[:,3]-other_boxes[:,1])

    # Corners of the intersection rectangle.
    x1 = torch.max(box[0],other_boxes[:,0])
    y1 = torch.max(box[1],other_boxes[:,1])
    x2 = torch.min(box[2],other_boxes[:,2])
    y2 = torch.min(box[3],other_boxes[:,3])
    # Disjoint boxes give negative extents; floor them at zero. clamp keeps
    # the device of the inputs, unlike comparing against a freshly created
    # CPU tensor.
    w = torch.clamp(x2-x1, min=0)
    h = torch.clamp(y2-y1, min=0)
    overlap_area = w*h

    # IoU = intersection / union
    iou = overlap_area / (box_area+other_boxes_area-overlap_area)
    return iou

def accuracy(class_out, label):
    """Classification accuracy over negative (0) and positive (1) samples.

    Part samples (label 2) carry no classification target: a rounded
    sigmoid score can only be 0 or 1, so they could never count as correct
    and are excluded from both the numerator and the denominator.

    Args:
        class_out: tensor of predicted confidences in [0, 1].
        label: tensor of class labels with the same number of elements.

    Returns:
        Fraction of negative/positive samples whose rounded confidence
        equals the label; 0.0 when there are no such samples.
    """
    # Flatten both so [N, 1] scores and [N] labels compare element-wise
    # instead of broadcasting to an N x N matrix.
    scores = class_out.detach().cpu().numpy().reshape(-1)
    labels = label.detach().cpu().numpy().reshape(-1)
    valid = labels < 2
    total = np.count_nonzero(valid)
    if total == 0:
        return 0.0
    correct = np.sum(scores[valid].round() == labels[valid])
    return correct / total

def IOU3(box, other_boxes, isMin=False):
    """Overlap ratio of one box against N boxes (torch version).

    Args:
        box: tensor [x1, y1, x2, y2] (top-left and bottom-right corner); a
            2-D tensor such as [1, 4] is squeezed first.
        other_boxes: tensor of shape [N, 4]; a single 1-D box is promoted
            to [1, 4].
        isMin: when True the intersection is divided by the smaller of the
            two areas instead of by the union.

    Returns:
        Tensor of shape [N] with the overlap ratio against each box.
    """
    if len(box.shape)==2: box = box.squeeze()
    if len(other_boxes.shape)==1: other_boxes = other_boxes.unsqueeze(dim=0)
    box_area = (box[2]-box[0])*(box[3]-box[1])
    other_boxes_area = (other_boxes[:,2]-other_boxes[:,0]) * (other_boxes[:,3]-other_boxes[:,1])

    # Corners of the intersection rectangle.
    x1 = torch.max(box[0],other_boxes[:,0])
    y1 = torch.max(box[1],other_boxes[:,1])
    x2 = torch.min(box[2],other_boxes[:,2])
    y2 = torch.min(box[3],other_boxes[:,3])
    # Disjoint boxes give negative extents; floor them at zero. clamp keeps
    # the device of the inputs, unlike comparing against a freshly created
    # CPU tensor.
    w = torch.clamp(x2-x1, min=0)
    h = torch.clamp(y2-y1, min=0)
    overlap_area = w*h

    if isMin:
        # intersection / smaller area
        iou = overlap_area / torch.min(box_area,other_boxes_area)
    else:
        # intersection / union
        iou = overlap_area / (box_area+other_boxes_area-overlap_area)
    return iou


def NMS(boxes, thre = 0.5, isMin=False):
    """Non-maximum suppression.

    Repeatedly keeps the highest-scoring remaining box and discards every
    other box whose overlap with it (computed by ``IOU3``) is not below
    ``thre``, until no boxes remain.

    Args:
        boxes: tensor of rows [confidence, x1, y1, x2, y2].
        thre: overlap threshold; boxes with overlap >= thre are suppressed.
        isMin: forwarded to ``IOU3`` (divide by the smaller area instead of
            the union).

    Returns:
        Tensor with the kept rows in descending confidence order, or an
        empty tensor when ``boxes`` is empty.
    """
    if len(boxes) == 0:
        return torch.tensor([])

    # Sort rows by confidence, highest first.
    remaining = boxes[boxes[:, 0].argsort(descending=True)]

    kept = []
    while len(remaining) > 1:
        ref_box = remaining[0]
        kept.append(ref_box)

        candidates = remaining[1:]
        overlap = IOU3(ref_box[1:], candidates[:, 1:], isMin=isMin)
        # Only boxes sufficiently different from the reference survive.
        remaining = candidates[overlap < thre]

    # A single leftover box has nothing to be compared against; keep it.
    if len(remaining) > 0:
        kept.append(remaining[0])

    return torch.stack(kept)

def convert2square(p_box):
    """Expand each box to a square with the same centre.

    The side of the square is the longer side of the original box.

    Args:
        p_box: NumPy array of shape [n, >=4]; columns 0-3 are x1, y1, x2,
            y2. Any extra columns are copied through unchanged.

    Returns:
        Array of the same shape with columns 0-3 replaced by the square
        boxes, or an empty array when ``p_box`` has no rows.
    """
    square_box = p_box.copy()
    if square_box.shape[0] == 0: return np.array([])
    h = p_box[:, 3] - p_box[:, 1]
    w = p_box[:, 2] - p_box[:, 0]
    maxside = np.maximum(h,w)
    # Centre minus half the square side. The y coordinate must use the
    # height (the x coordinate uses the width) to stay centred vertically.
    square_box[:, 0] = p_box[:, 0] + w*0.5 - maxside*0.5
    square_box[:, 1] = p_box[:, 1] + h*0.5 - maxside*0.5
    square_box[:, 2] = square_box[:, 0] + maxside
    square_box[:, 3] = square_box[:, 1] + maxside
    return square_box

本文代码仅供参考，具体代码可参照：Pytorch-MTCNN: 基于Pytorch实现的MTCNN模型，人脸检测，人脸关键点检测。 (gitee.com)