MTCNN网络之P网络——pytorch代码实现
2023-06-17 本文已影响0人
小黄不头秃
MTCNN(Multi-task Cascaded Convolutional Networks,多任务级联卷积神经网络)是一种基于级联结构的特定目标检测器,在人脸检测中有着出色的表现,由P、R、O三个网络构成。常用的目标检测算法还有SSD(Single Shot MultiBox Detector,单次多框检测器)、RCNN、YOLO等。
如果对MTCNN的基础知识不太清楚,可以参考我的文章:
一、网络结构
MTCNN由三个子网络构成:P网络、R网络、O网络
P网络的输入是12*12,R网络输入是24*24,O网络输入是48*48。P网络注重从大量数据中筛选有效信息,R网络对P网络的筛选结果做二次筛选,O网络注重识别的精度。
(1)P网络(3层)
图像金字塔和滑动窗口把一张图片分割成了很多张图片,所以P网络面对的数据量是最大的。
- 第一层:3*3卷积核,2*2最大池化层,输出特征图为:5*5*10
- 第二层:3*3卷积核,输出特征图为:3*3*16
- 第三层:3*3卷积核,输出特征图为:1*1*32;随后接两个1*1卷积分支,分别输出1个置信度和4个坐标偏移量(合计1*1*5,这里我们仅需要置信度和四个坐标)
代码实现:
"""
P网络结构
"""
import torch
import torch.nn as nn
from PIL import Image
import torchvision
class PNet(nn.Module):
    """Proposal network (P-Net), the first stage of the MTCNN cascade.

    The network is fully convolutional. For a 3x12x12 input the shared
    backbone yields a 32x1x1 feature map; two 1x1 convolution heads then
    produce a 1-channel face confidence (passed through a sigmoid) and a
    4-channel bounding-box offset.
    """

    def __init__(self):
        super(PNet, self).__init__()
        backbone = [
            # [N, 3, 12, 12] -> conv -> [N, 10, 10, 10] -> pool -> [N, 10, 5, 5]
            nn.Conv2d(3, 10, kernel_size=3, stride=1, bias=False),
            nn.BatchNorm2d(10),
            nn.ReLU(),
            nn.MaxPool2d(2),
            # [N, 10, 5, 5] -> [N, 16, 3, 3]
            nn.Conv2d(10, 16, kernel_size=3, stride=1, bias=False),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            # [N, 16, 3, 3] -> [N, 32, 1, 1]
            nn.Conv2d(16, 32, kernel_size=3, stride=1),
        ]
        self.layers = nn.Sequential(*backbone)
        # Confidence head: [N, 32, 1, 1] -> [N, 1, 1, 1]
        self.cond = nn.Conv2d(32, 1, kernel_size=1, stride=1)
        # Offset head: [N, 32, 1, 1] -> [N, 4, 1, 1]
        self.offset = nn.Conv2d(32, 4, kernel_size=1, stride=1)

    def forward(self, x):
        """Return ``(confidence, offset)`` for the image batch ``x``."""
        features = self.layers(x)
        confidence = torch.sigmoid(self.cond(features).float())
        box_offset = self.offset(features)
        return confidence, box_offset
if __name__ == "__main__":
    # Smoke test: load trained weights and run one 12x12 sample through P-Net.
    net = PNet()
    # map_location lets a checkpoint saved on a GPU be loaded on a CPU-only box.
    net.load_state_dict(torch.load("./param/pnet.pt", map_location="cpu"))
    # Inference must run in eval mode: in train mode the BatchNorm layers
    # would normalise with the statistics of this single image and also
    # overwrite their running statistics.
    net.eval()
    with Image.open(r"D:\document\DL\MTCNN_face_detect\dataset\12\positive\0.jpg") as img:
        # The first convolution expects exactly three input channels.
        x = torchvision.transforms.ToTensor()(img.convert("RGB")).unsqueeze_(0)
    with torch.no_grad():  # no autograd graph is needed for a forward pass
        y1, y2 = net(x)
    print(y1)
    print(y2)
P网络数据准备:
数据准备时应注意:
- 我们在准备数据集的时候,一定要让负样本的数量远多于正样本和部分样本的数量。
- IOU阈值不能设置过小,也不能设置过大,否则会导致损失降不下去,准确率不够高。
- 数据量一定要够,数据量不够的话,一些不是人脸的物体容易被误识别
- 如果缺失亚洲人人脸,可以手动加入一些数据
"""
wider-face 生成12*12的训练数据
"""
import os
import sys
sys.path.append("./")
# print(os.getcwd())
import cv2
from PIL import Image
import numpy as np
from tqdm import tqdm
from utils.utils import IOU2
import torch
# Generate face_size x face_size positive / part / negative training crops
# from the WIDER FACE annotation list.
#
# Each annotation line is "<image name without extension> x1 y1 x2 y2 ...".
# Only the first box of every line is used to build samples.
TARGET_PATH = "./dataset/"
IMG_PATH = "./dataset/WIDER_train/images/"
DST_PATH = "./dataset/"
face_size = 12
label_file_path = TARGET_PATH + "wider_face_train.txt"

# Output layout: <DST_PATH>/<face_size>/{positive,part,negative}/<n>.jpg plus
# one list file per class in <DST_PATH>/<face_size>/.
save_dir = os.path.join(DST_PATH, f'{face_size}/')
pos_save_dir = os.path.join(DST_PATH, f'{face_size}/positive')
part_save_dir = os.path.join(DST_PATH, f'{face_size}/part')
neg_save_dir = os.path.join(DST_PATH, f'{face_size}/negative')
# makedirs(exist_ok=True) replaces the exists()/mkdir() pairs and follows
# face_size instead of a hard-coded "12".
for _dir in (save_dir, pos_save_dir, part_save_dir, neg_save_dir):
    os.makedirs(_dir, exist_ok=True)

# Read the source annotation list first, so a missing label file does not
# truncate list files left by an earlier run.
with open(label_file_path, 'r') as f:
    annotations = f.readlines()
num = len(annotations)
print('总共的图片数: %d' % num)

# Number of samples written per class (also used as file names).
positive_counter = 0
negative_counter = 0
part_counter = 0
# Number of images opened so far.
idx = 0

# The list files are opened in one `with` so they are closed (and flushed)
# even if the loop raises.
with open(os.path.join(save_dir, 'positive.txt'), 'w') as positive_file, \
        open(os.path.join(save_dir, 'negative.txt'), 'w') as negative_file, \
        open(os.path.join(save_dir, 'part.txt'), 'w') as part_file:
    for anno in tqdm(annotations):
        anno = anno.strip().split(' ')
        img_name = anno[0]
        img_path = IMG_PATH + img_name + ".jpg"
        # Face box coordinates (top-left and bottom-right corner).
        box = list(map(float, anno[1:]))
        if len(box) < 4:
            # Blank or malformed line: nothing to crop.
            continue
        w = box[2] - box[0]
        h = box[3] - box[1]

        with Image.open(img_path) as img:
            idx += 1
            img_w, img_h = img.size
            # Skip small faces and boxes with invalid coordinates.
            if max(w, h) < 40 or box[0] < 0 or box[1] < 0 or w < 0 or h < 0:
                continue

            # The annotations are loose, so shrink the box a little.
            x1 = int(box[0] + w * 0.12)
            y1 = int(box[1] + h * 0.1)
            x2 = int(box[0] + w * 0.9)
            y2 = int(box[1] + h * 0.85)
            w = int(x2 - x1)
            h = int(y2 - y1)
            if w < 1 or h < 1:
                # Degenerate box after shrinking; the random ranges below
                # would be empty.
                continue
            boxes = [[x1, y1, x2, y2]]
            cx = x1 + w / 2
            cy = y1 + h / 2

            # Jitter limits, at least 1 so randint never gets an empty range
            # on narrow boxes. For w, h >= 5 this equals int(w * 0.2).
            max_dx = max(1, int(w * 0.2))
            max_dy = max(1, int(h * 0.2))
            side_low = max(1, int(min(w, h) * 0.8))
            side_high = int(np.ceil(max(w, h) * 1.25))

            # Positive and part samples: square crops jittered around the
            # face centre.
            for _ in range(5):
                w_ = np.random.randint(-max_dx, max_dx)
                h_ = np.random.randint(-max_dy, max_dy)
                cx_ = cx + w_
                cy_ = cy + h_
                side_len = np.random.randint(side_low, side_high)

                # Clamp the crop origin to the image. The built-in max is
                # required here: np.max(value, 0) treats 0 as an axis and
                # returns the value unchanged.
                x1_ = max(cx_ - side_len / 2, 0)
                y1_ = max(cy_ - side_len / 2, 0)
                x2_ = x1_ + side_len
                y2_ = y1_ + side_len
                crop_box = np.array([x1_, y1_, x2_, y2_])

                # Regression targets: corner offsets normalised by the crop
                # side, e.g. offset_x1 = (x1 - x1_) / side_len.
                offset_x1 = (x1 - x1_) / side_len
                offset_y1 = (y1 - y1_) / side_len
                offset_x2 = (x2 - x2_) / side_len
                offset_y2 = (y2 - y2_) / side_len

                face_crop = img.crop(crop_box)
                face_resize = face_crop.resize((face_size, face_size), Image.Resampling.LANCZOS)

                iou = IOU2(torch.tensor(boxes), torch.tensor(crop_box)).item()
                if iou > 0.65:  # positive sample
                    positive_file.write(f"positive/{ positive_counter}.jpg 1 {offset_x1} {offset_y1} {offset_x2} {offset_y2}\n")
                    face_resize.save(pos_save_dir + f"/{positive_counter}.jpg")
                    positive_counter += 1
                elif iou > 0.4:  # part sample
                    part_file.write(f"part/{ part_counter}.jpg 2 {offset_x1} {offset_y1} {offset_x2} {offset_y2}\n")
                    face_resize.save(part_save_dir + f"/{ part_counter}.jpg")
                    part_counter += 1
                # Crops with a lower IoU are discarded; negatives are drawn
                # separately below.

            # Negative samples: random squares anywhere in the image that
            # barely overlap the face box.
            # NOTE(review): only the first (shrunk) face box is checked, so
            # on multi-face lines a crop covering another face can still be
            # labelled negative - confirm against the annotation format.
            _boxes = torch.tensor(boxes[0])
            max_neg_side = int(min(img_w, img_h) / 2)
            if max_neg_side <= face_size:
                # Image too small to draw a crop of at least face_size.
                continue
            for _ in range(5):  # several tries, to get enough negatives
                side_len = np.random.randint(face_size, max_neg_side)
                x_ = np.random.randint(0, img_w - side_len)
                y_ = np.random.randint(0, img_h - side_len)
                crop_box = np.array([x_, y_, x_ + side_len, y_ + side_len])

                if IOU2(torch.tensor(crop_box), _boxes).item() < 0.1:
                    face_crop = img.crop(crop_box)
                    face_resize = face_crop.resize((face_size, face_size), Image.Resampling.LANCZOS)
                    negative_file.write(f"negative/{ negative_counter}.jpg 0 0 0 0 0\n")
                    face_resize.save(neg_save_dir + f"/{ negative_counter}.jpg")
                    negative_counter += 1
P网络训练代码实现:
import torch
import os
import sys
sys.path.append("./")
import torch.nn as nn
from models.PNet import PNet
from utils.trainer import Trainer
from utils.data import MTCNN_Dataset
from torch.utils.data import DataLoader, Dataset
# Checkpoint file handed to the trainer as its parameter path.
model_save_path = "./param/pnet.pt"

# Dataset rooted at the 12x12 sample directory, served in shuffled batches.
data = MTCNN_Dataset("./dataset/12")
data_loader = DataLoader(dataset=data, batch_size=100, shuffle=True)

# Train P-Net for 50 epochs.
net = PNet()
trainer = Trainer(
    net=net,
    param_path=model_save_path,
    data_loader=data_loader,
    epoch=50,
)
trainer.train()
训练器和工具类
import torch
import torch.nn as nn
import os
import sys
sys.path.append("./")
from utils.utils import accuracy
class Trainer():
    """Training loop for an MTCNN sub-network.

    The network must return ``(confidence, offset)``. Labels follow the
    convention 0 = negative, 1 = positive, 2 = part sample.

    Args:
        net: the network to train.
        param_path: checkpoint file; loaded at the start of ``train`` if it
            exists and overwritten after every epoch.
        data_loader: iterable yielding ``(img, cat, offset)`` batches.
        epoch: number of epochs to run.
    """

    def __init__(self,net, param_path, data_loader, epoch):
        self.net = net
        self.param_path = param_path
        self.data_loader = data_loader
        self.epoch = epoch
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.offsetLoss = nn.MSELoss(reduction="mean")
        # SGD with a small learning rate, used for fine-tuning.
        self.optim = torch.optim.SGD(self.net.parameters(),lr=1e-5,weight_decay=0.0001)
        self.scheduler = torch.optim.lr_scheduler.MultiStepLR(self.optim, milestones=[6, 14, 20], gamma=0.1)

    def _compute_losses(self, cat_out, offset_out, cat, offset):
        """Return ``(total, cls_loss, offset_loss)`` for one batch.

        The classification loss uses only negative and positive samples
        (label < 2); the offset loss uses only positive and part samples
        (label > 0). A loss whose sample set is empty contributes zero
        instead of NaN.
        """
        # Force column shapes so a label tensor of shape [N] cannot
        # broadcast against [N, 1] predictions.
        cat_out = cat_out.reshape(-1, 1)
        offset_out = offset_out.reshape(-1, 4)
        cat = cat.reshape(-1, 1)
        offset = offset.reshape(-1, 4)

        cls_mask = cat < 2
        if cls_mask.any():
            target = cat[cls_mask].to(cat_out.dtype)
            # Clamp so log() never sees exactly 0 or 1.
            pred = torch.clamp(cat_out[cls_mask], min=1e-7, max=1-1e-7)
            cls_loss = torch.mean(
                - target * torch.log(pred) - (1 - target) * torch.log(1 - pred)
            )
        else:
            # Zero that stays attached to the graph so backward() still works.
            cls_loss = cat_out.sum() * 0.0

        box_rows = (cat > 0).reshape(-1)
        if box_rows.any():
            offset_loss = self.offsetLoss(
                offset_out[box_rows], offset[box_rows].to(offset_out.dtype)
            )
        else:
            offset_loss = offset_out.sum() * 0.0

        return cls_loss + offset_loss, cls_loss, offset_loss

    def train(self):
        """Run the training loop, saving a checkpoint after every epoch."""
        if os.path.exists(self.param_path):
            # map_location makes a checkpoint saved on another device loadable.
            self.net.load_state_dict(torch.load(self.param_path, map_location=self.device))
        self.net.to(self.device)
        self.net.train()

        for e in range(self.epoch):
            for b_i, (img, cat, offset) in enumerate(self.data_loader):
                # .to() is a no-op when the tensor is already on the device.
                img = img.to(self.device)
                cat = cat.to(self.device)
                offset = offset.to(self.device)

                cat_out, offset_out = self.net(img)
                cat_out = cat_out.reshape(-1, 1)
                offset_out = offset_out.reshape(-1, 4)

                loss, cls_loss, offset_loss = self._compute_losses(
                    cat_out, offset_out, cat, offset
                )

                # Parameter update.
                self.optim.zero_grad()
                loss.backward()
                self.optim.step()

                if b_i%100 == 0:
                    acc = accuracy(class_out=cat_out, label=cat)
                    print('Train epoch %d, batch %d, total_loss: %f, cls_loss: %f, box_loss: %f,'
                          'accuracy:%.2f' % (e, b_i, loss.item(), cls_loss.item(), offset_loss.item(), acc*100))

            # Make sure the checkpoint directory exists before saving.
            os.makedirs(os.path.dirname(self.param_path) or ".", exist_ok=True)
            torch.save(self.net.state_dict(), self.param_path)
            # MultiStepLR milestones are expressed in epochs.
            self.scheduler.step()
import pickle
import shutil
import numpy as np
import random
import os
import cv2
from tqdm import tqdm
import torch
def IOU(box, boxes):
    """Intersection-over-union between one box and an array of boxes.

    Coordinates are treated as inclusive pixel indices, so every width and
    height gets ``+ 1``.

    Args:
        box: the reference box; its first four entries are read as
            left, top, right, bottom.
        boxes: array of shape [n, 4] (or wider) holding the boxes to
            compare against in the same corner order.

    Returns:
        Array of shape [n,] with the IoU of ``box`` against each row.
    """
    ref_area = (box[2] - box[0] + 1) * (box[3] - box[1] + 1)
    widths = boxes[:, 2] - boxes[:, 0] + 1
    heights = boxes[:, 3] - boxes[:, 1] + 1
    areas = widths * heights

    # Corners of the overlapping rectangle.
    left = np.maximum(box[0], boxes[:, 0])
    top = np.maximum(box[1], boxes[:, 1])
    right = np.minimum(box[2], boxes[:, 2])
    bottom = np.minimum(box[3], boxes[:, 3])

    # Disjoint boxes give a negative extent, which is floored at zero.
    inter_w = np.maximum(0, right - left + 1)
    inter_h = np.maximum(0, bottom - top + 1)
    inter = inter_w * inter_h

    union = ref_area + areas - inter
    # The small epsilon keeps the division defined.
    return inter / (union + 1e-10)
def IOU2(box, other_boxes):
    """Intersection-over-union between one box and N other boxes (torch).

    Args:
        box: tensor ``[x1, y1, x2, y2]`` (top-left and bottom-right corner);
            a 2-D tensor is squeezed first.
        other_boxes: tensor of shape [N, 4]; a single 1-D box is promoted
            to shape [1, 4].

    Returns:
        Tensor of shape [N] with the IoU of ``box`` against every row.
    """
    if len(box.shape)==2: box = box.squeeze()
    if len(other_boxes.shape)==1: other_boxes = other_boxes.unsqueeze(dim=0)

    box_area = (box[2]-box[0])*(box[3]-box[1])
    other_boxes_area = (other_boxes[:,2]-other_boxes[:,0]) * (other_boxes[:,3]-other_boxes[:,1])

    # Corners of the intersection rectangle.
    x1 = torch.max(box[0],other_boxes[:,0])
    y1 = torch.max(box[1],other_boxes[:,1])
    x2 = torch.min(box[2],other_boxes[:,2])
    y2 = torch.min(box[3],other_boxes[:,3])

    # Disjoint boxes give negative extents, so floor them at zero. clamp
    # works on whatever device the inputs live on, unlike comparing against
    # a freshly created CPU tensor.
    w = torch.clamp(x2-x1, min=0)
    h = torch.clamp(y2-y1, min=0)

    overlap_area = w*h
    # IoU = intersection / union
    iou = overlap_area / (box_area+other_boxes_area-overlap_area)
    return iou
# Classification accuracy during training.
def accuracy(class_out, label):
    """Return the fraction of negative/positive samples classified correctly.

    Only samples labelled 0 (negative) or 1 (positive) are scored; part
    samples (label 2) are skipped because a rounded confidence can never
    equal 2.

    Args:
        class_out: tensor of predicted confidences in [0, 1].
        label: tensor of labels with the same number of elements.

    Returns:
        Accuracy in [0, 1]; 0.0 when there is no negative/positive sample.
    """
    # Flatten both so [N, 1] predictions and [N] labels compare element-wise
    # instead of broadcasting to an N x N matrix.
    predicted = class_out.detach().cpu().numpy().reshape(-1).round()
    label = label.detach().cpu().numpy().reshape(-1)

    scored = label < 2
    total = np.sum(scored)
    if total == 0:
        return 0.0
    correct = np.sum(predicted[scored] == label[scored])
    return correct / total
def IOU3(box, other_boxes, isMin=False):
    """Overlap ratio between one box and N other boxes (torch).

    Args:
        box: tensor ``[x1, y1, x2, y2]`` (top-left and bottom-right corner);
            a 2-D tensor is squeezed first.
        other_boxes: tensor of shape [N, 4]; a single 1-D box is promoted
            to shape [1, 4].
        isMin: if True, divide the intersection by the smaller of the two
            areas instead of by the union.

    Returns:
        Tensor of shape [N] with the ratio of ``box`` against every row.
    """
    if len(box.shape)==2: box = box.squeeze()
    if len(other_boxes.shape)==1: other_boxes = other_boxes.unsqueeze(dim=0)

    box_area = (box[2]-box[0])*(box[3]-box[1])
    other_boxes_area = (other_boxes[:,2]-other_boxes[:,0]) * (other_boxes[:,3]-other_boxes[:,1])

    # Corners of the intersection rectangle.
    x1 = torch.max(box[0],other_boxes[:,0])
    y1 = torch.max(box[1],other_boxes[:,1])
    x2 = torch.min(box[2],other_boxes[:,2])
    y2 = torch.min(box[3],other_boxes[:,3])

    # Disjoint boxes give negative extents, so floor them at zero. clamp
    # works on whatever device the inputs live on, unlike comparing against
    # a freshly created CPU tensor.
    w = torch.clamp(x2-x1, min=0)
    h = torch.clamp(y2-y1, min=0)

    overlap_area = w*h
    if isMin:
        # ratio = intersection / smaller area
        iou = overlap_area / torch.min(box_area,other_boxes_area)
    else:
        # ratio = intersection / union
        iou = overlap_area / (box_area+other_boxes_area-overlap_area)
    return iou
def NMS(boxes, thre=0.5, isMin=False):
    """Non-maximum suppression.

    Repeatedly keeps the highest-confidence remaining box and drops every
    other box whose overlap with it (as computed by ``IOU3``) is not below
    ``thre``.

    Args:
        boxes: tensor whose rows are ``[confidence, x1, y1, x2, y2]``.
        thre: overlap threshold; a box survives a round only if its overlap
            with the reference box is strictly below this value.
        isMin: forwarded to ``IOU3``.

    Returns:
        Tensor of the kept rows stacked in descending confidence order, or
        an empty tensor when ``boxes`` is empty.
    """
    if len(boxes) == 0:
        return torch.tensor([])

    # Rows sorted by confidence, highest first.
    remaining = boxes[boxes[:, 0].argsort(descending=True)]

    kept = []
    while len(remaining) > 1:
        ref_box, candidates = remaining[0], remaining[1:]
        kept.append(ref_box)
        overlaps = IOU3(ref_box[1:], candidates[:, 1:], isMin=isMin)
        # Survivors keep their confidence order, so the next reference box
        # is again the first row.
        remaining = candidates[overlaps < thre]

    # A single leftover box has nothing to be compared with and is kept.
    if len(remaining) > 0:
        kept.append(remaining[0])
    return torch.stack(kept)
def convert2square(p_box):
    """Expand each box to a square that shares its centre.

    The square's side is the longer of the box's width and height.

    Args:
        p_box: NumPy array whose first four columns are ``x1, y1, x2, y2``;
            any further columns are copied unchanged.

    Returns:
        A new array of the same shape with the first four columns replaced
        by the square's corners, or an empty array when ``p_box`` has no
        rows.
    """
    if p_box.shape[0] == 0:
        return np.array([])

    square_box = p_box.copy()
    h = p_box[:, 3] - p_box[:, 1]
    w = p_box[:, 2] - p_box[:, 0]
    maxside = np.maximum(h, w)

    # Centre minus half the square side. The y origin must be centred with
    # the height, not the width.
    square_box[:, 0] = p_box[:, 0] + w*0.5 - maxside*0.5
    square_box[:, 1] = p_box[:, 1] + h*0.5 - maxside*0.5
    square_box[:, 2] = square_box[:, 0] + maxside
    square_box[:, 3] = square_box[:, 1] + maxside
    return square_box
本文代码仅供参考,具体代码可参照:Pytorch-MTCNN: 基于Pytorch实现的MTCNN模型,人脸检测,人脸关键点检测。 (gitee.com)