Neon指令应用---图像腐蚀和膨胀加速

2022-05-05  本文已影响0人  教训小磊

最近在项目开发的过程中需要使用到形态学中的腐蚀膨胀算法,以获得图像中的边缘信息。腐蚀和膨胀原始的C代码运行一帧320x180x1的图像需要8ms左右,经过neon指令加速后的运行时间只需0.5ms,差不多是16倍的加速。
ps:由于项目的需要,我这边使用的是5x1的卷积核,用来提取图像中的横条信息,所以下面的加速方法并不适合卷积核是3x3或5x5这种的情况。同时为了方便加速,去除了算法开头的一些判断,所以结果会有几个像素的差异。

1.腐蚀和膨胀原始C代码

#include "stdafx.h"
#include "cv.h"
#include "highgui.h"
#include "stdio.h"
#include "stdlib.h"
#include "vector"
// Erosion
// Horizontal binary erosion: dst[i][j] is the bitwise AND of up to strutWidth
// source pixels of row i, starting at column j and extending to the right.
//
//   src, dst      row-major 8-bit images holding width * height bytes each
//   width, height image dimensions in pixels (bytes)
//   strutWidth    number of horizontally adjacent pixels combined per output
//   structHeight  only determines the margin midY skipped at the borders;
//                 no vertical neighbourhood is combined
//
// The window is clipped at the end of the row, so it never pulls in pixels
// of the following row and never reads past the end of the buffer.
// Because each output depends only on source pixels at the same or a later
// index, src and dst may point to the same buffer.
// Returns without writing anything when a pointer is null or the image is
// smaller than the structuring element in either dimension.
void MorphErosion(unsigned char* src, unsigned char* dst, int width, int height, int strutWidth, int structHeight)
{
    if (!src || !dst) return;
    if (width < strutWidth || height < structHeight) return;

    int midY = (structHeight + 1) / 2 - 1;
    for (int i = midY; i < height - midY; i++)
    {
        const unsigned char* row = src + i * width;
        unsigned char* out = dst + i * width;
        // NOTE(review): the column margin reuses midY, exactly as the
        // original code did; it is 0 for a structuring element of height 1.
        for (int j = midY; j < width - midY; j++)
        {
            // Clip the window to the current row.
            int end = j + strutWidth;
            if (end > width) end = width;

            unsigned char val = 255;
            for (int n = j; n < end; n++)
            {
                val &= row[n];
            }
            out[j] = val;
        }
    }
}
// Dilation
// Horizontal binary dilation: dst[i][j] is the bitwise OR of up to strutWidth
// source pixels of row i, starting at column j and extending to the right.
//
//   src, dst      row-major 8-bit images holding width * height bytes each
//   width, height image dimensions in pixels (bytes)
//   strutWidth    number of horizontally adjacent pixels combined per output
//   structHeight  only determines the margin midY skipped at the borders;
//                 no vertical neighbourhood is combined
//
// The window is clipped at the end of the row, so it never pulls in pixels
// of the following row and never reads past the end of the buffer.
// Because each output depends only on source pixels at the same or a later
// index, src and dst may point to the same buffer.
// Returns without writing anything when a pointer is null or the image is
// smaller than the structuring element in either dimension.
void MorphDilition(unsigned char* src, unsigned char* dst, int width, int height, int strutWidth, int structHeight)
{
    if (!src || !dst) return;
    if (width < strutWidth || height < structHeight) return;

    int midY = (structHeight + 1) / 2 - 1;
    for (int i = midY; i < height - midY; i++)
    {
        const unsigned char* row = src + i * width;
        unsigned char* out = dst + i * width;
        // NOTE(review): the column margin reuses midY, exactly as the
        // original code did; it is 0 for a structuring element of height 1.
        for (int j = midY; j < width - midY; j++)
        {
            // Clip the window to the current row.
            int end = j + strutWidth;
            if (end > width) end = width;

            unsigned char val = 0;
            for (int n = j; n < end; n++)
            {
                val |= row[n];
            }
            out[j] = val;
        }
    }
}
// Morphological opening: erosion followed by dilation.
// src is eroded into tmp, then tmp is dilated in place, so the final result
// is left in tmp. This function itself never writes to src.
// All size arguments are forwarded unchanged to both passes.
void MorphOpen(unsigned char* src, unsigned char* tmp, int width, int height, int strutWidth, int structHeight)
{
    // Pass 1: src -> tmp.
    MorphErosion(src, tmp, width, height, strutWidth, structHeight);

    // Pass 2: the eroded image is both input and output of the dilation.
    unsigned char* eroded = tmp;
    MorphDilition(eroded, tmp, width, height, strutWidth, structHeight);
}
// Demo driver: loads an image, binarises it, applies a 5x1 morphological
// opening and shows the input and the result side by side.
// Exits with status 1 if the image cannot be loaded; returns 0 otherwise.
int main()
{
    int iRet = 0;
    IplImage * src, *dst;
    // Second argument 0: load as a single-channel grayscale image.
    src = cvLoadImage("./sad_pic/6_R.jpg", 0);
    if (src == NULL)
    {
        printf("open image failed\n");
        exit(1);
    }
    // Binarise in place: pixels above 180 become 255, all others 0.
    cvThreshold(src, src, 180, 255, CV_THRESH_BINARY);
    // dst starts as a copy of src so any pixel the filter does not write
    // keeps its thresholded value.
    dst = cvCloneImage(src);
    // widthStep (bytes per row) is passed as the width so the row stride
    // used by the filter matches the image buffer layout.
    MorphOpen((unsigned char*)src->imageData, (unsigned char*)dst->imageData, src->widthStep, src->height, 5, 1);
    cvShowImage("src", src);
    cvShowImage("dst", dst);
    cvWaitKey(0);
    cvReleaseImage(&src);
    cvReleaseImage(&dst);
    return iRet;
}

2.腐蚀和膨胀Neon指令加速代码

具体思路:去除了卷积核的概念,直接对每一行中水平相邻的5个像素做and(腐蚀)或or(膨胀)运算。代码中从依次偏移0~4个字节的地址各加载16个像素到5个寄存器p0~p4,再把它们逐字节合并,这样一次就能得到16个输出像素。

// Erosion
// Horizontal binary erosion with a fixed window of 5 pixels: dst[i][j] is the
// bitwise AND of src[i][j] .. src[i][j + 4], clipped at the end of the row.
//
//   src, dst      row-major 8-bit images holding width * height bytes each
//   width, height image dimensions in pixels (bytes)
//   strutWidth, structHeight
//                 used only for the size check; the window is always 5x1
//
// With NEON available, 16 outputs are produced per iteration from five loads
// at byte offsets 0..4. Only blocks whose loads stay inside the current row
// are vectorised; the remaining columns are handled by a scalar loop, so no
// pixel of the next row is mixed in and nothing is read past the buffer.
// Every column is written, including those beyond the last multiple of 16.
// Each output depends only on source pixels at the same or a later index, and
// blocks are processed left to right, so src and dst may be the same buffer.
void MorphErosion(unsigned char* src, unsigned char* dst, int width, int height, int strutWidth, int structHeight)
{
    enum { TAPS = 5, LANES = 16 };

    if (!src || !dst) return;
    if (width < strutWidth || height < structHeight) return;

    for (int i = 0; i < height; i++)
    {
        const unsigned char* in = src + i * width;
        unsigned char* out = dst + i * width;
        int x = 0;

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
        // The widest load covers in[x + TAPS - 1 .. x + TAPS - 1 + LANES - 1].
        for (; x + LANES + TAPS - 1 <= width; x += LANES)
        {
            uint8x16_t acc = vld1q_u8(in + x);
            for (int t = 1; t < TAPS; t++)
            {
                acc = vandq_u8(acc, vld1q_u8(in + x + t));
            }
            vst1q_u8(out + x, acc);
        }
#endif

        // Scalar tail (and full fallback without NEON), window clipped to the row.
        for (; x < width; x++)
        {
            unsigned char val = 255;
            for (int t = 0; t < TAPS && x + t < width; t++)
            {
                val &= in[x + t];
            }
            out[x] = val;
        }
    }
}
// Dilation
// Horizontal binary dilation with a fixed window of 5 pixels: dst[i][j] is the
// bitwise OR of src[i][j] .. src[i][j + 4], clipped at the end of the row.
//
//   src, dst      row-major 8-bit images holding width * height bytes each
//   width, height image dimensions in pixels (bytes)
//   strutWidth, structHeight
//                 used only for the size check; the window is always 5x1
//
// With NEON available, 16 outputs are produced per iteration from five loads
// at byte offsets 0..4. Only blocks whose loads stay inside the current row
// are vectorised; the remaining columns are handled by a scalar loop, so no
// pixel of the next row is mixed in and nothing is read past the buffer.
// Every column is written, including those beyond the last multiple of 16.
// Each output depends only on source pixels at the same or a later index, and
// blocks are processed left to right, so src and dst may be the same buffer.
void MorphDilition(unsigned char* src, unsigned char* dst, int width, int height, int strutWidth, int structHeight)
{
    enum { TAPS = 5, LANES = 16 };

    if (!src || !dst) return;
    if (width < strutWidth || height < structHeight) return;

    for (int i = 0; i < height; i++)
    {
        const unsigned char* in = src + i * width;
        unsigned char* out = dst + i * width;
        int x = 0;

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
        // The widest load covers in[x + TAPS - 1 .. x + TAPS - 1 + LANES - 1].
        for (; x + LANES + TAPS - 1 <= width; x += LANES)
        {
            uint8x16_t acc = vld1q_u8(in + x);
            for (int t = 1; t < TAPS; t++)
            {
                acc = vorrq_u8(acc, vld1q_u8(in + x + t));
            }
            vst1q_u8(out + x, acc);
        }
#endif

        // Scalar tail (and full fallback without NEON), window clipped to the row.
        for (; x < width; x++)
        {
            unsigned char val = 0;
            for (int t = 0; t < TAPS && x + t < width; t++)
            {
                val |= in[x + t];
            }
            out[x] = val;
        }
    }
}
效果图
上一篇 下一篇

猜你喜欢

热点阅读