void add_ref( const float* pSrc1, int src1Step, const float* pSrc2, int src2Step, float* pDst, int dstStep, int width, int height) { int h, i; for( h = 0; h < height; h++ ) { for( i = 0; i < width; i++ ) { pDst[i] = pSrc1[i] + pSrc2[i]; } pSrc1 = pSrc1 + src1Step; pSrc2 = pSrc2 + src2Step; pDst = pDst + dstStep; } } void add_avx512( const float* pSrc1, int src1Step, const float* pSrc2, int src2Step, float* pDst, int dstStep, int width, int height) { int h, i; for( h = 0; h<height; h++ ) { i = 0; __m512 zmm0, zmm1; for(i=i; i < (width&(~15)); i+=16 ) { zmm0 = _mm512_loadu_ps(pSrc1+i ); zmm1 = _mm512_loadu_ps(pSrc2+i ); zmm0 = _mm512_add_ps(zmm0, zmm1); _mm512_storeu_ps (pDst+i, zmm0 ); } for(i=i; i < width; i++ ) { pDst[i] = pSrc1[i] + pSrc2[i]; } pSrc1 = pSrc1 + src1Step; pSrc2 = pSrc2 + src2Step; pDst = pDst + dstStep; } } __mmask16 len2mask[] = { 0x0000, 0x0001, 0x0003, 0x0007, 0x000F, 0x001F, 0x003F, 0x007F, 0x00FF, 0x01FF, 0x03FF, 0x07FF, 0x0FFF, 0x1FFF, 0x3FFF, 0x7FFF, 0xFFFF }; void add_avx512_2( float* pSrc1, int src1Step, const float* pSrc2, int src2Step, float* pDst, int dstStep, int width, int height) { int h, i; for( h = 0; h < height; h++ ) { i = 0; __m512 zmm0, zmm1; __mmask16 msk; for(i=i; i < (width&(~15)); i+=16 ) { zmm0 = _mm512_loadu_ps(pSrc1+i ); zmm1 = _mm512_loadu_ps(pSrc2+i ); zmm0 = _mm512_add_ps (zmm0, zmm1); _mm512_storeu_ps (pDst+i, zmm0 ); } msk = len2mask[width - i]; if(msk){ zmm0 = _mm512_mask_loadu_ps(_mm512_setzero_ps(),msk,pSrc1+i ); zmm1 = _mm512_mask_loadu_ps(_mm512_setzero_ps(),msk,pSrc2+i ); zmm0 = _mm512_add_ps (zmm0, zmm1); _mm512_mask_storeu_ps (pDst+i, msk, zmm0 ); } pSrc1 = pSrc1 + src1Step; pSrc2 = pSrc2 + src2Step; pDst = pDst + dstStep; } } void add_avx512_3(float* pSrc1, int src1Step, const float* pSrc2, int src2Step, float* pDst, int dstStep, int width, int height) { int h, i, t; for( h = 0; h < height; h++ ) { i = 0; __m512 zmm0, zmm1; __mmask16 msk; t = ((((int)pDst) & (63)) >> 2); msk = len2mask[t]; if(msk){ zmm0 = 
_mm512_mask_loadu_ps(_mm512_setzero_ps(),msk,pSrc1+i ); zmm1 = _mm512_mask_loadu_ps(_mm512_setzero_ps(),msk,pSrc2+i ); zmm0 = _mm512_add_ps (zmm0, zmm1); _mm512_mask_storeu_ps (pDst+i, msk, zmm0 ); } i += t; for(i=i; i < (width&(~15)); i+=16 ) { zmm0 = _mm512_loadu_ps(pSrc1+i ); zmm1 = _mm512_loadu_ps(pSrc2+i ); zmm0 = _mm512_add_ps (zmm0, zmm1); _mm512_storeu_ps (pDst+i, zmm0 ); } msk = len2mask[width - i]; if(msk){ zmm0 = _mm512_mask_loadu_ps(_mm512_setzero_ps(),msk,pSrc1+i ); zmm1 = _mm512_mask_loadu_ps(_mm512_setzero_ps(),msk,pSrc2+i ); zmm0 = _mm512_add_ps (zmm0, zmm1); _mm512_mask_storeu_ps (pDst+i, msk, zmm0 ); } pSrc1 = pSrc1 + src1Step; pSrc2 = pSrc2 + src2Step; pDst = pDst + dstStep; } } <source> </spoiler> add . , , . . . . 1.5 <spoiler title=" 1.5"> <source lang="C++"> #define min(a,b) ((a)<(b)?(a):(b)) void add_avx512_4( const float* pSrc1, int src1Step, const float* pSrc2, int src2Step, float* pDst, int dstStep, int width, int height) { int h, i; for( h = 0; h < height; h++ ) { i = 0; __m512 zmm0, zmm1; __mmask16 msk; for(i=i; i < width; i+=16 ) { msk = len2mask[min(16, width - i)]; zmm0 = _mm512_mask_loadu_ps(_mm512_setzero_ps(),msk,pSrc1+i ); zmm1 = _mm512_mask_loadu_ps(_mm512_setzero_ps(),msk,pSrc2+i ); zmm0 = _mm512_add_ps (zmm0, zmm1); /*THE LONG PIPELINE HERE*/ _mm512_mask_storeu_ps (pDst+i, msk, zmm0 ); } pSrc1 = pSrc1 + src1Step; pSrc2 = pSrc2 + src2Step; pDst = pDst + dstStep; } } void morph_3x3_ref( const float* pSrc1, int src1Step, char* pMask, float* pDst, int dstStep, int width, int height) { int h, i; for( h = 0; h < height; h++ ) { for(i=0; i < width; i++ ) { int x, y; char* pm = pMask; /*we assume that pMask is not zero total*/ float m = 3.402823466e+38f; float val = 0.0; for (y = 0; y < 3;y++){ for (x = 0; x < 3; x++){ if (*pm){ val = pSrc1[i + src1Step*y + x]; if (val < m) { m = val; } } pm++; } } pDst[i] = m; } pSrc1 = pSrc1 + src1Step; pDst = pDst + dstStep; } } void morph_3x3_avx512( const float* pSrc1, int src1Step, char* pMask, 
float* pDst, int dstStep, int width, int height) { int h, i; __mmask msk[9], tail_msk; for ( i = 0; i < 9; i++){ msk[i] = (!pMask[i]) ? 0 : 0xFFFF;//create load mask } tail_msk = len2mask[width & 15]; //tail mask for( h = 0; h < height; h++ ) { __m512 zmm0, zmmM; int x, y; i = 0; for(i=i; i < (width&(~15)); i+=16 ) { zmmM = _mm512_set1_ps(3.402823466e+38f); for (y = 0; y < 3;y++){ for (x = 0; x < 3; x++){ zmm0 = _mm512_mask_loadu_ps(zmmM, msk[3*y+x], &pSrc1[i + src1Step*y + x]); zmmM = _mm512_min_ps(zmm0, zmmM); } } _mm512_storeu_ps (pDst+i, zmmM ); } if(tail_msk) { zmmM = _mm512_set1_ps(3.402823466e+38f); for (y = 0; y < 3;y++){ for (x = 0; x < 3; x++){ zmm0=_mm512_mask_loadu_ps(zmmM,msk[3*y+x]&tail_msk,&pSrc1[i+src1Step*y+x]); zmmM=_mm512_min_ps(zmm0, zmmM); } } _mm512_mask_storeu_ps (pDst+i, tail_msk, zmmM ); } pSrc1 = pSrc1 + src1Step; pDst = pDst + dstStep; } } 
void rgb_ref( const float* pSrc1, int src1Step, float* pDst, int dstStep, int width, int height) { int h, i; for( h = 0; h < height; h++ ) { for(i=0; i < width; i++ ) { pDst[3*i+0]=(0.412f*pSrc1[3*i]+0.357f*pSrc1[3*i+1]+0.180f*pSrc1[3*i+2]); pDst[3*i+1]=(0.212f*pSrc1[3*i]+0.715f*pSrc1[3*i+1]+0.072f*pSrc1[3*i+2]); pDst[3*i+2]=(0.019f*pSrc1[3*i]+0.119f*pSrc1[3*i+1]+0.950f*pSrc1[3*i+2]); if (pDst[3 * i + 2] < 0.0){ pDst[3 * i + 2] = 0.0; } if (pDst[3 * i + 2] > 1.0){ pDst[3 * i + 2] = 1.0; } } pSrc1 = pSrc1 + src1Step; pDst = pDst + dstStep; } } void rgb_avx512( const float* pSrc1, int src1Step, float* pDst, int dstStep, int width, int height) { int h, i; for( h = 0; h < height; h++ ) { __m512 zmm0, zmm1, zmm2, zmm3; i = 0; for(i=i; i < (width&(~3)); i+=4 ) { zmm0 = _mm512_mask_expandloadu_ps(_mm512_setzero_ps(), 0x7777, &pSrc1[3*i ]); zmm1 = _mm512_mul_ps(_mm512_set4_ps(0.0f, 0.019f, 0.212f, 0.412f), _mm512_shuffle_ps(zmm0, zmm0, 0x00)); zmm2 = _mm512_mul_ps(_mm512_set4_ps(0.0f, 0.119f, 0.715f, 0.357f), _mm512_shuffle_ps(zmm0, zmm0, 0x55)); zmm3 = _mm512_mul_ps(_mm512_set4_ps(0.0f, 0.950f, 0.072f, 0.180f), _mm512_shuffle_ps(zmm0, zmm0, 0xAA)); zmm0 = _mm512_add_ps(zmm1, zmm2); zmm0 = _mm512_add_ps(zmm0, zmm3); zmm0 = _mm512_mask_max_ps(zmm0, 0x4444,_mm512_set1_ps(0.0f), zmm0); zmm0 = _mm512_mask_min_ps(zmm0, 0x4444,_mm512_set1_ps(1.0f), zmm0); _mm512_mask_compressstoreu_ps (pDst+3*i, 0x7777, zmm0 ); } pSrc1 = pSrc1 + src1Step; pDst = pDst + dstStep; } } #define ABS(A) (A)>=0.0f?(A):(-(A)) void avg_ref( float* pSrc1, int src1Step, float* pDst, int dstStep, int width, int height) { int h, i; for( h = 0; h < height; h++ ) { for(i=0; i < width; i++ ) { float dv, dh; float valU = pSrc1[src1Step*(h - 1) + i ]; float valD = pSrc1[src1Step*(h + 1) + i ]; float valL = pSrc1[src1Step*( h ) + (i-1)]; float valR = pSrc1[src1Step*( h ) + (i+1)]; dv = ABS(valU - valD); dh = ABS(valL - valR); if(dv<=dh){ pDst[i] = (valU + valD) * 0.5f; //A branch } else { pDst[i] = (valL + valR) * 
0.5f; //B branch } } pDst = pDst + dstStep; } } void avg_avx512( const float* pSrc1, int src1Step, float* pDst, int dstStep, int width, int height) { int h, i; for( h = 0; h < height; h++ ) { i = 0; for(i=i; i < (width&(~15)); i+=16 ) { __m512 zvU, zvD, zvL, zvR; __m512 zdV, zdH; __m512 zavgV, zavgH, zavg; __mmask16 mskV; zvU = _mm512_loadu_ps(pSrc1+src1Step*(h - 1) + i ); zvD = _mm512_loadu_ps(pSrc1+src1Step*(h + 1) + i ); zvL = _mm512_loadu_ps(pSrc1+src1Step*( h ) + (i-1)); zvR = _mm512_loadu_ps(pSrc1+src1Step*( h ) + (i+1)); zdV = _mm512_sub_ps(zvU, zvD); zdH = _mm512_sub_ps(zvL, zvR); zdV = _mm512_abs_ps(zdV); zdH = _mm512_abs_ps(zdH); mskV = _mm512_cmp_ps_mask(zdV, zdH, _CMP_LE_OS); zavgV = _mm512_mul_ps(_mm512_set1_ps(0.5f), _mm512_add_ps(zvU, zvD)); zavgH = _mm512_mul_ps(_mm512_set1_ps(0.5f), _mm512_add_ps(zvL, zvR)); zavg = _mm512_mask_or_ps(zavgH, mskV, zavgV, zavgV); _mm512_storeu_ps(pDst + i, zavg); } //remainder skipped pDst = pDst + dstStep; } } void avg_ref( float* restrict pSrc1, int src1Step, float* restrict pDst, int dstStep, int width, int height) void avg_ref( float* restrict pSrc1, int src1Step, float* restrict pDst, int dstStep, int width, int height) { int h, i; for( h = 0; h < height; h++ ) { int t = 1; for(i=0; i < width; i++ ) { float dv, dh; float valU = pSrc1[src1Step*(h - 1) + i ]; float valD = pSrc1[src1Step*(h + 1) + i ]; float valL = pSrc1[src1Step*( h ) + (i-1)]; float valR = pSrc1[src1Step*( h ) + (i+1)]; dv = ABS(valU - valD); dh = ABS(valL - valR); if(dv<dh){ pDst[i] = (valU + valD) * 0.5f; t = 1; } else if (dv>dh){ pDst[i] = (valL + valR) * 0.5f; t = 0; } else if (t == 1) { pDst[i] = (valU + valD) * 0.5f; } else { pDst[i] = (valL + valR) * 0.5f; } } pDst = pDst + dstStep; } } 
/*
 * This part of the listing holds three further revisions of avg_avx512, all
 * under the same name. They implement the tie rule "reuse the direction of
 * the previous pixel in the row" and differ only in how that carry is
 * propagated through the lane mask. Because the name is shared, at most one
 * may be compiled: define AVG_AVX512_REVISION to 2, 3 or 4 to pick one.
 * With the macro undefined none of them is built, so this section does not
 * clash with another avg_avx512 definition in the file.
 *
 * Common contract: pSrc1 is never advanced and is read at rows h-1, h+1 and
 * columns i-1, i+1, so it must point inside an image with valid neighbours
 * on every side; src1Step/dstStep are row distances in floats; columns
 * beyond the last whole vector of a row are not written.
 */

#if defined(AVG_AVX512_REVISION) && AVG_AVX512_REVISION == 2
/*
 * Revision 2: the carry is propagated bit by bit in scalar mask arithmetic.
 * Bit k of mskD means "lane k uses the vertical average". It is set when
 * dV < dH, or when dV == dH and bit k-1 is set; for lane 0 the predecessor
 * is bit 15 of the previous vector (all ones at the start of a row).
 */
void avg_avx512(const float* pSrc1, int src1Step, float* pDst, int dstStep,
                int width, int height)
{
    int h, i;

    for (h = 0; h < height; h++) {
        __mmask16 mskD = 0xFFFF;

        for (i = 0; i < (width & ~15); i += 16) {
            __m512 zvU, zvD, zvL, zvR;
            __m512 zdV, zdH;
            __m512 zavgV, zavgH, zavg;
            __mmask16 mskV, mskE;
            int bit;

            zvU = _mm512_loadu_ps(pSrc1 + src1Step*(h - 1) + i);
            zvD = _mm512_loadu_ps(pSrc1 + src1Step*(h + 1) + i);
            zvL = _mm512_loadu_ps(pSrc1 + src1Step*h + (i - 1));
            zvR = _mm512_loadu_ps(pSrc1 + src1Step*h + (i + 1));

            zdV = _mm512_abs_ps(_mm512_sub_ps(zvU, zvD));
            zdH = _mm512_abs_ps(_mm512_sub_ps(zvL, zvR));
            mskV = _mm512_cmp_ps_mask(zdV, zdH, _CMP_LT_OS);
            mskE = _mm512_cmp_ps_mask(zdV, zdH, _CMP_EQ_OS);

            /* bit 0 inherits from bit 15 of the previous vector ... */
            mskD = (__mmask16)(mskV | (mskE & (mskD >> 15) & (1 << 0)));
            /* ... every other bit from its lower neighbour, in order */
            for (bit = 1; bit < 16; bit++) {
                mskD = (__mmask16)(mskD | (mskE & (mskD << 1) & (1 << bit)));
            }

            zavgV = _mm512_mul_ps(_mm512_set1_ps(0.5f), _mm512_add_ps(zvU, zvD));
            zavgH = _mm512_mul_ps(_mm512_set1_ps(0.5f), _mm512_add_ps(zvL, zvR));
            zavg = _mm512_mask_blend_ps(mskD, zavgH, zavgV);
            _mm512_storeu_ps(pDst + i, zavg);
        }
        pDst += dstStep;
    }
}
#endif /* AVG_AVX512_REVISION == 2 */

/*
 * Carry look-up table filled by init_table_mask().
 * table[256*256*carry + 256*mskE + mskV] is the resolved 8-lane decision
 * mask for incoming carry (0 or 1), 8-bit "equal" mask mskE and 8-bit
 * "less than" mask mskV.
 */
unsigned char table[2*256 * 256];

/*
 * Fills `table` for every (carry, mskE, mskV) combination with the same
 * bit-by-bit propagation rule: a bit is set when it is set in mskV, or when
 * it is set in mskE and its lower neighbour (the carry for bit 0) is set.
 * Has to run before any table-based avg_avx512 revision is called.
 * Always returns 0.
 */
int init_table_mask(void)
{
    int mskV, mskE, mskD;

    for (mskD = 0; mskD < 2; mskD++) {
        for (mskV = 0; mskV < 256; mskV++) {
            for (mskE = 0; mskE < 256; mskE++) {
                int bit;
                int msk = mskV | (mskE & mskD & (1 << 0));

                for (bit = 1; bit < 8; bit++) {
                    msk = msk | (mskE & (msk << 1) & (1 << bit));
                }
                table[256*256*mskD + 256*mskE + mskV] = (unsigned char)msk;
            }
        }
    }
    return 0;
}

#if defined(AVG_AVX512_REVISION) && AVG_AVX512_REVISION == 3
/*
 * Revision 3: 8 outputs per iteration (only the low 8 lanes are loaded and
 * stored); the carry is resolved with one look-up in `table`. Columns from
 * (width & ~7) onward are not written. Requires init_table_mask().
 */
void avg_avx512(const float* pSrc1, int src1Step, float* pDst, int dstStep,
                int width, int height)
{
    int h, i;

    for (h = 0; h < height; h++) {
        __mmask16 mskD = 0x00FF; /* bit 7 set: a row starts with "vertical" */

        for (i = 0; i < (width & ~7); i += 8) {
            __m512 Z = _mm512_setzero_ps();
            __m512 zvU, zvD, zvL, zvR;
            __m512 zdV, zdH;
            __m512 zavgV, zavgH, zavg;
            __mmask16 mskV, mskE;

            zvU = _mm512_mask_loadu_ps(Z, 0xFF, pSrc1 + src1Step*(h - 1) + i);
            zvD = _mm512_mask_loadu_ps(Z, 0xFF, pSrc1 + src1Step*(h + 1) + i);
            zvL = _mm512_mask_loadu_ps(Z, 0xFF, pSrc1 + src1Step*h + (i - 1));
            zvR = _mm512_mask_loadu_ps(Z, 0xFF, pSrc1 + src1Step*h + (i + 1));

            zdV = _mm512_abs_ps(_mm512_sub_ps(zvU, zvD));
            zdH = _mm512_abs_ps(_mm512_sub_ps(zvL, zvR));
            mskV = _mm512_cmp_ps_mask(zdV, zdH, _CMP_LT_OS) & 0xFF;
            mskE = _mm512_cmp_ps_mask(zdV, zdH, _CMP_EQ_OS) & 0xFF;

            /* bit 7 of the previous result is the carry into lane 0 */
            mskD = table[256*256*(mskD >> 7) + 256*mskE + mskV];

            zavgV = _mm512_mul_ps(_mm512_set1_ps(0.5f), _mm512_add_ps(zvU, zvD));
            zavgH = _mm512_mul_ps(_mm512_set1_ps(0.5f), _mm512_add_ps(zvL, zvR));
            zavg = _mm512_mask_blend_ps(mskD, zavgH, zavgV);
            _mm512_mask_storeu_ps(pDst + i, 0xFF, zavg);
        }
        pDst += dstStep;
    }
}
#endif /* AVG_AVX512_REVISION == 3 */

#if defined(AVG_AVX512_REVISION) && AVG_AVX512_REVISION == 4
/*
 * Revision 4: 16 outputs per iteration; the carry is resolved with two
 * chained look-ups in `table`, first for lanes 0..7 and then, seeded by
 * bit 7 of that result, for lanes 8..15. Requires init_table_mask().
 */
void avg_avx512(const float* pSrc1, int src1Step, float* pDst, int dstStep,
                int width, int height)
{
    int h, i;

    for (h = 0; h < height; h++) {
        __mmask16 mskD = 0xFFFF; /* bit 15 set: a row starts with "vertical" */

        for (i = 0; i < (width & ~15); i += 16) {
            __m512 zvU, zvD, zvL, zvR;
            __m512 zdV, zdH;
            __m512 zavgV, zavgH, zavg;
            __mmask16 mskV, mskE;
            unsigned short mskD_0_7 = 0, mskD_8_15 = 0;

            zvU = _mm512_loadu_ps(pSrc1 + src1Step*(h - 1) + i);
            zvD = _mm512_loadu_ps(pSrc1 + src1Step*(h + 1) + i);
            zvL = _mm512_loadu_ps(pSrc1 + src1Step*h + (i - 1));
            zvR = _mm512_loadu_ps(pSrc1 + src1Step*h + (i + 1));

            zdV = _mm512_abs_ps(_mm512_sub_ps(zvU, zvD));
            zdH = _mm512_abs_ps(_mm512_sub_ps(zvL, zvR));
            mskV = _mm512_cmp_ps_mask(zdV, zdH, _CMP_LT_OS);
            mskE = _mm512_cmp_ps_mask(zdV, zdH, _CMP_EQ_OS);

            mskD_0_7 = table[256*256*(((unsigned short)mskD) >> 15)
                             + 256*(((unsigned short)mskE) & 0xFF)
                             + (((unsigned short)mskV) & 0xFF)];
            mskD_8_15 = table[256*256*(mskD_0_7 >> 7)
                              + 256*(((unsigned short)mskE) >> 8)
                              + (((unsigned short)mskV) >> 8)];
            mskD = (__mmask16)((mskD_8_15 << 8) | mskD_0_7);

            zavgV = _mm512_mul_ps(_mm512_set1_ps(0.5f), _mm512_add_ps(zvU, zvD));
            zavgH = _mm512_mul_ps(_mm512_set1_ps(0.5f), _mm512_add_ps(zvL, zvR));
            zavg = _mm512_mask_blend_ps(mskD, zavgH, zavgV);
            _mm512_storeu_ps(pDst + i, zavg);
        }
        pDst += dstStep;
    }
}
#endif /* AVG_AVX512_REVISION == 4 */

/* Source: https://habr.com/ru/post/266055/ */
/* All Articles  -- NOTE(review): appears to be page text captured along with the listing; not C code */