// Vertex layout that is ready to be uploaded into a vertex buffer with glBufferSubData:
//   struct PN { Math::Vec3f p; Math::Vec3f n; };
// Transforms a position and a normal by one matrix, without any weighting.
//   mat   - transform; rows 0..2 are applied to both inputs, row 3 is added
//           to the position only (the normal receives no translation).
//   inV   - input position.
//   inN   - input normal.
//   outPN - receives the transformed position (p) and normal (n).
// Components are written in the order p[0], n[0], p[1], n[1], p[2], n[2] and
// the inputs are re-read for every component.
forceinline void transformPointNormal4x3Weight_NoW(const Matrix44f& mat,const Vec3f& inV, const Vec3f& inN, BaseRenderScene::PN& outPN)
{
    for (int col = 0; col < 3; ++col)
    {
        outPN.p.vec[col] = (inV.vec[0]*mat.mat[0][col] + inV.vec[1]*mat.mat[1][col] + inV.vec[2]*mat.mat[2][col] + mat.mat[3][col]);
        outPN.n.vec[col] = (inN.vec[0]*mat.mat[0][col] + inN.vec[1]*mat.mat[1][col] + inN.vec[2]*mat.mat[2][col]);
    }
}

// Same transform as transformPointNormal4x3Weight_NoW, but every resulting
// component is multiplied by the weight `w` before being stored. The previous
// contents of outPN are overwritten.
forceinline void transformPointNormal4x3Weight(const Matrix44f& mat,const Vec3f& inV, const Vec3f& inN, BaseRenderScene::PN& outPN,float w )
{
    for (int col = 0; col < 3; ++col)
    {
        outPN.p.vec[col] = (inV.vec[0]*mat.mat[0][col] + inV.vec[1]*mat.mat[1][col] + inV.vec[2]*mat.mat[2][col] + mat.mat[3][col])*w;
        outPN.n.vec[col] = (inN.vec[0]*mat.mat[0][col] + inN.vec[1]*mat.mat[1][col] + inN.vec[2]*mat.mat[2][col])*w;
    }
}

// Same weighted transform as transformPointNormal4x3Weight, but the result is
// added to whatever outPN already holds instead of replacing it.
forceinline void transformPointNormal4x3AddWeighted(const Matrix44f& mat,const Vec3f& inV, const Vec3f& inN, BaseRenderScene::PN& outPN,float w )
{
    for (int col = 0; col < 3; ++col)
    {
        outPN.p.vec[col] += (inV.vec[0]*mat.mat[0][col] + inV.vec[1]*mat.mat[1][col] + inV.vec[2]*mat.mat[2][col] + mat.mat[3][col])*w;
        outPN.n.vec[col] += (inN.vec[0]*mat.mat[0][col] + inN.vec[1]*mat.mat[1][col] + inN.vec[2]*mat.mat[2][col])*w;
    }
}
// Skins vertex `v`: transforms its position and normal by the bone matrices
// that influence it and stores the result in skinTempPN[v].
// `wCount` is the number of bone influences to apply.
const Vec3f& vx = pVerticies[v];
const Vec3f& vxN = pNormals[v];
float w = pVertexWeight[v].vec[0];
int boneIndex = pVertexBones[v].vec[0];
const Matrix44f& boneTM = pBoneTMList[boneIndex];
if( wCount==1 )
{
    // Single influence: the bone is applied as-is, the stored weight is not used.
    transformPointNormal4x3Weight_NoW(boneTM,vx,vxN,skinTempPN[v]);
}
else
{
    // NOTE(review): the *_N helpers called below are not defined in the visible
    // part of this file (the visible ones are transformPointNormal4x3Weight and
    // transformPointNormal4x3AddWeighted) - confirm which variants are intended.

    // First influence overwrites the output, so no prior clearing is needed.
    transformPointNormal4x3Weight_N(boneTM,vx,vxN,skinTempPN[v],w);
    for(size_t i=1;i<wCount;i++)
    {
        // Remaining influences are accumulated on top of the first one.
        w = pVertexWeight[v].vec[i];
        boneIndex = pVertexBones[v].vec[i];
        // Distinct name so the first influence's boneTM is not shadowed.
        const Matrix44f& influenceTM = pBoneTMList[boneIndex];
        transformPointNormal4x3AddWeighted_N(influenceTM,vx,vxN,skinTempPN[v],w);
    }
}
// NEON (32-bit ARM, GCC-style inline assembly) implementation of the skinning
// kernels. The section is compiled only when USE_NEON is defined, which happens
// automatically when the compiler defines __ARM_NEON__.
#if defined(__ARM_NEON__)
#define USE_NEON
#endif

#if defined(USE_NEON)

#ifdef __thumb__
#error "This file should be compiled in ARM mode only."
// Note in Xcode, right click file, Get Info->Build, Other compiler flags = "-marm"
#endif

// ---- Register assignment used by every kernel below ------------------------
// Output position accumulator and its first three scalar lanes.
#define OP "q0"
#define OPS0 "s0"
#define OPS1 "s1"
#define OPS2 "s2"
// Output normal accumulator and its first three scalar lanes.
#define ON "q1"
#define ONS0 "s4"
#define ONS1 "s5"
#define ONS2 "s6"
// Input position (q2) and input normal (q3), plus their individual lanes.
#define IP "q2"
#define IN "q3"
#define IPX "d4[0]"
#define IPY "d4[1]"
#define IPZ "d5[0]"
#define IPW "d5[1]"
#define INX "d6[0]"
#define INY "d6[1]"
#define INZ "d7[0]"
#define INW "d7[1]"
// Weights (q4), one lane per bone influence.
#define WQ "q4"
#define W0D "d8[0]"
#define W1D "d8[1]"
#define W2D "d9[0]"
#define W3D "d9[1]"
// Registers that receive the 64 bytes of the current matrix. These unquoted
// names are not referenced by the macros below, which spell q8-q11 directly.
#define QM0 q8
#define QM1 q9
#define QM2 q10
#define QM3 q11
// Scratch register for the unweighted per-bone result.
#define QT "q14"

// outP = mt.row0*pos.x + mt.row1*pos.y + mt.row2*pos.z + mt.row3*pos.w
// NOTE(review): the translation row is scaled by pos.w, so the caller
// presumably stores 1.0 in the fourth position lane - confirm.
#define mat_pos(_RES) \
    "vmul.f32 " _RES ", q8, " IPX "\n\t" \
    "vmla.f32 " _RES ", q9, " IPY "\n\t" \
    "vmla.f32 " _RES ", q10, " IPZ "\n\t" \
    "vmla.f32 " _RES ", q11, " IPW "\n\t"

// _RES = (matrix * pos) * weight; the unweighted product is left in _QT.
#define mat_pos_w_set(_RES,_QT,_WD) \
    mat_pos(_QT) \
    "vmul.f32 " _RES ", " _QT ", " _WD "\n\t"

// _RES += (matrix * pos) * weight; the unweighted product is left in _QT.
#define mat_pos_w_add(_RES,_QT,_WD) \
    mat_pos(_QT) \
    "vmla.f32 " _RES ", " _QT ", " _WD "\n\t"

// outN = mt.row0*nor.x + mt.row1*nor.y + mt.row2*nor.z   (no translation row)
#define mat_nor(_RES) \
    "vmul.f32 " _RES ", q8, " INX "\n\t" \
    "vmla.f32 " _RES ", q9, " INY "\n\t" \
    "vmla.f32 " _RES ", q10, " INZ "\n\t"

// _RES = (matrix * nor) * weight; the unweighted product is left in _QT.
#define mat_nor_w_set(_RES,_QT,_WD) \
    mat_nor(_QT) \
    "vmul.f32 " _RES ", " _QT ", " _WD "\n\t"

// _RES += (matrix * nor) * weight; the unweighted product is left in _QT.
#define mat_nor_w_add(_RES,_QT,_WD) \
    mat_nor(_QT) \
    "vmla.f32 " _RES ", " _QT ", " _WD "\n\t"

// Stores three position floats followed by three normal floats (24 bytes) at
// the address held in register operand _R.
// A space separates each string literal from the following macro name: since
// C++11 a literal immediately followed by an identifier is parsed as a
// user-defined-literal suffix instead of expanding the macro.
#define STORE3_P3N3(_R) \
    "fsts " OPS0 ",[" _R "] \n\t" \
    "fsts " OPS1 ",[" _R ",#4] \n\t" \
    "fsts " OPS2 ",[" _R ",#8] \n\t" \
    "fsts " ONS0 ",[" _R ",#12] \n\t" \
    "fsts " ONS1 ",[" _R ",#16] \n\t" \
    "fsts " ONS2 ",[" _R ",#20] \n\t"

// Loads 64 bytes from the address in register operand _R into q8-q11.
#define mat_load(_R) \
    "vldmia " _R ", { q8-q11 } \n\t"

// Skins one vertex with a single bone and no weighting.
//   mat0    - bone matrix; 64 bytes are read from it.
//   posnorm - 32 bytes are read: position into q2, normal into q3
//             (presumably two consecutive Vec4f - confirm against caller).
//   outPN   - 24 bytes are written: 3 position floats, then 3 normal floats.
__attribute__((always_inline)) void clalcSkin1( const Matrix44f* mat0, const Vec4f* posnorm, Vec3f* outPN)
{
    //
    asm volatile (
        // q4-q7 need to be preserved
        "vldmia %1, { " IP " - " IN " } \n\t" // pos norm
        // OP p temp
        // ON n temp
        //
        // mat0
        mat_load("%0")
        mat_pos(OP)
        mat_nor(ON)
        STORE3_P3N3("%2")
        : // no output
        : "r" (mat0), "r" (posnorm), "r" (outPN)
        : "memory", IP, IN, WQ, QT, OP, ON, "q8", "q9", "q10", "q11" //clobber
    );
}

// Skins one vertex with two weighted bones.
//   mat0, mat1 - bone matrices; 64 bytes are read from each.
//   posnorm    - 32 bytes are read: position into q2, normal into q3.
//   weight     - 16 bytes are read into q4; lanes 0 and 1 weight mat0 and mat1.
//   outPN      - 24 bytes are written: 3 position floats, then 3 normal floats.
// NOTE(review): q4 is copied to q15 and back by hand although it is also
// listed as clobbered; the manual save looks redundant - confirm before removing.
__attribute__((always_inline)) void clalcSkin2( const Matrix44f* mat0, const Matrix44f* mat1, const Vec4f* posnorm, const Vec4f* weight, Vec3f* outPN)
{
    //
    asm volatile (
        // q4-q7 need to be preserved
        "vmov q15," WQ "\n\t"
        //
        "vldmia %2, { " IP " - " IN " } \n\t" // pos norm
        "vldmia %3, { " WQ " } \n\t" // weights
        // QT intermediate temp
        // OP p temp
        // ON n temp
        //
        // mat0
        mat_load("%0")
        mat_pos_w_set(OP,QT,W0D)
        mat_nor_w_set(ON,QT,W0D)
        // mat 1
        mat_load("%1")
        mat_pos_w_add(OP,QT,W1D)
        mat_nor_w_add(ON,QT,W1D)
        // output pos3f,norm3f
        STORE3_P3N3("%4")
        // restore q4 (WQ)
        "vmov " WQ ", q15 \n\t"
        : // no output
        : "r" (mat0), "r" (mat1), "r" (posnorm), "r" (weight), "r" (outPN)
        : "memory", IP, IN, WQ, QT , OP, ON, "q8", "q9", "q10", "q11", "q15" //clobber
    );
}

// Skins one vertex with three weighted bones.
//   mat0..mat2 - bone matrices; 64 bytes are read from each.
//   posnorm    - 32 bytes are read: position into q2, normal into q3.
//   weight     - 16 bytes are read into q4; lanes 0..2 weight mat0..mat2.
//   outPN      - 24 bytes are written: 3 position floats, then 3 normal floats.
__attribute__((always_inline)) void clalcSkin3( const Matrix44f* mat0, const Matrix44f* mat1, const Matrix44f* mat2, const Vec4f* posnorm, const Vec4f* weight, Vec3f* outPN)
{
    //
    asm volatile (
        // q4-q7 need to be preserved
        "vmov q15," WQ "\n\t"
        //
        "vldmia %3, { " IP " - " IN " } \n\t" // pos norm
        "vldmia %4, { " WQ " } \n\t" // weights
        // QT intermediate temp
        // OP p temp
        // ON n temp
        //
        // mat0
        mat_load("%0")
        mat_pos_w_set(OP,QT,W0D)
        mat_nor_w_set(ON,QT,W0D)
        // mat 1
        mat_load("%1")
        mat_pos_w_add(OP,QT,W1D)
        mat_nor_w_add(ON,QT,W1D)
        // mat 2
        mat_load("%2")
        mat_pos_w_add(OP,QT,W2D)
        mat_nor_w_add(ON,QT,W2D)
        // output pos,normal
        STORE3_P3N3("%5")
        // restore q4 (WQ)
        "vmov " WQ ", q15 \n\t"
        : // no output
        : "r" (mat0), "r" (mat1), "r" (mat2),"r" (posnorm), "r" (weight), "r" (outPN)
        : "memory", IP, IN, WQ, QT, OP, ON, "q8", "q9", "q10", "q11", "q15" //clobber
    );
}

// Skins one vertex with four weighted bones.
//   mat0..mat3 - bone matrices; 64 bytes are read from each.
//   posnorm    - 32 bytes are read: position into q2, normal into q3.
//   weight     - 16 bytes are read into q4; lanes 0..3 weight mat0..mat3.
//   outPN      - 24 bytes are written: 3 position floats, then 3 normal floats.
__attribute__((always_inline)) void clalcSkin4( const Matrix44f* mat0, const Matrix44f* mat1, const Matrix44f* mat2, const Matrix44f* mat3, const Vec4f* posnorm, const Vec4f* weight, Vec3f* outPN)
{
    //
    asm volatile (
        // q4-q7 need to be preserved
        "vmov q15," WQ "\n\t"
        //
        "vldmia %4, { " IP " - " IN " } \n\t" // pos norm
        "vldmia %5, { " WQ " } \n\t" // weights
        // QT intermediate temp
        // OP p temp
        // ON n temp
        //
        // mat0
        mat_load("%0")
        mat_pos_w_set(OP,QT,W0D)
        mat_nor_w_set(ON,QT,W0D)
        // mat 1
        mat_load("%1")
        mat_pos_w_add(OP,QT,W1D)
        mat_nor_w_add(ON,QT,W1D)
        // mat 2
        mat_load("%2")
        mat_pos_w_add(OP,QT,W2D)
        mat_nor_w_add(ON,QT,W2D)
        // mat 3
        mat_load("%3")
        mat_pos_w_add(OP,QT,W3D)
        mat_nor_w_add(ON,QT,W3D)
        // output pos,normal
        STORE3_P3N3("%6")
        // restore q4 (WQ)
        "vmov " WQ ", q15\n\t"
        : // no output
        : "r" (mat0), "r" (mat1), "r" (mat2), "r" (mat3), "r" (posnorm), "r" (weight), "r" (outPN)
        : "memory", IP, IN, WQ, QT, OP, ON, "q8", "q9", "q10", "q11", "q15" //clobber
    );
}

#endif // USE_NEON
Source: https://habr.com/ru/post/153015/
All Articles