Fast Sin and Cos on Delphi Embedded ASM

Hello!

There was a need to write a quick calculation of Sin and Cos. The basis for the calculations took the decomposition of the Taylor series. I use it in 3D systems (OpenGL and a graphical library of my development). Unfortunately, it is not possible to reduce the series “perfectly” for Double, but this is offset by good acceleration. The code is written in an assembler built into Delphi XE6. Used by SSE2.

Not suitable for scientific computing, but quite suitable for use in games.
There is enough accuracy to cover the digit capacity of the Single number, which is used
for multiplication by a matrix.
')
Eventually:

The achieved accuracy of the result is: 10.e-13
The maximum discrepancy with the CPU is 0.000000000000045.
The speed is increased in comparison with the CPU by 4.75 times.
Speed increased in comparison with Math.Sin and Math.Cos by 2.6 times.

For the test I used an Intel Core-i7 6950X Extreme 3.0 GHz processor.
Delphi source code is embedded in assembler comments.

Source:

Spoiler header

var gSinTab: array of Double; gSinAddr: UInt64; const AbsMask64: UInt64 = ($7FFFFFFFFFFFFFFF); PI90: Double = (PI / 2.0); FSinTab: array[0..7] of Double = (1.0 / 6.0, //3! 1.0 / 120.0, //5! 1.0 / 5040.0, //7! 1.0 / 362880.0, //9! 1.0 / 39916800.0, //11! 1.0 / 6227020800.0, //13! 1.0 / 1307674368000.0, //15! 1.0 / 355687428096000.0); //17! //  Sin    QSinMaxP: array[0..3] of Double = (0.0, 1.0, 0.0, -1.0); //  Sin    QSinMaxM: array[0..3] of Double = (0.0, -1.0, 0.0, 1.0); //     QSinSignP: array[0..3] of Double = (1.0, 1.0, -1.0, -1.0); //     QSinSignM: array[0..3] of Double = (-1.0, -1.0, 1.0, 1.0); function Sinf(const A: Double): Double; asm .NOFRAME //   A  xmm0 //   // X := IntFMod(Abs(A), PI90, D); // Result := FNum - (Trunc(FNum / FDen) * FDen); pxor xmm3, xmm3 comisd A, xmm3 jnz @ANZ ret // Result := 0.0; @ANZ: //   //Minus := (A < 0.0); setc cl movsd xmm1, [AbsMask64] //Abs(A) pand A, xmm1 movsd xmm1, [PI90] //PI90 = FDen movsd xmm2, A // A = FNum divsd xmm2, xmm1 //(FNum / FDen) roundsd xmm2, xmm2, 11b //11b - Trunc cvtsd2si rax, xmm2 //  (X  EAX) mulsd xmm2, xmm1 subsd xmm0, xmm2 //----------------------------------- //  and rax, 3 // D := (D and 3); //----------------------------------- //     // if (X = 0.0) then // begin // if Minus then // Exit(QSinMaxM[D]) else // Exit(QSinMaxP[D]); // end; comisd xmm0, xmm3 jnz @XNZ shl rax, 3 //     Double (8 ) cmp cl, 1 //  ,   je @MaxMinus @MaxPlus: lea rdx, qword ptr [QSinMaxP] movsd xmm0, qword ptr [rdx + rax] ret @MaxMinus: lea rdx, qword ptr [QSinMaxM] movsd xmm0, qword ptr [rdx + rax] ret //----------------------------------- @XNZ: //        // if (D and 1) <> 0 then X := (PI90 - X); mov edx, eax and edx, 1 cmp edx, 0 je @DZ subsd xmm1, xmm0 movsd xmm0, xmm1 //----------------------------------- @DZ: // Result   xmm0 // X  xmm0 // N := (X * X)  xmm2 // F := (N * X)  xmm1 // N movsd xmm2, xmm0 //  X mulsd xmm2, xmm2 // N := (X * X) // F movsd xmm1, xmm2 //  N mulsd xmm1, xmm0 // F := (N * X) //     mov rdx, [gSinTab] movaps xmm4, dqword ptr [rdx] movaps xmm6, dqword ptr [rdx + 16] movaps xmm8, dqword ptr [rdx + 32] movaps xmm10, dqword ptr [rdx + 48] //    movhlps xmm5, xmm4 movhlps xmm7, xmm6 movhlps xmm9, xmm8 movhlps xmm11, xmm10 // Result := X - F * PDouble(gSinAddr)^; mulsd xmm4, xmm1 //FSinTab[0] subsd xmm0, xmm4 // F := (F * N); // Result := Result + F * PDouble(gSinAddr + 8)^; mulsd xmm1, xmm2 mulsd xmm5, xmm1 //FSinTab[1] addsd xmm0, xmm5 // F := (F * N); // Result := Result - F * PDouble(gSinAddr + 16)^; mulsd xmm1, xmm2 mulsd xmm6, xmm1 //FSinTab[2] subsd xmm0, xmm6 // F := (F * N); // Result := Result + F * PDouble(gSinAddr + 24)^; mulsd xmm1, xmm2 mulsd xmm7, xmm1 //FSinTab[3] addsd xmm0, xmm7 // F := (F * N); // Result := Result - F * PDouble(gSinAddr + 32)^; mulsd xmm1, xmm2 mulsd xmm8, xmm1 //FSinTab[4] subsd xmm0, xmm8 // F := (F * N); // Result := Result + F * PDouble(gSinAddr + 40)^; mulsd xmm1, xmm2 mulsd xmm9, xmm1 //FSinTab[5] addsd xmm0, xmm9 // F := (F * N); // Result := Result - F * PDouble(gSinAddr + 48)^; mulsd xmm1, xmm2 mulsd xmm10, xmm1 //FSinTab[6] subsd xmm0, xmm10 // F := (F * N); // Result := Result + F * PDouble(gSinAddr + 56)^; mulsd xmm1, xmm2 mulsd xmm11, xmm1 //FSinTab[7] addsd xmm0, xmm11 //----------------------------------- // if Minus then // Result := (Result * QSinSignM[D]) else // Result := (Result * QSinSignP[D]); shl rax, 3 //  8 cmp cl, 1 je @Minus lea rdx, qword ptr [QSinSignP] mulsd xmm0, qword ptr [rdx + rax] ret @Minus: lea rdx, qword ptr [QSinSignM] mulsd xmm0, qword ptr [rdx + rax] end; //------------------------------------ function Cosf(const A: Double): Double; asm .NOFRAME // A  xmm0 //   // X := IntFMod(AMod, PI90, D); // Result := FNum - (Trunc(FNum / FDen) * FDen); pxor xmm3, xmm3 comisd A, xmm3 jnz @ANZ movsd xmm0, [DoubleOne] ret // Result := 1.0; //----------------------------------- @ANZ: //   //Minus := (A < 0.0); setc cl movsd xmm1, [AbsMask64] //Abs(A) pand A, xmm1 //----------------------------------- movsd xmm1, [PI90] //PI90 = FDen //----------------------------------- // if Minus then // AMod := Abs(A) - PI90 else // AMod := Abs(A) + PI90; cmp cl, 1 je @SubPI90 addsd A, xmm1 jmp @IntFMod @SubPI90: subsd A, xmm1 //----------------------------------- @IntFMod: movsd xmm2, A // A = FNum divsd xmm2, xmm1 //(FNum / FDen) roundsd xmm2, xmm2, 11b //11b - Trunc cvtsd2si rax, xmm2 //  (X  EAX) mulsd xmm2, xmm1 subsd xmm0, xmm2 //----------------------------------- //  and rax, 3 // D := (D and 3); //----------------------------------- //     // if (X = 0.0) then // begin // if Minus then // Exit(QSinMaxM[D]) else // Exit(QSinMaxP[D]); // end; comisd xmm0, xmm3 jnz @XNZ shl rax, 3 //     Double (8 ) cmp cl, 1 //  ,   je @MaxMinus @MaxPlus: lea rdx, qword ptr [QSinMaxP] movsd xmm0, qword ptr [rdx + rax] ret @MaxMinus: lea rdx, qword ptr [QSinMaxM] movsd xmm0, qword ptr [rdx + rax] ret //----------------------------------- @XNZ: //        // if (D and 1) <> 0 then X := (PI90 - X); mov edx, eax and edx, 1 cmp edx, 0 je @DZ subsd xmm1, xmm0 movsd xmm0, xmm1 @DZ: // Result   xmm0 // X  xmm0 // N := (X * X)  xmm2 // F := (N * X)  xmm1 // N movsd xmm2, xmm0 //  X mulsd xmm2, xmm2 // N := (X * X) // F movsd xmm1, xmm2 //  N mulsd xmm1, xmm0 // F := (N * X) //     mov rdx, [gSinTab] movaps xmm4, dqword ptr [rdx] movaps xmm6, dqword ptr [rdx + 16] movaps xmm8, dqword ptr [rdx + 32] movaps xmm10, dqword ptr [rdx + 48] //    movhlps xmm5, xmm4 movhlps xmm7, xmm6 movhlps xmm9, xmm8 movhlps xmm11, xmm10 // Result := X - F * PDouble(gSinAddr)^; mulsd xmm4, xmm1 //FSinTab[0] subsd xmm0, xmm4 // F := (F * N); // Result := Result + F * PDouble(gSinAddr + 8)^; mulsd xmm1, xmm2 mulsd xmm5, xmm1 //FSinTab[1] addsd xmm0, xmm5 // F := (F * N); // Result := Result - F * PDouble(gSinAddr + 16)^; mulsd xmm1, xmm2 mulsd xmm6, xmm1 //FSinTab[2] subsd xmm0, xmm6 // F := (F * N); // Result := Result + F * PDouble(gSinAddr + 24)^; mulsd xmm1, xmm2 mulsd xmm7, xmm1 //FSinTab[3] addsd xmm0, xmm7 // F := (F * N); // Result := Result - F * PDouble(gSinAddr + 32)^; mulsd xmm1, xmm2 mulsd xmm8, xmm1 //FSinTab[4] subsd xmm0, xmm8 // F := (F * N); // Result := Result + F * PDouble(gSinAddr + 40)^; mulsd xmm1, xmm2 mulsd xmm9, xmm1 //FSinTab[5] addsd xmm0, xmm9 // F := (F * N); // Result := Result - F * PDouble(gSinAddr + 48)^; mulsd xmm1, xmm2 mulsd xmm10, xmm1 //FSinTab[6] subsd xmm0, xmm10 // F := (F * N); // Result := Result + F * PDouble(gSinAddr + 56)^; mulsd xmm1, xmm2 mulsd xmm11, xmm1 //FSinTab[7] addsd xmm0, xmm11 //----------------------------------- // if Minus then // Result := (Result * QSinSignM[D]) else // Result := (Result * QSinSignP[D]); shl rax, 3 //  8 cmp cl, 1 je @Minus lea rdx, qword ptr [QSinSignP] mulsd xmm0, qword ptr [rdx + rax] ret @Minus: lea rdx, qword ptr [QSinSignM] mulsd xmm0, qword ptr [rdx + rax] end; Initialization //  SetLength(gSinTab, 8); //  gSinAddr := UInt64(@FSinTab[0]); //     Move(FSinTab[0], gSinTab[0], SizeOf(Double) * 8); Finalization SetLength(gSinTab, 0);

Work example here

Constructive suggestions and comments are welcome.

Source: https://habr.com/ru/post/429786/

All Articles

Fast Sin and Cos on Delphi Embedded ASM

More articles: