#include <stdio.h> typedef unsigned long long BN_WORD; int Add(BN_WORD c[8], BN_WORD a[8], BN_WORD b[8]) { c[0] = a[0] + b[0]; c[1] = a[1] + b[1] + (c[0] < a[0]); c[2] = a[2] + b[2] + (c[1] < a[1]); c[3] = a[3] + b[3] + (c[2] < a[2]); c[4] = a[4] + b[4] + (c[3] < a[3]); c[5] = a[5] + b[5] + (c[4] < a[4]); c[6] = a[6] + b[6] + (c[5] < a[5]); c[7] = a[7] + b[7] + (c[6] < a[6]); return (c[7] < a[7]); } int main(void) { int i; BN_WORD a[8], b[8], c[8], Carry; // , inline Add main int (*add)(BN_WORD c[8], BN_WORD a[8], BN_WORD b[8]) = Add; for( i=0; i<8; i++) { a[i] = b[i] = 0x8000000000000000; } Carry = add(c, a, b); // for( i=0; i<8; ++i) { if (Carry!=1 || c[i]!=(i!=0)) { printf("Something wrong\n"); return 1; } } printf("Ok!\n", Carry); return 0; }
?Add@@YAXQEA_K00@Z PROC ; Add, COMDAT ; Line 6 $LN3: mov QWORD PTR [rsp+8], rbx mov QWORD PTR [rsp+16], rdi ; Line 7 mov r10, QWORD PTR [rdx] ; Line 15 mov rdi, QWORD PTR [rsp+16] mov rbx, rdx add r10, QWORD PTR [r8] mov r11, r8 mov QWORD PTR [rcx], r10 cmp r10, QWORD PTR [rdx] mov r8, QWORD PTR [r8+8] adc r8, 0 add r8, QWORD PTR [rdx+8] mov QWORD PTR [rcx+8], r8 cmp r8, QWORD PTR [rdx+8] mov rdx, QWORD PTR [r11+16] adc rdx, 0 add rdx, QWORD PTR [rbx+16] mov QWORD PTR [rcx+16], rdx cmp rdx, QWORD PTR [rbx+16] mov rdx, QWORD PTR [r11+24] adc rdx, 0 add rdx, QWORD PTR [rbx+24] mov QWORD PTR [rcx+24], rdx cmp rdx, QWORD PTR [rbx+24] mov rdx, QWORD PTR [r11+32] adc rdx, 0 add rdx, QWORD PTR [rbx+32] mov QWORD PTR [rcx+32], rdx cmp rdx, QWORD PTR [rbx+32] mov rdx, QWORD PTR [r11+40] adc rdx, 0 add rdx, QWORD PTR [rbx+40] mov QWORD PTR [rcx+40], rdx cmp rdx, QWORD PTR [rbx+40] mov rdx, QWORD PTR [r11+48] adc rdx, 0 add rdx, QWORD PTR [rbx+48] mov QWORD PTR [rcx+48], rdx cmp rdx, QWORD PTR [rbx+48] mov rax, QWORD PTR [rbx+56] adc rax, QWORD PTR [r11+56] mov rbx, QWORD PTR [rsp+8] mov QWORD PTR [rcx+56], rax ret 0 ?Add@@YAXQEA_K00@Z ENDP ; Add _TEXT ENDS
add r8, QWORD PTR [rdx+8] mov QWORD PTR [rcx+8], r8 cmp r8, QWORD PTR [rdx+8] mov rdx, QWORD PTR [r11+16] adc rdx, 0
But why do we need an extra CMP? Excess CMP we do not need. It is used no more than to re-set the Carry flag, which itself has already been previously set by the ADD command and therefore you can safely delete the CMP. Let's try to take the liberty and shave off the “protruding mustache” in the form of extra CMPs by directly modifying the binary code in memory and measuring speed again. #include <stdio.h> #include <windows.h> typedef unsigned long long BN_WORD; int Add(BN_WORD c[8], BN_WORD a[8], BN_WORD b[8]) { c[0] = a[0] + b[0]; c[1] = a[1] + b[1] + (c[0] < a[0]); c[2] = a[2] + b[2] + (c[1] < a[1]); c[3] = a[3] + b[3] + (c[2] < a[2]); c[4] = a[4] + b[4] + (c[3] < a[3]); c[5] = a[5] + b[5] + (c[4] < a[4]); c[6] = a[6] + b[6] + (c[5] < a[5]); c[7] = a[7] + b[7] + (c[6] < a[6]); return (c[7] < a[7]); } void ReadMem(__int64 pos, char *patch, int len) { DWORD my_id = GetCurrentProcessId(); HANDLE p_hand = OpenProcess(PROCESS_VM_READ | PROCESS_VM_OPERATION, NULL, my_id); if (ReadProcessMemory(p_hand, (LPDWORD)pos, patch, len, NULL)==0) { printf("Error %d read from memory\n", GetLastError()); } CloseHandle(p_hand); } void WriteMem(__int64 pos, char *patch, int len) { DWORD my_id = GetCurrentProcessId(); HANDLE p_hand = OpenProcess(PROCESS_VM_WRITE | PROCESS_VM_OPERATION, NULL, my_id); if (WriteProcessMemory(p_hand, (LPDWORD)pos, patch, len, NULL)==0) { printf("Error %d write to memory\n", GetLastError()); } CloseHandle(p_hand); } void udalit_usi(int pos, int size, int full_size) { char *mem = (char*)malloc(full_size-pos); __int64 pos_add = (__int64)Add; // ReadMem(pos_add + pos, mem, full_size-pos); // , , size WriteMem(pos_add + pos, mem+size, full_size-pos-size); free(mem); } char *add_listing = " 00000000 ?Add@@YAHQEA_K00@Z PROC ; Add, COMDAT\n" " ; Line 7\n" " 00000000 $LN3:\n" " 00000000 48/ 89 5C 24 mov QWORD PTR [rsp+8], rbx\n" " 08\n" " 00000005 48/ 89 7C 24 mov QWORD PTR [rsp+16], rdi\n" " 10\n" " ; Line 8\n" " 0000000A 4C/ 8B 12 mov r10, QWORD PTR [rdx]\n" " ; Line 17\n" " 0000000D 48/ 8B 5C 24 mov rbx, QWORD PTR [rsp+8]\n" " 08\n" " 00000012 48/ 8B FA mov rdi, rdx\n" " 00000015 4D/ 03 10 add r10, QWORD PTR [r8]\n" " 00000018 4D/ 8B D8 mov r11, r8\n" " 0000001B 4C/ 89 11 mov QWORD PTR [rcx], r10\n" " 0000001E 4C/ 3B 12 cmp r10, QWORD PTR [rdx]\n" " 00000021 4D/ 8B 40 08 mov r8, QWORD PTR [r8+8]\n" " 00000025 49/ 83 D0 00 adc r8, 0\n" " 00000029 4C/ 03 42 08 add r8, QWORD PTR [rdx+8]\n" " 0000002D 4C/ 89 41 08 mov QWORD PTR [rcx+8], r8\n" " 00000031 4C/ 3B 42 08 cmp r8, QWORD PTR [rdx+8]\n" " 00000035 49/ 8B 53 10 mov rdx, QWORD PTR [r11+16]\n" " 00000039 48/ 83 D2 00 adc rdx, 0\n" " 0000003D 48/ 03 57 10 add rdx, QWORD PTR [rdi+16]\n" " 00000041 48/ 89 51 10 mov QWORD PTR [rcx+16], rdx\n" " 00000045 48/ 3B 57 10 cmp rdx, QWORD PTR [rdi+16]\n" " 00000049 49/ 8B 53 18 mov rdx, QWORD PTR [r11+24]\n" " 0000004D 48/ 83 D2 00 adc rdx, 0\n" " 00000051 48/ 03 57 18 add rdx, QWORD PTR [rdi+24]\n" " 00000055 48/ 89 51 18 mov QWORD PTR [rcx+24], rdx\n" " 00000059 48/ 3B 57 18 cmp rdx, QWORD PTR [rdi+24]\n" " 0000005D 49/ 8B 53 20 mov rdx, QWORD PTR [r11+32]\n" " 00000061 48/ 83 D2 00 adc rdx, 0\n" " 00000065 48/ 03 57 20 add rdx, QWORD PTR [rdi+32]\n" " 00000069 48/ 89 51 20 mov QWORD PTR [rcx+32], rdx\n" " 0000006D 48/ 3B 57 20 cmp rdx, QWORD PTR [rdi+32]\n" " 00000071 49/ 8B 53 28 mov rdx, QWORD PTR [r11+40]\n" " 00000075 48/ 83 D2 00 adc rdx, 0\n" " 00000079 48/ 03 57 28 add rdx, QWORD PTR [rdi+40]\n" " 0000007D 48/ 89 51 28 mov QWORD PTR [rcx+40], rdx\n" " 00000081 48/ 3B 57 28 cmp rdx, QWORD PTR [rdi+40]\n" " 00000085 49/ 8B 53 30 mov rdx, QWORD PTR [r11+48]\n" " 00000089 48/ 83 D2 00 adc rdx, 0\n" " 0000008D 48/ 03 57 30 add rdx, QWORD PTR [rdi+48]\n" " 00000091 48/ 89 51 30 mov QWORD PTR [rcx+48], rdx\n" " 00000095 48/ 3B 57 30 cmp rdx, QWORD PTR [rdi+48]\n" " 00000099 49/ 8B 53 38 mov rdx, QWORD PTR [r11+56]\n" " 0000009D 48/ 83 D2 00 adc rdx, 0\n" " 000000A1 33 C0 xor eax, eax\n" " 000000A3 48/ 03 57 38 add rdx, QWORD PTR [rdi+56]\n" " 000000A7 48/ 89 51 38 mov QWORD PTR [rcx+56], rdx\n" " 000000AB 48/ 3B 57 38 cmp rdx, QWORD PTR [rdi+56]\n" " 000000AF 48/ 8B 7C 24 mov rdi, QWORD PTR [rsp+16]\n" " 10\n" " 000000B4 0F 92 C0 setb al\n" " 000000B7 C3 ret 0\n" " 000000B8 ?Add@@YAHQEA_K00@Z ENDP ; Add\n" " 000000B8 _TEXT ENDS\n"; int full_size_add_listing = 0x000000B8; #define MAX_USI 100 int usi_pos[MAX_USI]; int usi_len[MAX_USI]; int kol_usi; void sbrivaem_usi(void) { char *p; int i, j; for( kol_usi=0, p=add_listing;;) // { // cmp p = strstr(p, "cmp"); if (p==NULL) break; for(;;p--) // 2 { if (p[0]==' ' && p[1]==' ') break; } p-=2; // sscanf(p, "%x", &(usi_pos[kol_usi])); p+=4; for(i=0;;i++) { if (p[0]<0x20) break; p+=2; if (p[0]=='/') p++; if (p[0]==' ') p++; } usi_len[kol_usi]=i; kol_usi++; if (kol_usi>=MAX_USI) { printf("too much"); exit(0); } p = strstr(p, "cmp"); // cmp p++; // } // for( i=kol_usi-1; i>=0; --i) { udalit_usi(usi_pos[i], usi_len[i], full_size_add_listing); full_size_add_listing -= usi_len[i]; } } int main(void) { LARGE_INTEGER lFrequency, lStart, lEnd; double dfTime1; int i, j, k, usi; BN_WORD a[8], b[8], c[8], Carry; // , inline Add main int (*add)(BN_WORD c[8], BN_WORD a[8], BN_WORD b[8]) = Add; for( i=0; i<8; i++) { a[i] = b[i] = 0x8000000000000000; } for( usi=0; usi<=1; ++usi) { QueryPerformanceFrequency(&lFrequency); QueryPerformanceCounter(&lStart); for( i=0; i<1000; ++i) { for( j=0; j<1000000; ++j) { Carry = add(c, a, b); } for( k=0; k<8; ++k) { if (Carry!=1 || c[k]!=(k!=0)) { printf("Something wrong\n"); return 1; } } } QueryPerformanceCounter(&lEnd); dfTime1 = (double)(lEnd.QuadPart - lStart.QuadPart) / (double)lFrequency.QuadPart; if (usi==0) { // printf("Time = %g sec\n", dfTime1); // sbrivaem_usi(); } else { // printf("Time(without cmp) = %g sec\n", dfTime1); } } return 0; }
call "C:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\vcvarsall.bat" amd64 cl.exe -O2 /Faadd_with_carry.asm add_with_carry.cpp
Source: https://habr.com/ru/post/147400/
All Articles