πŸ“œ ⬆️ ⬇️

Code optimization: processor

All programs must be correct, but some programs must be fast . If the program processes real-time video frames or network packets, performance is key. It is not enough to use efficient algorithms and data structures. You need to write code that the compiler easily optimizes and translates into fast executable code.

image

In this article we will look at basic code optimization techniques that can increase the performance of your program many times. We will also touch the processor device. Understanding how the processor works is necessary for writing effective programs.

I was inspired to write this article by the fifth chapter from Computer Systems: A Programmer's Perspective . All programs are tested on a machine with a Pentium 2117U processor , the performance on your machine may be different. In another article in this series, β€œCode Optimization: Memory,” we look at optimization techniques to make better use of the cache.

Optimization blockers


The compilers themselves are trying to optimize the code. When GCC is started with the -Og option, it performs the basic optimization level, with the -O1 option, the first optimization level. There are also higher levels of optimization: -O2, -O3, etc. The higher the level of optimization, the more radical the compiler makes changes to the program. Compilers can use only safe optimizations . This means that the compiler can only change the program so that it does not change its behavior for all input data.
')
We, as programmers, need to understand that there are certain characteristics of the code that will not allow the compiler to perform optimization. We call them optimization blockers . Consider two types of optimization blockers. One of them are pointers . The compiler cannot know for sure whether two pointers will point to the same memory area, and therefore does not perform some optimizations. Consider an example:

void twiddle1(long *xp, long *yp) {
    *xp += *yp;
    *xp += *yp;
}

void twiddle2(long *xp, long *yp) {
    *xp += 2*(*yp);
}

twiddle2 , ( *xp, *yp, *xp), , twiddle1 ( ). , . , xp yp . twiddle1 *xp , twiddle2 β€” . . .

β€” . , , . :

long f();

long func1() {
    return f() + f() + f() + f();
}

long func2() {
    return 4*f();
}

f , β€” . , . f , , :

long counter = 0;

long f() {
    return counter++;
}

func1 func2 . , . , .


. , . CPE (cycles per element). , CPE 2.5, 2.5 .

, . vec float. combine0 . . 5000 .

typedef struct {
    long len;
    float *data;
} vec;

long vec_len(vec *v) {
    return v->len;
}

void combine0(vec *v, float *dest)
{
    long i;
    *dest = 1;
    for (i = 0; i < vec_len(v); i++) {
        *dest *= v->data[i];
    }
}

#define SIZE 5000
float a[SIZE];
vec v = {SIZE, a};

int main() {
    float res;
    for (i = 0; i < SIZE; i++)  //    
        a[i] = rand();

    combine0(&v, &res);
}

GCC -Og ( ). , CPE 11.05.

CPE rdtsc, . . , . . , .

#include <time.h>
#include <stdio.h>

//        
static __inline__ unsigned long long rdtsc(void)
{
    unsigned hi, lo;
    __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
    return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 );
}

//  ,   
#define SIZE 100000000

int main(void)
{
    double time1 = clock() / (double)CLOCKS_PER_SEC;
    long cycles1 = rdtsc();
    //----------    ----------
    for (int i = 0; i < SIZE; i++) {
        i * i;
    }
    //----------------------
    double time2 = clock() / (double)CLOCKS_PER_SEC;
    long cycles2 = rdtsc();

    long cycles = cycles2 - cycles1;      //    
    double cpe = cycles / (double)SIZE;   //       
    double cpu_time = time2 - time1;      //   

    printf("CPU TIME: %.2f sec\n", cpu_time);
    printf("CPE:      %.2f\n", cpe);
}




, . . vec_len, . , . . .

void combine1(vec *v, float *dest)
{
    long i, len = vec_len(v);
    *dest = 1;
    for (i = 0; i < len; i++) {
        *dest *= v->data[i];
    }
}

β€” CPE 11.05. , . GCC -Og . : combine0 β€” CPE 12.7, combine1 β€” CPE 11.1.

. , , . - , . , :

void lower(char *s) {
    for (long i = 0; i < strlen(s); i++)
        if (s[i] >= 'A' && s[i] <= 'Z')
            s[i] -= ('A' - 'a'); 
}

strlen , . . , . , strlen , . , , . .


. . , . , , - .

. , dest, dest. . dest . -, , dest. - , .

void combine2(vec *v, float *dest)
{
    long i, len = vec_len(v);
    float acc = 1;
    for (i = 0; i < len; i++) {
        acc *= v->data[i];
    }
    *dest = acc;
}

. CPE 11.05 CPE 5.02. , , dest - . . , . .


CPE 5.02. , 5 . , .

int. , float, int, CPE 1.0? . CPE 1.58. ?

, , , , , : , , . , . , «» .

, , . «» , . , , . , :

void combine_plus(vec *v, int *dest)
{
    long i, limit = vec_len(v)-1;
    int acc = 0;
    for (i = 0; i < limit; i+=2) {
        acc = acc + v->data[i] + v->data[i+1];
    }
    if (i < len) acc += v->data[i];  //   ,    
    *dest = acc;
}

. , , . CPE 1.58 1.15. , CPE 1.03, , .

CPE 5.02, , , . , , , , . CPE 1.04 . , , .


. , , . , , , . , , , , . , . .

, , . : , , , . . , . , .

, 100 . , . , . , .

. ( , if-else), , . , , , . . 20 .


, , . , N , 5*N . , .

, - . . , . : , , . . . , . . , , N ( N ) N .

. , . , , . . , , . , 3-30 .


, CPE 5.02, CPE 1.02? . . , , , , , . . , . . , , , .

-. . , , . , . , . .

, . , . , , . , .

, . , . .

float combine3(float a[], long size)
{
    long i, limit = size-1;
    float acc0 = 1;
    float acc1 = 1;

    for (i = 0; i < limit; i+=2) {
        acc0 *= a[i];
        acc1 *= a[i+1];
    }
    while (i < size) acc0 *= a[i++];
    return acc0 * acc1;
}

, , , . CPE 5.02 2.5. :

float combine4(float a[], long size)
{
    long i, limit = size-3;;
    float acc0 = 1;
    float acc1 = 1;
    float acc2 = 1;
    float acc3 = 1;

    for (i = 0; i < limit; i+=4) {
        acc0 *= a[i];
        acc1 *= a[i+1];
        acc2 *= a[i+2];
        acc3 *= a[i+3];
    }
    while (i < size) acc0 *= a[i++];
    return acc0 * acc1 * acc2 * acc3;
}

CPE 1.28. CPE 1.04, , . .

, . Core i7 Haswell , CPE 0.5, . , C N , C*N.


, , , - . . GCC , . , . , , , , .

, , , . , . . float . , . , . , .

, .


. CPE 1.04, β€” . , SSE AVX, . , %ymm0-%ymm15, 16 32 . AVX 32 64- , 32- , . 32 , 32 .

, . GCC C, . , , , . , 4 8 , . , CPE 0.25 0.12.


, CPE 11.05. CPE 5.02. . , CPE 1.04. 11 . , 44-88 .

, , . , , , , . , , . CPE 5.02. CPE 1.04, , . . , 4-8 , .


, , . , . , . , , , CPE 2.0, . Core i7 Haswell , . , CPE 0.25. , β€” CPE 0.5.

. x86-64 16 16 YMM . , . , . 8 20, CPE 1.04 CPE 1.35.

, . , , . . .

:


, . - -, . .

for (long i = 0; i < limit; i+=3) {
        float x = a[i], y = a[i+1], z = a[i+2];
        acc = acc * x * y * z;
}

CPE 5.02. C, , . , , acc. , . -:

acc = acc * ((x * y) * z);

x, y z . , , . CPE 1.69.

:


, , . ( je, jg, jl . . ), . , . .

cmov. , - . , . . .

, , . . if-else, je, jg, jl . ., cmov. if-else:

if ()
    v = 1;
else
    v = 2;

, , , , . , :

v1 = 1;
v2 = 2;

v = () ? v1 : v2;

, - , cmov. , , , . , , , . , .

, , . , . .

void minmax1(int a[], int b[], int n)
{
    for (int i = 0; i < n; i++) {
        if (a[i] > b[i]) {
            int t = a[i];
            a[i] = b[i];
            b[i] = t;
        }
    }
}

void minmax2(int a[], int b[], int n)
{
    for (int i = 0; i < n; i++) {
        int min = a[i] < b[i] ? a[i] : b[i];
        int max = a[i] < b[i] ? b[i] : a[i];
        a[i] = min;
        b[i] = max;
    }
}

. , a , b . : , . min max , . , : minmax1 β€” CPE 15.6, minmax2 β€” CPE 1.18 ( 13.2 ).


. , . Β« Β» . : , , .

, . .

:

Source: https://habr.com/ru/post/309796/


All Articles