📜 ⬆️ ⬇️

Pro optimization techniques

An instructive story about optimization techniques clearly.

Terms of Reference

We will announce a small competition on architecture-oriented software optimization as part of the topic.
In short, the code consists of a bundle of functions that perform, at first glance, obscure manipulations of the original data, plus a driver that runs the non-optimized version n times, then the optimized one, compares the computed results, and, if they match, reports the ratio of the run times. Like this:

Executing original code ... done
Executing optimized code ... done
Checking results ... PASSED
Number of runs: 3
Original code average time: 11.954537 sec.
Optimized code average time: 1.052994 sec.
Speedup: 11.35
')
It is allowed to use any optimization techniques, a GCC compiler with any options, and, say, a server with two quad-core Intel Xeon E5420 2.5 GHz processors.
Here, by the way, the code:

/*
* contestopt.c:
*/

#include "contest.h"

void function3( double a[][NUM], double b[][NUM], double c[][NUM]);
void function2( double a[][NUM]);
double function4(unsigned int second);

double ***p;
double ****x;
double ****y;
double ****z;
double ***wrk1;
double ***wrk2;
double ***bnd;
static double a[NUM][NUM], b[NUM][NUM];

/*
 * Benchmark entry point: fill both operand matrices with the test
 * pattern, accumulate their product into `result`, then run the
 * stencil benchmark and store its residual in gos[0].
 */
void function1( double result[][NUM], double gos[2], unsigned int second)
{
    /* The two fills are independent of each other. */
    function2(b);
    function2(a);

    function3(a, b, result);

    gos[0] = function4(second);
}

/*
 * Dense NUM x NUM matrix product: result += a * b, accumulated on top
 * of whatever `result` already holds.  Plain triple loop in i-j-k
 * order; the running sum is kept in a local and written back once per
 * output element (same sequence of double additions as storing each
 * partial sum, so the result is identical).
 */
void function3( double a[][NUM], double b[][NUM], double result[][NUM])
{
    int row, col, inner;

    for (row = 0; row < NUM; row++) {
        for (col = 0; col < NUM; col++) {
            double acc = result[row][col];
            for (inner = 0; inner < NUM; inner++) {
                acc = acc + a[row][inner] * b[inner][col];
            }
            result[row][col] = acc;
        }
    }
}

/*
 * Fills a NUM x NUM matrix with a Fibonacci-like sequence: each
 * element is the sum of the previous two values, carried across rows
 * in `first`/`second`.  The running pair is rescaled every 20 columns
 * and replaced with row-dependent reciprocals at the end of each row,
 * keeping the values within floating-point range.  a[0][0] and
 * a[0][1] seed the sequence with 0 and 1.
 */
void function2( double a[][NUM])
{
int i, j;
double first = 0.0;
double second = 1.0;
a[0][0] = first;
a[0][1] = second;
for (i = 0; i < NUM; i++) {
for (j = 0; j < NUM; j++) {
/* Skip the two seed cells; everywhere else extend the sequence. */
if (!(i == 0 && (j == 0 || j == 1))) {
a[i][j] = first + second;
first = second;
second = a[i][j];
}
/* Every 20th column: rescale the pair to bound its magnitude. */
if (j % 20 == 0 && j != 0) {
first = first * (j + 1) / (NUM);
second = second * (j + 1) / (NUM);
}
}
/* End of row: restart the pair from row-dependent reciprocals. */
first = ((i + 1) * NUM) / first;
second = ((i + 1) * NUM) / second;
}
}

/*
 * Sets every element of the XX x YY x ZZ jagged array `A` to `val`.
 */
void function5( double ***A, double val)
{
    int d0, d1, d2;

    for (d0 = 0; d0 < XX; d0++) {
        for (d1 = 0; d1 < YY; d1++) {
            double *row = A[d0][d1];
            for (d2 = 0; d2 < ZZ; d2++) {
                row[d2] = val;
            }
        }
    }
}

/*
 * Allocates the benchmark working set as jagged heap arrays —
 * p, wrk1, wrk2, bnd: XX x YY x ZZ doubles; x: XX x YY x ZZ x 4 and
 * y, z: XX x YY x ZZ x 3 doubles — then fills them with constants,
 * a single random coefficient, and a quadratic k-profile for p.
 *
 * NOTE(review): no malloc() result is checked; an allocation failure
 * here dereferences NULL.
 */
void function6(unsigned int second)
{
int mimax = XX;
int mjmax = YY;
int mkmax = ZZ;

int imax = mimax - 1;
int jmax = mjmax - 1;
int kmax = mkmax - 1;

int i, j, k, l;
double val;
/* Seed the PRNG from the caller-supplied value so `val` below is
   reproducible for a given `second`. */
srand((unsigned int ) second);
p = ( double ***) malloc( sizeof ( double **) * XX);
wrk1 = ( double ***) malloc( sizeof ( double **) * XX);
wrk2 = ( double ***) malloc( sizeof ( double **) * XX);
bnd = ( double ***) malloc( sizeof ( double **) * XX);

x = ( double ****) malloc( sizeof ( double ***) * XX);
y = ( double ****) malloc( sizeof ( double ***) * XX);
z = ( double ****) malloc( sizeof ( double ***) * XX);

for (i = 0; i < XX; i++) {
p[i] = ( double **) malloc( sizeof ( double *) * YY);
wrk1[i] = ( double **) malloc( sizeof ( double *) * YY);
wrk2[i] = ( double **) malloc( sizeof ( double *) * YY);
bnd[i] = ( double **) malloc( sizeof ( double *) * YY);

x[i] = ( double ***) malloc( sizeof ( double **) * YY);
y[i] = ( double ***) malloc( sizeof ( double **) * YY);
z[i] = ( double ***) malloc( sizeof ( double **) * YY);

for (j = 0; j < YY; j++) {
p[i][j] = ( double *) malloc( sizeof ( double ) * ZZ);
wrk1[i][j] = ( double *) malloc( sizeof ( double ) * ZZ);
wrk2[i][j] = ( double *) malloc( sizeof ( double ) * ZZ);
bnd[i][j] = ( double *) malloc( sizeof ( double ) * ZZ);

x[i][j] = ( double **) malloc( sizeof ( double *) * ZZ);
y[i][j] = ( double **) malloc( sizeof ( double *) * ZZ);
z[i][j] = ( double **) malloc( sizeof ( double *) * ZZ);

/* Innermost level: x carries 4 coefficients, y and z carry 3. */
for (k = 0; k < ZZ; k++) {
x[i][j][k] = ( double *) malloc( sizeof ( double ) * 4);
y[i][j][k] = ( double *) malloc( sizeof ( double ) * 3);
z[i][j][k] = ( double *) malloc( sizeof ( double ) * 3);

}

}
}

/* One random coefficient in [0, 0.1] shared by every y element;
   x and z are all ones. */
val = ( double ) rand() / (10.0 * RAND_MAX);
for (i = 0; i < XX; i++)
for (j = 0; j < YY; j++)
for (k = 0; k < ZZ; k++) {
for (l = 0; l < 3; l++) {
x[i][j][k][l] = 1.0;
y[i][j][k][l] = val;
z[i][j][k][l] = 1.0;
}

x[i][j][k][3] = 1.0;
}
function5(bnd, 1.0);
function5(wrk1, 0.0);
function5(wrk2, 0.0);
/* Quadratic profile along k: p = (k/kmax)^2.  Only indices below
   imax/jmax/kmax are written; the last plane in each dimension stays
   uninitialized (malloc, not calloc) — presumably never read by
   function4's interior loops; verify. */
for (i = 0; i < imax; i++)
for (j = 0; j < jmax; j++)
for (k = 0; k < kmax; k++)
p[i][j][k] = ( double ) (k * k) / ( double ) (kmax * kmax);
}

/*
 * Releases everything function6 allocated: per-cell buffers first,
 * then each level of the jagged arrays, then the top-level pointers.
 */
void function7()
{
    int i, j, k;

    for (i = 0; i < XX; i++) {
        for (j = 0; j < YY; j++) {
            for (k = 0; k < ZZ; k++) {
                free(x[i][j][k]);
                free(y[i][j][k]);
                free(z[i][j][k]);
            }
            free(x[i][j]);
            free(y[i][j]);
            free(z[i][j]);

            free(p[i][j]);
            free(wrk1[i][j]);
            free(wrk2[i][j]);
            free(bnd[i][j]);
        }
        free(x[i]);
        free(y[i]);
        free(z[i]);

        free(p[i]);
        free(wrk1[i]);
        free(wrk2[i]);
        free(bnd[i]);
    }
    free(p);
    free(wrk1);
    free(wrk2);
    free(bnd);

    free(x);
    free(y);
    free(z);
}

/*
 * Runs nn (= NN) Jacobi-style relaxation sweeps of a 19-point stencil
 * over the 3-D grid p (the structure resembles the Himeno benchmark
 * kernel) and returns the residual `gosa` accumulated during the
 * final sweep.
 *
 * NOTE(review): if NN were 0 this would return an uninitialized
 * `gosa`; with the published NN = 50 that cannot happen.
 */
double function4(unsigned int second)
{
int nn = NN;
double gosa, s0, ss;
int mimax = XX;
int mjmax = YY;
int mkmax = ZZ;
int imax = mimax - 1;
int jmax = mjmax - 1;
int kmax = mkmax - 1;
int iCount, jCount, kCount, loop, i, j, k;

/* Allocate and initialize p, wrk1/wrk2, bnd and the coefficient
   arrays x, y, z (also seeds the PRNG from `second`). */
function6(second);
for (loop = 0; loop < nn; loop++) {
gosa = 0.0;
for (k = 1; k < kmax - 1; k++)
for (j = 1; j < jmax - 1; j++)
for (i = 1; i < imax - 1; i++) {
/* Weighted sum of the neighborhood of p[i][j][k]. */
s0 = x[i][j][k][0] * p[i + 1][j][k]
+ x[i][j][k][1] * p[i][j][k]
+ x[i][j][k][2] * p[i][j][k]
+ y[i][j][k][0] * (p[i + 1][j + 1][k] - p[i + 1][j - 1][k]
- p[i - 1][j + 1][k] + p[i - 1][j - 1][k])
+ y[i][j][k][1] * (p[i][j + 1][k + 1] - p[i][j - 1][k + 1]
- p[i][j + 1][k - 1] + p[i][j - 1][k - 1])
+ y[i][j][k][2] * (p[i + 1][j][k + 1] - p[i - 1][j][k + 1]
- p[i + 1][j][k - 1] + p[i - 1][j][k - 1])
+ z[i][j][k][0] * p[i - 1][j][k]
+ z[i][j][k][1] * p[i][j - 1][k]
+ z[i][j][k][2] * p[i][j][k - 1] + wrk1[i][j][k];
/* Local update, masked by bnd; fold it into the residual. */
ss = (s0 * x[i][j][k][3] - p[i][j][k]) * bnd[i][j][k];
gosa = gosa + ss;
wrk2[i][j][k] = p[i][j][k] + 0.1 * ss;
}
/* Copy the updated interior back into p for the next sweep. */
for (iCount = 1; iCount < imax - 1; iCount++)
for (jCount = 1; jCount < jmax - 1; jCount++)
for (kCount = 1; kCount < kmax - 1; kCount++)
p[iCount][jCount][kCount] = wrk2[iCount][jCount][kCount];
}
/* Release everything function6 allocated. */
function7();
return gosa;
}

/*
 * Naive power: multiplies `x` into a running product `i` times,
 * i.e. returns x**i for i >= 0 (and 1.0 for i <= 0, since the loop
 * body never runs).
 */
double exponent( int i, double x)
{
    double product = 1.0;
    int remaining = i;

    while (remaining-- > 0) {
        product *= x;
    }
    return product;
}

/*
 * Midpoint-rule accumulation of the polynomial sum_j a[j]*x^j over
 * VAL sample points on (0, 1), scaled by the step width h; `p` seeds
 * the accumulator.
 *
 * Fix: the original recomputed x^j from scratch through exponent()
 * for every term — an accidental O(VAL * ORDER^2).  Keeping a running
 * power performs the same multiplications in the same order, so the
 * result is bit-identical while the inner loop drops to O(ORDER).
 */
double function8( float *a, float p)
{
    int i, j;
    double h, x, xpow;
    double s = p;

    h = 1.0 / VAL;
    for (i = 1; i <= VAL; i++) {
        x = h * (i - 0.5);       /* midpoint of the i-th subinterval */
        xpow = 1.0;              /* x^0 */
        for (j = 0; j < ORDER; j++) {
            s = (a[j] * xpow) + s;
            xpow = xpow * x;     /* advance to x^(j+1) */
        }
    }
    return s * h;
}


* This source code was highlighted with Source Code Highlighter .

Tested on values:
#define NUM 600
#define XX 65
#define YY 33
#define ZZ 33
#define NN 50
#define VAL 10000000 /* ten million */
#define ORDER 30
#define TOLERANCE 1e-8

Approximate driver view:
/*
* driver.c: Contest driver.
*/

#include <sys/time.h>
#include <stdio.h>
#include <stdlib.h>
#include <inttypes.h>
#include <time.h>

#include "contest.h"
#include "hpctimer.h"

double _RES[NUM][NUM] = { 0.0 };
double _GOS[2] = { 0.0, 0.0 };
double RES[NUM][NUM] = { 0.0 };
double GOS[2] = { 0.0, 0.0 };
float A[ORDER] = { 0.0 };

double function8( float *a, float p);
void function1( double result[][NUM], double gos[2], unsigned int second);

/*
 * Compares the reference (_res/_gos/_p) and optimized (res/gos/p)
 * outputs: the scalar results to a loose 1e-1 tolerance, the result
 * matrices element-wise to TOLERANCE.  Prints the first mismatch and
 * returns 0; returns 1 when everything agrees.
 *
 * Fix: the matrix loop compared the globals _RES/RES instead of the
 * _res/res parameters, silently ignoring whatever matrices the caller
 * passed in (it only worked because the driver happens to pass those
 * same globals).
 */
int is_results_eq( double _res[][NUM], double _gos[2], float _p,
double res[][NUM], double gos[2], float p)
{
int i, j;

if (fabsf(_p - p) > 1E-1) {
printf( "_p p\n" );
printf( "%f\n" , _p);
printf( "%f\n" , p);
return 0;
}

if (fabs(_gos[0] - gos[0]) > 1E-1) {
printf( "_gos gos\n" );
printf( "%f\n" , _gos[0]);
printf( "%f\n" , gos[0]);
return 0;
}

for (i = 0; i < NUM; i++)
for (j = 0; j < NUM; j++) {
if (fabs(_res[i][j] - res[i][j]) > TOLERANCE) {
printf( "[%d, %d] =\n%f\n%f\n" , i, j, _res[i][j], res[i][j]);
return 0;
}
}
return 1;
}

int main( int argc, char **argv)
{
unsigned int second;
unsigned int i, j;
float p = 9.0;
double _res, res;
int nruns = 3;
hpctimer_time_t t0, t1;
double torig = 0.0, topt = 0.0;

/* Randomize the benchmark seed around a fixed base so both versions
   receive the same run-specific value. */
srand(time(NULL));
second = 1219304613 + (1000 - (2000.0 * ( double ) rand() / ( double ) (RAND_MAX + 1.0)));
/* Polynomial coefficients for function8: a[i] = i. */
for (i = 0; i < ORDER; i++) {
A[i] = ( float )i;
}

/* Clear both result matrices before the timed runs. */
for (i = 0; i < NUM; i++) {
for (j = 0; j < NUM; j++) {
_RES[i][j] = 0.0;
RES[i][j] = 0.0;
}
}

/* Original code */
/* _function1/_function8 are the untouched reference implementations —
   presumably declared in contest.h; verify. */
printf( "Executing original code ... " );
fflush(stdout);
t0 = hpctimer_gettime();
for (i = 0; i < nruns; i++) {
_function1(_RES, _GOS, second);
_res = _function8(A, p);
}
t1 = hpctimer_gettime();
torig = hpctimer_getdiff(t0, t1) / nruns;
printf( "done\n" );
fflush(stdout);

/* Optimized code */
printf( "Executing optimized code ... " );
fflush(stdout);
t0 = hpctimer_gettime();
for (i = 0; i < nruns; i++) {
function1(RES, GOS, second);
res = function8(A, p);
}
t1 = hpctimer_gettime();
topt = hpctimer_getdiff(t0, t1) / nruns;
printf( "done\n" );
fflush(stdout);

printf( "Checking results ... " );
fflush(stdout);

/* Both sides accumulate into their global matrices across all nruns
   iterations, so the comparison is only fair because both run the
   same number of times. */
if (!is_results_eq(_RES, _GOS, _res, RES, GOS, res)) {
printf( "RESULTS ARE DIFFERENT!\n" );
} else {
printf( "PASSED\n" );
printf( "Number of runs: %d\n" , nruns);
printf( "Original code average time: %.6f sec.\n" , torig);
printf( "Optimized code average time: %.6f sec.\n" , topt);
printf( "Speedup: %.2f\n" , torig / topt);
}
return 0;
}


* This source code was highlighted with Source Code Highlighter .

Acceleration Rule One

It would be possible to say about optimization at the source code level: “It doesn’t matter what the code does — it’s important how it does it.” But the First Optimization Rule that I derived is somewhat contrary to this statement:

First Rule
Know your fundamentals and your algorithms

On first reading it becomes clear that the code performs some simple actions in a needlessly convoluted way.
So even a quick glance is enough to cut out the exponent function, which wastes a significant amount of work (a profiler shows it eats about 80% of the total running time). The exponentiation it performs inside function8 is easily computed incrementally instead.
In function8 itself one can discern the numerical calculation of an integral; we will discuss the scope for optimizing it later.
The difference scheme used in function4 does not gain much from rewriting the code, but by moving the accumulation of gosa into a separate loop you can still speed it up.
The matrix multiplication in function3 can be accelerated in many different ways, but at the first stage it is enough to notice that the two multiplied matrices are absolutely identical, which halves the memory this function consumes. You could even google an elegant algorithm for squaring a matrix. I am not even talking about Strassen, Coppersmith–Winograd and other exotica.

Acceleration, Rule Two

I do not know if the author of the code deliberately crammed so much redundancy into it, but the main part of the resulting acceleration is due to the Second Rule:

Second Rule
Destroy data structures that do not carry meaning

The rule, by the way, is excellent, and it works not only for optimization, but also for readability, and for better compilability. Lyrically quoting one remarkable woman, "there is no point in using recursion where it does not make sense to use it."
In this sense, function4 is overloaded downright indecently. Noticing with horror that the four-dimensional arrays x, y and z are initialized with constants, destroy them immediately. wrk1 and bnd can follow. I am not even talking about such trifles as mimax, iCount, ss and other excesses. As a result, the code becomes much shorter and more aesthetic.
function6 and function7 are completely eliminated, the remnants of useful actions will not load function4 either logically or over time, therefore they are transferred to it without serious consequences. function5 suddenly self-destruct. In function3, as already mentioned, you can use the same source matrix, and call function2 once.
Already much easier to breathe, and the resulting speedup begins to inspire exploits.

Acceleration Rule Three

Third Rule
Minimize branch prediction errors

Now I will explain everything. This is a branch optimization technique. The compiled loop or branch statement looks like this (very simplistic):
1 mov eax, ebx
2 jne $a
3 mov ecx, ebx
4 jmp b
5 a: ...
6 b: ...

Imagine what is happening in the pipeline at this moment. Instructions are processed in several stages (Fetch - Decode - Execute - Write-back), and after instruction 2 is loaded the branch prediction unit must act, because we do not yet know the result of that operation and cannot choose the next instruction.
What happens when the branch predictor "makes a mistake"? That's right — the pipeline is flushed. And that is overhead. We can reduce the number of branch mispredictions by reducing the number of branches executed.
How?
I will explain the last: if instead
for (i = 0; i < n; i++) {
sum += p[i];
}

to write
for (i = 0; i < n; i+=4) {
sum += p[i];
sum += p[i + 1];
sum += p[i + 2];
sum += p[i + 3];
}

we reduce the number of branch predictions by a factor of four.

This technique has a pile of pitfalls — for example, the iteration count must be a multiple of the unrolling factor, the optimal unrolling length must be chosen, and so on. But in each individual case you can find a balance and grab a few extra percent of speedup in this relatively simple way.
An example of loop unrolling can be seen in the optimized version of function3.
It would also be nice to hoist the loop-invariant branches out of the loops — they are a real eyesore in function2 — but it is not so easy to get them out of there. Whoever implements that better than I did: well done.

To read: R. Gerber, A. Bill "Software Optimization: A Collection of Recipes . " 2010, Intel

Acceleration Rule Four

Fourth Rule
Effectively work with memory

In particular, with the cache. There is such a property of locality of references - the property of programs repeatedly / often refer to the same objects; and there is temporal localization and spatial localization. Caching is pre-empted, and on this basis it makes sense to use sequential addresses to avoid cache misses.
In the same matrix multiplication, the row is located in memory sequentially, and the column is scattered, and the known acceleration effect transposes the second matrix.
If we are already talking, let me give another example: if a structure (struct) does not fully fit into the cache, and contains an array within itself, it is better to use an array of structures instead of an array in the structure.
And yet, to write a portable program, do not bind to the size of the cache.
And many other subtleties. Read - Chris Kaspersky "Technique optimization programs. Efficient use of memory . 2003, BHV-Petersburg

Acceleration, Rule Fifth

What processor architecture do we have there?

Fifth Rule
Vectorize calculations where possible

It's time to use SSE, good, you can vectorize almost any code that works with arrays.
A small educational program.
SSE (x86) architecture:
  1. MMX - processing integer vectors (64-bit registers)
  2. SSE - work with real and integer vectors (128 bits)
Instructions can work with scalars as well as packed vectors. For example:
  mulss xmm1, xmm0 (scalar)     |   mulps xmm1, xmm0 (packed)
  XMM0:  4.0  3.0  2.0  1.0     |   XMM0:  4.0  3.0  2.0  1.0
                          *     |           *    *    *    *
  XMM1:  5.0  5.0  5.0  5.0     |   XMM1:  5.0  5.0  5.0  5.0
                          =     |           =    =    =    =
  XMM1:  4.0  3.0  2.0  5.0     |   XMM1: 20.0 15.0 10.0  5.0

SSE instructions can copy, arithmetic, compare, bitwise operations, and even any tricky transformations.

How to use them? There are assembler vector instructions, there are special C++ classes, and there is, after all, automatic vectorization — the most portable option. I chose to use intrinsics.
Intrinsics is a set of functions and data types supported by the compiler to provide high-level access to SSE instructions.
Connect xmmintrin.h, write vectorized code, compile with -msse3 flag. The compiler distributes XMM # registers, schedules commands and addressing modes. Convenient, intuitively simple.
For example, add four float numbers to one instruction:
#include <xmmintrin.h>
/*
 * c = a + b for one 4-lane float vector using SSE intrinsics.
 * All three pointers must point to 16-byte-aligned storage:
 * _mm_load_ps/_mm_store_ps fault on unaligned addresses.
 *
 * Fix: the SSE vector type is __m128, not the misspelled __mm128,
 * which does not exist and would not compile.
 */
void add( float *a, float *b, float *c)
{
    __m128 t0, t1;
    t0 = _mm_load_ps(a);
    t1 = _mm_load_ps(b);
    t0 = _mm_add_ps(t0, t1);
    _mm_store_ps(c, t0);
}

In this particular case, the matrix multiplication is rewritten with intrinsics (and becomes blocked at the same time). function8 could in principle also be vectorized, but it is a special case, so I ask for your patience for now.
At this stage, the code begins to dream, and each operator cut from the thick cycle gives a noticeable performance jump.

Acceleration Rule Six

Look at the boundary conditions. Replace data structures with simpler ones, where possible. To reduce calculations to problems for which solution there are fast algorithms.

Sixth Rule
Use your head

In the driver, the results of the matrix multiplication are compared to TOLERANCE, while the remaining two numbers are compared only to a tenth. By trial and error, it turns out that in function8 two thirds of the interval is enough to reach the required accuracy, and the first three million iterations can be discarded. Moreover, it is enough to take only every six-to-seven-thousandth iteration into account.
Get
/*
 * Approximate version of function8: samples only the last two thirds
 * of the integration interval, and only every `step`-th point,
 * exploiting the driver's loose 1e-1 tolerance for this value.
 *
 * NOTE(review): the coefficient array `a` is ignored; the inner loop
 * uses j itself as the coefficient, which matches only the driver's
 * A[j] = (float)j initialization — verify against the caller.
 */
double function8( float *a, float p)
{
int i, j, val, step;
double h, x, z;
double s = p;

h = 1.0 / VAL;
/* Skip the first third of the interval; sample ~500 points after. */
val = 0.33 * VAL;
step = val / 500;
for (i = val; i <= VAL; i += step) {
x = h * i;
z = 1.0;
/* Incremental power: z == x^j after j iterations. */
for (j = 1; j < ORDER; j++) {
z *= x;
s += j * z;
}
}
/* Scale by the enlarged step width. */
return s * h * step;
}

and rejoice. Like? I do not. For you can generally get rid of the operation of exponentiation.
/*
 * Closed-form replacement for the integral: each polynomial term
 * a[j]*x^j integrated over (0, 1) contributes a[j]/(j+1); the p/VAL
 * term accounts for the accumulator seed of the original version.
 */
double function8( float *a, float p)
{
    unsigned int idx = 0;
    double total = p / VAL;

    while (idx < ORDER) {
        total += (a[idx] / (idx + 1));
        idx++;
    }
    return total;
}

It turns out the complexity of O (ORDER) - compared to the original O (VAL) * O (ORDER ^ 2). The same result can be achieved by using Bernoulli numbers (which are not so trivial, but you can still notice by writing out the algorithm manually).
You can pay attention to the features of filling the squared matrix and see the zeros in the even rows.
You can replace three-dimensional arrays on one-dimensional, addressing them like:
m1 = (YY - 1) * (ZZ - 1); m2 = (ZZ - 1); p [i * m1 + j * m2 + k]. This small trick allows you to use memset to populate multi-dimensional matrices.
You can and even need to use __inline for small functions.

And further. Since in this case, an eight-core machine is available, it is impossible not to use OpenMP. Parallelization of function3, function4 and function8 does not even require much imagination. A little less obvious is the possibility of using parallel sections in function1 (for function3 and function4 are independent of data).

For dessert — the optimized version of the program. It does not claim to be ideal, but it honestly delivers a speedup of several hundred times.
/*
* contestopt.c:
*/

#include <sys/time.h>
#include <omp.h>
#include "contest.h"

void function3( double a[][NUM], double c[][NUM]);
void function2( double a[][NUM]);
double function4(unsigned int second);

static double p[XX][YY][ZZ] __attribute__ ((aligned(64)));
static double wrk2[XX][YY][ZZ] __attribute__ ((aligned(64)));
static double a[NUM][NUM] __attribute__ ((aligned(64)));

/*
 * Optimized entry point: the matrix squaring (function3) and the
 * stencil benchmark (function4) are data-independent, so they run
 * concurrently as two OpenMP sections.
 */
void function1( double result[][NUM], double gos[2], unsigned int second)
{
#pragma omp parallel sections
{
#pragma omp section
function3(a,result);

#pragma omp section
gos[0] = function4(second);
}
}

/*
 * Blocked matrix squaring: result += a * a, computed in BS x BS tiles.
 * The even-NUM path uses SSE2 double-pair intrinsics (aligned loads —
 * the static arrays are declared 64-byte aligned); the odd-NUM path is
 * the same blocking in scalar form.  Trailing loops handle the
 * NUM % 8 fringe that the tiles do not cover.
 *
 * NOTE(review): __m128d and the _mm_* intrinsics need <emmintrin.h> —
 * presumably pulled in via contest.h; verify.  The inner
 * `if (!(NUM & 1))` repeats the outer compile-time-constant test and
 * is always true on this path.
 */
void function3( double a[][NUM], double result[][NUM])
{
int i, i2, j, j2, k, k2;
double *restrict rres;
double *restrict rmul1;
double *restrict rmul2;
unsigned int BS = 8;
int remainder = NUM % 8;
int limit = NUM - remainder;

function2(a);
if (!(NUM & 1)) {
#pragma omp for private (k, j, i, i2, j2, k2, rres, rmul1, rmul2)
for (i = 0; i < limit; i += BS)
for (j = 0; j < limit; j += BS)
for (k = 0; k < limit; k += BS)
for (i2 = 0, rres = &result[i][j], rmul1 = &a[i][k]; i2 < BS;
++i2, rres += NUM, rmul1 += NUM) {
_mm_prefetch (&rmul1[8], _MM_HINT_NTA);
if (!(NUM & 1))
for (k2 = 0, rmul2 = &a[k][j]; k2 < BS; ++k2, rmul2 += NUM) {
/* Broadcast one multiplier into both lanes, then do four
   unrolled multiply-accumulate steps covering 8 doubles. */
__m128d m1d = _mm_load_sd (&rmul1[k2]);
m1d = _mm_unpacklo_pd (m1d, m1d);
j2 = 0;
__m128d m2 = _mm_load_pd (&rmul2[j2]);
__m128d r2 = _mm_load_pd (&rres[j2]);
_mm_store_pd (&rres[j2],
_mm_add_pd (_mm_mul_pd (m2, m1d), r2));
j2 +=2;
m2 = _mm_load_pd (&rmul2[j2]);
r2 = _mm_load_pd (&rres[j2]);
_mm_store_pd (&rres[j2],
_mm_add_pd (_mm_mul_pd (m2, m1d), r2));
j2 +=2;
m2 = _mm_load_pd (&rmul2[j2]);
r2 = _mm_load_pd (&rres[j2]);
_mm_store_pd (&rres[j2],
_mm_add_pd (_mm_mul_pd (m2, m1d), r2));
j2 +=2;
m2 = _mm_load_pd (&rmul2[j2]);
r2 = _mm_load_pd (&rres[j2]);
_mm_store_pd (&rres[j2],
_mm_add_pd (_mm_mul_pd (m2, m1d), r2));
}
}
} else {
/* Scalar fallback: identical blocking without intrinsics. */
#pragma omp for private (k, j, i, i2, j2, k2, rres, rmul1, rmul2)
for (i = 0; i < limit; i += BS)
for (j = 0; j < limit; j += BS)
for (k = 0; k < limit; k += BS)
for (i2 = 0, rres = &result[i][j], rmul1 = &a[i][k]; i2 < BS;
++i2, rres += NUM, rmul1 += NUM) {
_mm_prefetch (&rmul1[8], _MM_HINT_NTA);
for (k2 = 0, rmul2 = &a[k][j]; k2 < BS; ++k2, rmul2 += NUM) {
for (j2 = 0; j2 < BS; j2++ ) {
rres[j2] += rmul1[k2] * rmul2[j2];
}
}
}
}
if (remainder) {
/* Fringe rows/columns beyond `limit`, computed in scalar form. */
#pragma omp for private (i,j,k)
for (i = 0; i < limit; ++i)
for (k = NUM - remainder; k < NUM; ++k)
for (j = 0; j < limit; ++j)
result[i][j] += a[i][k] * a[k][j];
#pragma omp for private (i,j,k)
for (i = limit; i < NUM; ++i)
for (k = 0; k < NUM; ++k)
for (j = 0; j < NUM; ++j)
result[i][j] += a[i][k] * a[k][j];
#pragma omp for private (i,j,k)
for (i = 0; i < limit; ++i)
for (k = 0; k < NUM; ++k)
for (j = limit; j < NUM; ++j)
result[i][j] += a[i][k] * a[k][j];
}
}

/*
 * Same sequence generator as the original function2, with row 0
 * peeled out of the main loop so the per-element seed check
 * `!(i == 0 && (j == 0 || j == 1))` disappears from the hot path.
 */
void function2( double a[][NUM])
{
int i, j ;
double first = 0.0;
double second = 1.0;

/* ICC-specific alignment hint — verify how the build defines it for
   GCC (no-op or macro). */
__assume_aligned(a, 64);

a[0][0]=first;
a[0][1]=second;

/* Row 0: starts at j = 2 because a[0][0]/a[0][1] are the seeds. */
for (j = 2; j < NUM; j++) {
a[0][j] = first + second;
first = second;
second = a[0][j];
if ( j%20 == 0 && j != 0) {
first = first * (j + 1) / (NUM);
second = second * (j + 1) / (NUM);
}
}
first = NUM / first;
second = NUM / second;
/* Remaining rows: recurrence identical to the original. */
for (i = 1; i < NUM; i++) {
for (j = 0; j < NUM; j++) {
a[i][j] = first + second;
first = second;
second = a[i][j];
if ( j%20 == 0 && j != 0 ) {
first = first * (j + 1) / (NUM);
second = second * (j + 1) / (NUM);
}
}
first = ((i + 1) * NUM) / first;
second = ((i + 1) * NUM) / second;
}
}

/*
 * Initializes the static grid p with the same (k/kmax)^2 profile as
 * the original: compute one k-column once, then replicate it.
 *
 * NOTE(review): the copy loop starts at k = 1, so p[i][j][0] keeps
 * the zero from static initialization — the same value the formula
 * would produce.  `second` is unused here; the PRNG seeding moved
 * into function4.
 */
__inline void function6(unsigned int second)
{
int imax = XX - 1;
int jmax = YY - 1;
int kmax = ZZ - 1;
int i, j, k;

/* Despite the name, this is kmax squared. */
double kmaxDoubled = kmax * kmax;
for (k = 0; k < kmax; k++)
p[0][0][k] = (k * k) / kmaxDoubled;
for (i = 0; i < imax; i++)
for (j = 0; j < jmax; j++)
for (k = 1; k < kmax; k++)
p[i][j][k] = p[0][0][k];
}

/*
 * Optimized stencil: the constant coefficient arrays x, y, z of the
 * original collapse into the literals 1.0 and the single random
 * `val`, and bnd/wrk1 (all ones / all zeros) disappear entirely.
 * nn - 1 update sweeps run first; the residual `gosa` of the final
 * sweep is then accumulated in a separate pass, as the article
 * describes.
 */
double function4(unsigned int second)
{
int nn = NN;
double gosa = 0.0, val;
int imax = XX - 2;
int jmax = YY - 2;
int kmax = ZZ - 2;
int loop, i, j, k;

function6(second);
/* Reproduce the original's single random coefficient. */
srand((unsigned int )second);
val = ( double ) rand() / (10.0 * RAND_MAX);
for (loop = 1; loop < nn; loop++) {
for (k = 1; k < kmax; k++)
for (j = 1; j < jmax; j++)
for (i = 1; i < imax; i++) {
/* Neighborhood sum with coefficients folded to 1.0 / val. */
wrk2[i][j][k] = 0.1 * ( p[i + 1][j][k] + p[i][j][k]
+ val * (p[i + 1][j + 1][k] - p[i + 1][j - 1][k]
- p[i - 1][j + 1][k] + p[i - 1][j - 1][k]
+ p[i][j + 1][k + 1] - p[i][j - 1][k + 1]
- p[i][j + 1][k - 1] + p[i][j - 1][k - 1]
+ p[i + 1][j][k + 1] - p[i - 1][j][k + 1]
- p[i + 1][j][k - 1] + p[i - 1][j][k - 1])
+ p[i - 1][j][k] + p[i][j - 1][k] + p[i][j][k - 1]);
}
/* Apply the increments computed above. */
for (i = 1; i < imax; i++)
for (j = 1; j < jmax; j++)
for (k = 1; k < kmax; k++) p[i][j][k] += wrk2[i][j][k];
}
/* Final pass: accumulate the residual only, without updating p. */
for (k = 1; k < kmax; k++)
for (j = 1; j < jmax; j++)
for (i = 1; i < imax; i++) {
gosa += p[i + 1][j][k] + p[i][j][k]
+ val * (p[i + 1][j + 1][k] - p[i + 1][j - 1][k]
- p[i - 1][j + 1][k] + p[i - 1][j - 1][k]
+ p[i][j + 1][k + 1] - p[i][j - 1][k + 1]
- p[i][j + 1][k - 1] + p[i][j - 1][k - 1]
+ p[i + 1][j][k + 1] - p[i - 1][j][k + 1]
- p[i + 1][j][k - 1] + p[i - 1][j][k - 1])
+ p[i - 1][j][k] + p[i][j - 1][k] + p[i][j][k - 1];
}
return gosa;
}

/*
 * Closed-form replacement for the original integral (each a[j]*x^j
 * term integrated over (0,1) gives a[j]/(j+1)), computed as an
 * ORDER-term sum with an OpenMP reduction; p/VAL accounts for the
 * accumulator seed of the original.
 *
 * NOTE(review): an unsigned loop variable in `omp for` is only valid
 * from OpenMP 3.0 on — verify against the compiler used.
 */
double function8( float *a, float p)
{
unsigned int j;
double s = 0.0;

#pragma omp parallel
{
#pragma omp for private (j) reduction (+:s)
for (j = 0; j < ORDER; j++) {
s += (a[j] /(j + 1));
}
}
return (s + p / VAL);
}


* This source code was highlighted with Source Code Highlighter .

Here is such a fun and very useful practical example. Let's summarize the code optimization scheme:
  1. Algorithmic optimization
  2. Architecturally independent optimization:
    • I / O to parallel flow
    • paralleling
    • efficient memory management, efficient caching
  3. Architectural-oriented optimization:
    • vector instructions
    • minimizing the number of branch prediction errors
    • library functions from processor manufacturers
    • making calculations on a video card
    • etc.
And that was still interesting. The given driver.c program contains a vulnerability, using which, you can get acceleration tens of thousands of times, especially without straining. Although it will already be an exploit.

Thanks for attention.

Source: https://habr.com/ru/post/111021/


All Articles