/*
* contestopt.c:
*/
#include "contest.h"
void function3( double a[][NUM], double b[][NUM], double c[][NUM]);
void function2( double a[][NUM]);
double function4(unsigned int second);
double ***p;
double ****x;
double ****y;
double ****z;
double ***wrk1;
double ***wrk2;
double ***bnd;
static double a[NUM][NUM], b[NUM][NUM];
void function1( double result[][NUM], double gos[2], unsigned int second)
{
function2(a);
function2(b);
function3(a, b, result);
gos[0] = function4(second);
}
void function3( double a[][NUM], double b[][NUM], double result[][NUM])
{
int i, j, k;
for (i = 0; i < NUM; i++) {
for (j = 0; j < NUM; j++) {
for (k = 0; k < NUM; k++) {
result[i][j] = result[i][j] + a[i][k] * b[k][j];
}
}
}
}
void function2( double a[][NUM])
{
int i, j;
double first = 0.0;
double second = 1.0;
a[0][0] = first;
a[0][1] = second;
for (i = 0; i < NUM; i++) {
for (j = 0; j < NUM; j++) {
if (!(i == 0 && (j == 0 || j == 1))) {
a[i][j] = first + second;
first = second;
second = a[i][j];
}
if (j % 20 == 0 && j != 0) {
first = first * (j + 1) / (NUM);
second = second * (j + 1) / (NUM);
}
}
first = ((i + 1) * NUM) / first;
second = ((i + 1) * NUM) / second;
}
}
void function5( double ***A, double val)
{
int i, j, k;
for (i = 0; i < XX; i++)
for (j = 0; j < YY; j++)
for (k = 0; k < ZZ; k++)
A[i][j][k] = val;
}
void function6(unsigned int second)
{
int mimax = XX;
int mjmax = YY;
int mkmax = ZZ;
int imax = mimax - 1;
int jmax = mjmax - 1;
int kmax = mkmax - 1;
int i, j, k, l;
double val;
srand((unsigned int ) second);
p = ( double ***) malloc( sizeof ( double **) * XX);
wrk1 = ( double ***) malloc( sizeof ( double **) * XX);
wrk2 = ( double ***) malloc( sizeof ( double **) * XX);
bnd = ( double ***) malloc( sizeof ( double **) * XX);
x = ( double ****) malloc( sizeof ( double ***) * XX);
y = ( double ****) malloc( sizeof ( double ***) * XX);
z = ( double ****) malloc( sizeof ( double ***) * XX);
for (i = 0; i < XX; i++) {
p[i] = ( double **) malloc( sizeof ( double *) * YY);
wrk1[i] = ( double **) malloc( sizeof ( double *) * YY);
wrk2[i] = ( double **) malloc( sizeof ( double *) * YY);
bnd[i] = ( double **) malloc( sizeof ( double *) * YY);
x[i] = ( double ***) malloc( sizeof ( double **) * YY);
y[i] = ( double ***) malloc( sizeof ( double **) * YY);
z[i] = ( double ***) malloc( sizeof ( double **) * YY);
for (j = 0; j < YY; j++) {
p[i][j] = ( double *) malloc( sizeof ( double ) * ZZ);
wrk1[i][j] = ( double *) malloc( sizeof ( double ) * ZZ);
wrk2[i][j] = ( double *) malloc( sizeof ( double ) * ZZ);
bnd[i][j] = ( double *) malloc( sizeof ( double ) * ZZ);
x[i][j] = ( double **) malloc( sizeof ( double *) * ZZ);
y[i][j] = ( double **) malloc( sizeof ( double *) * ZZ);
z[i][j] = ( double **) malloc( sizeof ( double *) * ZZ);
for (k = 0; k < ZZ; k++) {
x[i][j][k] = ( double *) malloc( sizeof ( double ) * 4);
y[i][j][k] = ( double *) malloc( sizeof ( double ) * 3);
z[i][j][k] = ( double *) malloc( sizeof ( double ) * 3);
}
}
}
val = ( double ) rand() / (10.0 * RAND_MAX);
for (i = 0; i < XX; i++)
for (j = 0; j < YY; j++)
for (k = 0; k < ZZ; k++) {
for (l = 0; l < 3; l++) {
x[i][j][k][l] = 1.0;
y[i][j][k][l] = val;
z[i][j][k][l] = 1.0;
}
x[i][j][k][3] = 1.0;
}
function5(bnd, 1.0);
function5(wrk1, 0.0);
function5(wrk2, 0.0);
for (i = 0; i < imax; i++)
for (j = 0; j < jmax; j++)
for (k = 0; k < kmax; k++)
p[i][j][k] = ( double ) (k * k) / ( double ) (kmax * kmax);
}
void function7()
{
int i, j, k;
for (i = 0; i < XX; i++) {
for (j = 0; j < YY; j++) {
free(p[i][j]);
free(wrk1[i][j]);
free(wrk2[i][j]);
free(bnd[i][j]);
for (k = 0; k < ZZ; k++) {
free(x[i][j][k]);
free(y[i][j][k]);
free(z[i][j][k]);
}
free(x[i][j]);
free(y[i][j]);
free(z[i][j]);
}
free(p[i]);
free(wrk1[i]);
free(wrk2[i]);
free(bnd[i]);
free(x[i]);
free(y[i]);
free(z[i]);
}
free(x);
free(y);
free(z);
free(p);
free(wrk1);
free(wrk2);
free(bnd);
}
double function4(unsigned int second)
{
int nn = NN;
double gosa, s0, ss;
int mimax = XX;
int mjmax = YY;
int mkmax = ZZ;
int imax = mimax - 1;
int jmax = mjmax - 1;
int kmax = mkmax - 1;
int iCount, jCount, kCount, loop, i, j, k;
function6(second);
for (loop = 0; loop < nn; loop++) {
gosa = 0.0;
for (k = 1; k < kmax - 1; k++)
for (j = 1; j < jmax - 1; j++)
for (i = 1; i < imax - 1; i++) {
s0 = x[i][j][k][0] * p[i + 1][j][k]
+ x[i][j][k][1] * p[i][j][k]
+ x[i][j][k][2] * p[i][j][k]
+ y[i][j][k][0] * (p[i + 1][j + 1][k] - p[i + 1][j - 1][k]
- p[i - 1][j + 1][k] + p[i - 1][j - 1][k])
+ y[i][j][k][1] * (p[i][j + 1][k + 1] - p[i][j - 1][k + 1]
- p[i][j + 1][k - 1] + p[i][j - 1][k - 1])
+ y[i][j][k][2] * (p[i + 1][j][k + 1] - p[i - 1][j][k + 1]
- p[i + 1][j][k - 1] + p[i - 1][j][k - 1])
+ z[i][j][k][0] * p[i - 1][j][k]
+ z[i][j][k][1] * p[i][j - 1][k]
+ z[i][j][k][2] * p[i][j][k - 1] + wrk1[i][j][k];
ss = (s0 * x[i][j][k][3] - p[i][j][k]) * bnd[i][j][k];
gosa = gosa + ss;
wrk2[i][j][k] = p[i][j][k] + 0.1 * ss;
}
for (iCount = 1; iCount < imax - 1; iCount++)
for (jCount = 1; jCount < jmax - 1; jCount++)
for (kCount = 1; kCount < kmax - 1; kCount++)
p[iCount][jCount][kCount] = wrk2[iCount][jCount][kCount];
}
function7();
return gosa;
}
double exponent( int i, double x)
{
double z = 1.0;
int j;
for (j = 0; j < i; j++) {
z = z * x;
}
return z;
}
double function8( float *a, float p)
{
int i, j;
double h, x;
double s = p;
h = 1.0 / VAL;
for (i = 1; i <= VAL; i++) {
x = h * (i - 0.5);
for (j = 0; j < ORDER; j++) {
s = (a[j] * exponent(j, x)) + s;
}
}
return s * h;
}
* This source code was highlighted with Source Code Highlighter .
/*
* driver.c: Contest driver.
*/
#include <sys/time.h>
#include <stdio.h>
#include <stdlib.h>
#include <inttypes.h>
#include <time.h>
#include "contest.h"
#include "hpctimer.h"
double _RES[NUM][NUM] = { 0.0 };
double _GOS[2] = { 0.0, 0.0 };
double RES[NUM][NUM] = { 0.0 };
double GOS[2] = { 0.0, 0.0 };
float A[ORDER] = { 0.0 };
double function8( float *a, float p);
void function1( double result[][NUM], double gos[2], unsigned int second);
int is_results_eq( double _res[][NUM], double _gos[2], float _p,
double res[][NUM], double gos[2], float p)
{
int i, j;
if (fabsf(_p - p) > 1E-1) {
printf( "_p p\n" );
printf( "%f\n" , _p);
printf( "%f\n" , p);
return 0;
}
if (fabs(_gos[0] - gos[0]) > 1E-1) {
printf( "_gos gos\n" );
printf( "%f\n" , _gos[0]);
printf( "%f\n" , gos[0]);
return 0;
}
for (i = 0; i < NUM; i++)
for (j = 0; j < NUM; j++) {
if (fabs(_RES[i][j] - RES[i][j]) > TOLERANCE) {
printf( "[%d, %d] =\n%f\n%f\n" , i, j, _RES[i][j], RES[i][j]);
return 0;
}
}
return 1;
}
int main( int argc, char **argv)
{
unsigned int second;
unsigned int i, j;
float p = 9.0;
double _res, res;
int nruns = 3;
hpctimer_time_t t0, t1;
double torig = 0.0, topt = 0.0;
srand(time(NULL));
second = 1219304613 + (1000 - (2000.0 * ( double ) rand() / ( double ) (RAND_MAX + 1.0)));
for (i = 0; i < ORDER; i++) {
A[i] = ( float )i;
}
for (i = 0; i < NUM; i++) {
for (j = 0; j < NUM; j++) {
_RES[i][j] = 0.0;
RES[i][j] = 0.0;
}
}
/* Original code */
printf( "Executing original code ... " );
fflush(stdout);
t0 = hpctimer_gettime();
for (i = 0; i < nruns; i++) {
_function1(_RES, _GOS, second);
_res = _function8(A, p);
}
t1 = hpctimer_gettime();
torig = hpctimer_getdiff(t0, t1) / nruns;
printf( "done\n" );
fflush(stdout);
/* Optimized code */
printf( "Executing optimized code ... " );
fflush(stdout);
t0 = hpctimer_gettime();
for (i = 0; i < nruns; i++) {
function1(RES, GOS, second);
res = function8(A, p);
}
t1 = hpctimer_gettime();
topt = hpctimer_getdiff(t0, t1) / nruns;
printf( "done\n" );
fflush(stdout);
printf( "Checking results ... " );
fflush(stdout);
if (!is_results_eq(_RES, _GOS, _res, RES, GOS, res)) {
printf( "RESULTS ARE DIFFERENT!\n" );
} else {
printf( "PASSED\n" );
printf( "Number of runs: %d\n" , nruns);
printf( "Original code average time: %.6f sec.\n" , torig);
printf( "Optimized code average time: %.6f sec.\n" , topt);
printf( "Speedup: %.2f\n" , torig / topt);
}
return 0;
}
* This source code was highlighted with Source Code Highlighter .
1 mov eax, ebx
2 jne $a
3 mov ecx, ebx
4 jmp b
5 a: ...
6 b: ...
for (i = 0; i < n; i++) {
sum += p[i];
}
for (i = 0; i < n; i+=4) {
sum += p[i];
sum += p[i + 1];
sum += p[i + 2];
sum += p[i + 3];
}
mulss xmm1, xmm0 | mulps xmm1, xmm0 XMM0 4.0 3.0 2.0 1.0 | 4.0 3.0 2.0 1.0 * | * * * * XMM1 5.0 5.0 5.0 5.0 | 5.0 5.0 5.0 5.0 = | = = = = XMM1 4.0 3.0 2.0 5.0 | 20.0 15.0 10.0 5.0
#include <xmmintrin.h>
void add( float *a, float *b, float *c)
{
__mm128 t0, t1;
t0 = _mm_load_ps(a);
t1 = _mm_load_ps(b);
t0 = _mm_add_ps(t0, t1);
_mm_store_ps(c, t0);
}
double function8( float *a, float p)
{
int i, j, val, step;
double h, x, z;
double s = p;
h = 1.0 / VAL;
val = 0.33 * VAL;
step = val / 500;
for (i = val; i <= VAL; i += step) {
x = h * i;
z = 1.0;
for (j = 1; j < ORDER; j++) {
z *= x;
s += j * z;
}
}
return s * h * step;
}
double function8( float *a, float p)
{
unsigned int j;
double s = p / VAL;
for (j = 0; j < ORDER; j++) {
s += (a[j] / (j + 1));
}
return s;
}
/*
* contestopt.c:
*/
#include <sys/time.h>
#include <omp.h>
#include "contest.h"
void function3( double a[][NUM], double c[][NUM]);
void function2( double a[][NUM]);
double function4(unsigned int second);
static double p[XX][YY][ZZ] __attribute__ ((aligned(64)));
static double wrk2[XX][YY][ZZ] __attribute__ ((aligned(64)));
static double a[NUM][NUM] __attribute__ ((aligned(64)));
void function1( double result[][NUM], double gos[2], unsigned int second)
{
#pragma omp parallel sections
{
#pragma omp section
function3(a,result);
#pragma omp section
gos[0] = function4(second);
}
}
void function3( double a[][NUM], double result[][NUM])
{
int i, i2, j, j2, k, k2;
double *restrict rres;
double *restrict rmul1;
double *restrict rmul2;
unsigned int BS = 8;
int remainder = NUM % 8;
int limit = NUM - remainder;
function2(a);
if (!(NUM & 1)) {
#pragma omp for private (k, j, i, i2, j2, k2, rres, rmul1, rmul2)
for (i = 0; i < limit; i += BS)
for (j = 0; j < limit; j += BS)
for (k = 0; k < limit; k += BS)
for (i2 = 0, rres = &result[i][j], rmul1 = &a[i][k]; i2 < BS;
++i2, rres += NUM, rmul1 += NUM) {
_mm_prefetch (&rmul1[8], _MM_HINT_NTA);
if (!(NUM & 1))
for (k2 = 0, rmul2 = &a[k][j]; k2 < BS; ++k2, rmul2 += NUM) {
__m128d m1d = _mm_load_sd (&rmul1[k2]);
m1d = _mm_unpacklo_pd (m1d, m1d);
j2 = 0;
__m128d m2 = _mm_load_pd (&rmul2[j2]);
__m128d r2 = _mm_load_pd (&rres[j2]);
_mm_store_pd (&rres[j2],
_mm_add_pd (_mm_mul_pd (m2, m1d), r2));
j2 +=2;
m2 = _mm_load_pd (&rmul2[j2]);
r2 = _mm_load_pd (&rres[j2]);
_mm_store_pd (&rres[j2],
_mm_add_pd (_mm_mul_pd (m2, m1d), r2));
j2 +=2;
m2 = _mm_load_pd (&rmul2[j2]);
r2 = _mm_load_pd (&rres[j2]);
_mm_store_pd (&rres[j2],
_mm_add_pd (_mm_mul_pd (m2, m1d), r2));
j2 +=2;
m2 = _mm_load_pd (&rmul2[j2]);
r2 = _mm_load_pd (&rres[j2]);
_mm_store_pd (&rres[j2],
_mm_add_pd (_mm_mul_pd (m2, m1d), r2));
}
}
} else {
#pragma omp for private (k, j, i, i2, j2, k2, rres, rmul1, rmul2)
for (i = 0; i < limit; i += BS)
for (j = 0; j < limit; j += BS)
for (k = 0; k < limit; k += BS)
for (i2 = 0, rres = &result[i][j], rmul1 = &a[i][k]; i2 < BS;
++i2, rres += NUM, rmul1 += NUM) {
_mm_prefetch (&rmul1[8], _MM_HINT_NTA);
for (k2 = 0, rmul2 = &a[k][j]; k2 < BS; ++k2, rmul2 += NUM) {
for (j2 = 0; j2 < BS; j2++ ) {
rres[j2] += rmul1[k2] * rmul2[j2];
}
}
}
}
if (remainder) {
#pragma omp for private (i,j,k)
for (i = 0; i < limit; ++i)
for (k = NUM - remainder; k < NUM; ++k)
for (j = 0; j < limit; ++j)
result[i][j] += a[i][k] * a[k][j];
#pragma omp for private (i,j,k)
for (i = limit; i < NUM; ++i)
for (k = 0; k < NUM; ++k)
for (j = 0; j < NUM; ++j)
result[i][j] += a[i][k] * a[k][j];
#pragma omp for private (i,j,k)
for (i = 0; i < limit; ++i)
for (k = 0; k < NUM; ++k)
for (j = limit; j < NUM; ++j)
result[i][j] += a[i][k] * a[k][j];
}
}
void function2( double a[][NUM])
{
int i, j ;
double first = 0.0;
double second = 1.0;
__assume_aligned(a, 64);
a[0][0]=first;
a[0][1]=second;
for (j = 2; j < NUM; j++) {
a[0][j] = first + second;
first = second;
second = a[0][j];
if ( j%20 == 0 && j != 0) {
first = first * (j + 1) / (NUM);
second = second * (j + 1) / (NUM);
}
}
first = NUM / first;
second = NUM / second;
for (i = 1; i < NUM; i++) {
for (j = 0; j < NUM; j++) {
a[i][j] = first + second;
first = second;
second = a[i][j];
if ( j%20 == 0 && j != 0 ) {
first = first * (j + 1) / (NUM);
second = second * (j + 1) / (NUM);
}
}
first = ((i + 1) * NUM) / first;
second = ((i + 1) * NUM) / second;
}
}
__inline void function6(unsigned int second)
{
int imax = XX - 1;
int jmax = YY - 1;
int kmax = ZZ - 1;
int i, j, k;
double kmaxDoubled = kmax * kmax;
for (k = 0; k < kmax; k++)
p[0][0][k] = (k * k) / kmaxDoubled;
for (i = 0; i < imax; i++)
for (j = 0; j < jmax; j++)
for (k = 1; k < kmax; k++)
p[i][j][k] = p[0][0][k];
}
double function4(unsigned int second)
{
int nn = NN;
double gosa = 0.0, val;
int imax = XX - 2;
int jmax = YY - 2;
int kmax = ZZ - 2;
int loop, i, j, k;
function6(second);
srand((unsigned int )second);
val = ( double ) rand() / (10.0 * RAND_MAX);
for (loop = 1; loop < nn; loop++) {
for (k = 1; k < kmax; k++)
for (j = 1; j < jmax; j++)
for (i = 1; i < imax; i++) {
wrk2[i][j][k] = 0.1 * ( p[i + 1][j][k] + p[i][j][k]
+ val * (p[i + 1][j + 1][k] - p[i + 1][j - 1][k]
- p[i - 1][j + 1][k] + p[i - 1][j - 1][k]
+ p[i][j + 1][k + 1] - p[i][j - 1][k + 1]
- p[i][j + 1][k - 1] + p[i][j - 1][k - 1]
+ p[i + 1][j][k + 1] - p[i - 1][j][k + 1]
- p[i + 1][j][k - 1] + p[i - 1][j][k - 1])
+ p[i - 1][j][k] + p[i][j - 1][k] + p[i][j][k - 1]);
}
for (i = 1; i < imax; i++)
for (j = 1; j < jmax; j++)
for (k = 1; k < kmax; k++) p[i][j][k] += wrk2[i][j][k];
}
for (k = 1; k < kmax; k++)
for (j = 1; j < jmax; j++)
for (i = 1; i < imax; i++) {
gosa += p[i + 1][j][k] + p[i][j][k]
+ val * (p[i + 1][j + 1][k] - p[i + 1][j - 1][k]
- p[i - 1][j + 1][k] + p[i - 1][j - 1][k]
+ p[i][j + 1][k + 1] - p[i][j - 1][k + 1]
- p[i][j + 1][k - 1] + p[i][j - 1][k - 1]
+ p[i + 1][j][k + 1] - p[i - 1][j][k + 1]
- p[i + 1][j][k - 1] + p[i - 1][j][k - 1])
+ p[i - 1][j][k] + p[i][j - 1][k] + p[i][j][k - 1];
}
return gosa;
}
double function8( float *a, float p)
{
unsigned int j;
double s = 0.0;
#pragma omp parallel
{
#pragma omp for private (j) reduction (+:s)
for (j = 0; j < ORDER; j++) {
s += (a[j] /(j + 1));
}
}
return (s + p / VAL);
}
* This source code was highlighted with Source Code Highlighter .
Source: https://habr.com/ru/post/111021/
All Articles