void Algorithm_1(const uint32_t N, const std::vector<uint64_t>& powers, const uint32_t from, const uint32_t to) { for (uint32_t a = from; a < to; ++a) { for (uint32_t b = 1; b < a; ++b) { for (uint32_t c = 1; c < b; ++c) { for (uint32_t d = 1; d < c; ++d) { const uint64_t sum = powers[a] + powers[b] + powers[c] + powers[d]; if (std::binary_search(std::begin(powers), std::end(powers), sum)) { const auto it = std::lower_bound(std::begin(powers), std::end(powers), sum); uint32_t e = static_cast<uint32_t>(std::distance(std::begin(powers), it)); std::cout << a << " " << b << " " << c << " " << d << " " << e << "\n"; } } } } } }
int blocks_x = N; int threads = std::min(1024, static_cast<int>(N)); int blocks_y = (N + threads - 1) / threads; dim3 grid(blocks_x, blocks_y); NaiveGPUKernel<<<grid, threads>>>(N);
const int a = blockIdx.x + 1; const int b = threadIdx.x + blockIdx.y * blockDim.x + 1;
__constant__ uint64_t gpu_powers[8192]; inline __device__ int gpu_binary_search(const uint32_t N, const uint64_t search) { uint32_t l = 0; uint32_t r = elements_count - 1; uint32_t m; while (l <= r) { m = (l + r) / 2; if (search < gpu_powers[m]) r = m - 1; else if (search > gpu_powers[m]) l = m + 1; else return l; } return -1; } __global__ void NaiveGPUKernel(const uint32_t N) { const int a = blockIdx.x + 1; const int b = threadIdx.x + blockIdx.y * blockDim.x + 1; if (b >= a) return; for (uint32_t c = 1; c < b; ++c) { for (uint32_t d = 1; d < c; ++d) { const uint64_t sum = gpu_powers[a] + gpu_powers[b] + gpu_powers[c] + gpu_powers[d]; const auto s = gpu_binary_search(N, sum); if (s > 0) { printf("%d %d %d %d %d\n", a, b, c, d, s); } } } }
N | CPU time, ms | CPU (4 threads) time, ms | GPU time, ms | |
---|---|---|---|---|
@antonkrechetov | 100 | 58.6 | 16.7 | 14.8 |
Base | 100 | 45.3 | 13.0 | 10.7 |
Basic optimization # 1 | 100 | 6.3 | 2.1 | 12.7 |
Basic Optimization # 2 | 100 | 1.4 | 0.7 | 0.8 |
@antonkrechetov | 250 | 2999.7 | 782.9 | 119.0 |
Base | 250 | 2055.6 | 550.1 | 90.9 |
Basic optimization # 1 | 250 | 227.2 | 60.3 | 109.2 |
Basic Optimization # 2 | 250 | 42.9 | 11.9 | 6.0 |
@antonkrechetov | 500 | 72034.2 | 19344.1 | 1725.83 |
Base | 500 | 38219.7 | 10200.8 | 976.7 |
Basic optimization # 1 | 500 | 3725.1 | 926.5 | 1140.36 |
Basic Optimization # 2 | 500 | 630.7 | 170.2 | 48.7 |
@antonkrechetov | 750 | 396566.0 | 105003.0 | 11521.2 |
Base | 750 | 218615.0 | 57639.2 | 5742.5 |
Basic optimization # 1 | 750 | 19082.7 | 4736.8 | 6402.1 |
Basic Optimization # 2 | 750 | 3272.0 | 846.9 | 222.9 |
Basic Optimization # 2 | 1000 | 10204.4 | 2730.3 | 1041.9 |
Basic Optimization # 2 | 1250 | 25133.1 | 6515.3 | 2445.5 |
Basic Optimization # 2 | 1500 | 51940.1 | 14005.0 | 4895.2 |
N | CPU time, ms | CPU (4 threads) time, ms | CPU + PGO time, ms | CPU + PGO (4 threads) time, ms | |
---|---|---|---|---|---|
@antonkrechetov | 100 | 58.6 | 16.7 | 55.3 | 15.0 |
Base | 100 | 45.3 | 13.0 | 42.2 | 12.1 |
Basic optimization # 1 | 100 | 6.3 | 2.1 | 5.2 | 1.9 |
Basic Optimization # 2 | 100 | 1.4 | 0.7 | 1.3 | 0.8 |
@antonkrechetov | 250 | 2999.7 | 782.9 | 2966.1 | 774.1 |
Base | 250 | 2055.6 | 550.1 | 2050.2 | 544.6 |
Basic optimization # 1 | 250 | 227.2 | 60.3 | 200.0 | 53.2 |
Basic Optimization # 2 | 250 | 42.9 | 11.9 | 40.4 | 11.4 |
@antonkrechetov | 500 | 72034.2 | 19344.1 | 68662.8 | 17959.0 |
Base | 500 | 38219.7 | 10200.8 | 38077.7 | 10034.0 |
Basic optimization # 1 | 500 | 3725.1 | 926.5 | 3132.9 | 822.2 |
Basic Optimization # 2 | 500 | 630.7 | 170.2 | 618.1 | 160.6 |
@antonkrechetov | 750 | 396566.0 | 105003.0 | 404692.0 | 103602.0 |
Base | 750 | 218615.0 | 57639.2 | 207975.0 | 54868.2 |
Basic optimization # 1 | 750 | 19082.7 | 4736.8 | 15496.4 | 4082.3 |
Basic Optimization # 2 | 750 | 3272.0 | 846.9 | 3093.8 | 812.7 |
Basic Optimization # 2 | 1000 | 10204.4 | 2730.3 | 9781.6 | 2565.9 |
Basic Optimization # 2 | 1250 | 25133.1 | 6515.3 | 23704.3 | 6244.1 |
Basic Optimization # 2 | 1500 | 51940.1 | 14005.0 | 48717.5 | 12793.5 |
__global__ void NaiveGPUKernel(const uint32_t N) { const int a = blockIdx.x + 1; const int b = threadIdx.x + blockIdx.y * blockDim.x + 1; if (b >= a) return; for (uint32_t c = 1; c < b; ++c) { for (uint32_t d = 1; d < c; ++d) { const uint64_t sum = gpu_powers[a] + gpu_powers[b] + gpu_powers[c] + gpu_powers[d]; const auto s = gpu_binary_search(N, sum); if (s > 0) { printf("%d %d %d %d %d\n", a, b, c, d, s); return; <- } } } }
Source: https://habr.com/ru/post/318066/
All Articles