Actual source code: ex1.raja.cxx
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
// Copyright (c) 2016-21, Lawrence Livermore National Security, LLC
// and RAJA project contributors. See the RAJA/COPYRIGHT file for details.
//
// SPDX-License-Identifier: (BSD-3-Clause)
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//

#include <cstdlib>
#include <cstdio>
#include <cstring>

#include <iostream>
#include <cmath>

#include "RAJA/RAJA.hpp"

#include "memoryManager.hpp"

/*
 * Jacobi Example
 *
 * ----[Details]--------------------
 * This code uses a five-point finite difference stencil
 * to discretize the following boundary value problem
 *
 * U_xx + U_yy = f on [0,1] x [0,1].
 *
 * The right-hand side is chosen to be
 * f = 2*x*(y-1)*(y-2*x+x*y+2)*exp(x-y).
 *
 * A structured grid is used to discretize the domain
 * [0,1] x [0,1]. Values inside the domain are computed
 * using the Jacobi method to solve the associated
 * linear system. The iteration is repeated until the l_2
 * norm of the difference between subsequent iterates
 * falls below a tolerance.
 *
 * The scheme is implemented by allocating two arrays
 * (I, Iold), both initialized to zero. The first set of
 * nested for loops applies one iteration of the Jacobi
 * scheme. The scheme is only applied to the interior
 * nodes.
 *
 * The second set of nested for loops is used to
 * update Iold and compute the l_2 norm of the
 * difference of the iterates.
 *
 * Computing the l_2 norm requires a reduction operation.
 * To simplify the reduction procedure, the RAJA API
 * provides thread-safe reduction variables.
 *
 * ----[RAJA Concepts]---------------
 * - RAJA::kernel nested loops
 * - RAJA reductions (ReduceSum, ReduceMax)
 *
 */
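
/*
 * For reference, the Jacobi update used in every variant below follows
 * from the standard five-point stencil. At an interior node (m, n) with
 * spacing h,
 *
 *   ( U[m-1,n] + U[m+1,n] + U[m,n-1] + U[m,n+1] - 4*U[m,n] ) / h^2 = f(x_m, y_n)
 *
 * and solving for U[m,n] gives the update
 *
 *   I[m,n] = 0.25 * ( -h^2 * f(x_m, y_n)
 *                     + Iold[m-1,n] + Iold[m+1,n]
 *                     + Iold[m,n-1] + Iold[m,n+1] ),
 *
 * which is exactly the line I[id] = 0.25 * (-f + ...) in the loops below,
 * where f already carries the h^2 factor and id = n * (N + 2) + m.
 */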

/*
 * ----[Constant Values]-----
 * CUDA_BLOCK_SIZE - Number of threads per CUDA thread block
 *
 * HIP_BLOCK_SIZE  - Number of threads per HIP thread block
 */
#if defined(RAJA_ENABLE_CUDA)
const int CUDA_BLOCK_SIZE = 256;
#endif

#if defined(RAJA_ENABLE_HIP)
const int HIP_BLOCK_SIZE = 256;
#endif

//
// Struct to hold grid info
// o - Origin in a cartesian dimension
// h - Spacing between grid points
// n - Number of grid points
//
struct grid_s {
  double o, h;
  int n;
};

//
// ----[Functions]---------
// solution   - Function for the analytic solution
// computeErr - Displays the maximum error in the solution
//
double solution(double x, double y);
void computeErr(double *I, grid_s grid);

int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
{

  std::cout << "Jacobi Example" << std::endl;

  /*
   * ----[Solver Parameters]------------
   * tol       - Method terminates once the norm is less than tol
   * N         - Number of unknown gridpoints per cartesian dimension
   * NN        - Total number of gridpoints on the grid
   * maxIter   - Maximum number of iterations to be taken
   *
   * resI2     - Squared l_2 norm of the difference between iterates (residual)
   * iteration - Iteration number
   * gridx     - Struct with grid information for a cartesian dimension
   */
  double tol = 1e-10;

  int N = 50;
  int NN = (N + 2) * (N + 2);
  int maxIter = 100000;

  double resI2;
  int iteration;

  grid_s gridx;
  gridx.o = 0.0;
  gridx.h = 1.0 / (N + 1.0);
  gridx.n = N + 2;
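
  //
  // Memory layout note: I and Iold hold the full (N + 2) x (N + 2) grid in
  // row-major order, boundary nodes included, so node (m, n) maps to the
  // flat index id = n * (N + 2) + m. The Jacobi loops below only update the
  // interior indices m, n = 1..N; the boundary entries keep their initial
  // value of zero, and the neighbor offsets id +/- 1 and id +/- (N + 2)
  // reach the west/east and south/north stencil points.
  //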

  //
  // I, Iold - Hold iterates of the Jacobi method
  //
  double *I    = memoryManager::allocate<double>(NN);
  double *Iold = memoryManager::allocate<double>(NN);

  memset(I, 0, NN * sizeof(double));
  memset(Iold, 0, NN * sizeof(double));

  printf("Standard C++ Loop \n");
  resI2 = 1;
  iteration = 0;

  while (resI2 > tol * tol) {

    //
    // Jacobi Iteration
    //
    for (int n = 1; n <= N; ++n) {
      for (int m = 1; m <= N; ++m) {

        double x = gridx.o + m * gridx.h;
        double y = gridx.o + n * gridx.h;

        double f = gridx.h * gridx.h
                   * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y));

        int id = n * (N + 2) + m;
        I[id] = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1]
                        + Iold[id + 1]);
      }
    }

    //
    // Compute residual and update Iold
    //
    resI2 = 0.0;
    for (int k = 0; k < NN; k++) {
      resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]);
      Iold[k] = I[k];
    }

    if (iteration > maxIter) {
      printf("Standard C++ Loop - Maxed out on iterations \n");
      exit(-1);
    }

    iteration++;
  }
  computeErr(I, gridx);
  printf("No of iterations: %d \n \n", iteration);

  //
  // RAJA loop calls may be shortened by predefining policies
  //
  RAJA::RangeSegment gridRange(0, NN);
  RAJA::RangeSegment jacobiRange(1, (N + 1));

  using jacobiSeqNestedPolicy = RAJA::KernelPolicy<
    RAJA::statement::For<1, RAJA::seq_exec,
      RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0>> > >;
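
  //
  // In this kernel policy, statement::For<1, ...> iterates over the second
  // entry of the index tuple (n, the outer loop) and statement::For<0, ...>
  // over the first entry (m, the inner loop); statement::Lambda<0> invokes
  // the lambda passed to RAJA::kernel as the innermost loop body.
  //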

  printf("RAJA: Sequential Policy - Nested ForallN \n");
  resI2 = 1;
  iteration = 0;
  memset(I, 0, NN * sizeof(double));
  memset(Iold, 0, NN * sizeof(double));

  /*
   * Sequential Jacobi Iteration.
   *
   * Note that a RAJA ReduceSum object is used to accumulate the sum
   * for the residual. Since the loop is run sequentially, this is
   * not strictly necessary. It is done here for consistency and
   * comparison with other RAJA variants in this example.
   */
  while (resI2 > tol * tol) {

    RAJA::kernel<jacobiSeqNestedPolicy>(RAJA::make_tuple(jacobiRange, jacobiRange),
      [=](RAJA::Index_type m, RAJA::Index_type n) {

        double x = gridx.o + m * gridx.h;
        double y = gridx.o + n * gridx.h;

        double f = gridx.h * gridx.h
                   * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y));

        int id = n * (N + 2) + m;
        I[id] = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1]
                        + Iold[id + 1]);
      });

    RAJA::ReduceSum<RAJA::seq_reduce, double> RAJA_resI2(0.0);
    RAJA::forall<RAJA::seq_exec>(
      gridRange, [=](RAJA::Index_type k) {

        RAJA_resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]);
        Iold[k] = I[k];
      });

    resI2 = RAJA_resI2;
    if (iteration > maxIter) {
      printf("Jacobi: Sequential - Maxed out on iterations! \n");
      exit(-1);
    }
    iteration++;
  }
  computeErr(I, gridx);
  printf("No of iterations: %d \n \n", iteration);

#if defined(RAJA_ENABLE_OPENMP)
  printf("RAJA: OpenMP Policy - Nested ForallN \n");
  resI2 = 1;
  iteration = 0;
  memset(I, 0, NN * sizeof(double));
  memset(Iold, 0, NN * sizeof(double));

  /*
   * OpenMP parallel Jacobi Iteration.
   *
   * ----[RAJA Policies]-----------
   * RAJA::omp_parallel_for_exec -
   * parallelizes a loop over OpenMP threads
   *
   * Note that the OpenMP RAJA ReduceSum object performs the reduction
   * operation for the residual in a thread-safe manner.
   */

  using jacobiOmpNestedPolicy = RAJA::KernelPolicy<
    RAJA::statement::For<1, RAJA::omp_parallel_for_exec,
      RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0> > > >;
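
  //
  // With this policy, statement::For<1, RAJA::omp_parallel_for_exec>
  // distributes the outer (n) loop across OpenMP threads and each thread
  // runs the inner (m) loop sequentially. The sweep only reads Iold and
  // writes distinct entries of I, so no extra synchronization is needed
  // inside the kernel; only the residual sum requires the thread-safe
  // ReduceSum below.
  //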

  while (resI2 > tol * tol) {

    RAJA::kernel<jacobiOmpNestedPolicy>(RAJA::make_tuple(jacobiRange, jacobiRange),
      [=](RAJA::Index_type m, RAJA::Index_type n) {

        double x = gridx.o + m * gridx.h;
        double y = gridx.o + n * gridx.h;

        double f = gridx.h * gridx.h *
                   (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y));

        int id = n * (N + 2) + m;
        I[id] = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] +
                        Iold[id - 1] + Iold[id + 1]);
      });

    RAJA::ReduceSum<RAJA::omp_reduce, double> RAJA_resI2(0.0);

    RAJA::forall<RAJA::omp_parallel_for_exec>(gridRange,
      [=](RAJA::Index_type k) {

        RAJA_resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]);
        Iold[k] = I[k];
      });

    resI2 = RAJA_resI2;
    if (iteration > maxIter) {
      printf("Jacobi: OpenMP - Maxed out on iterations! \n");
      exit(-1);
    }
    iteration++;
  }
  computeErr(I, gridx);
  printf("No of iterations: %d \n \n", iteration);
#endif

#if defined(RAJA_ENABLE_CUDA)
  /*
   * CUDA Jacobi Iteration.
   *
   * ----[RAJA Policies]-----------
   * RAJA::statement::Tile with RAJA::cuda_block_y_loop / RAJA::cuda_block_x_loop
   * and RAJA::cuda_thread_y_direct / RAJA::cuda_thread_x_direct -
   * define the mapping of loop iterations to GPU thread blocks and threads
   *
   * Note that the CUDA RAJA ReduceSum object performs the reduction
   * operation for the residual in a thread-safe manner on the GPU.
   */

  printf("RAJA: CUDA Policy - Nested ForallN \n");

  using jacobiCUDANestedPolicy = RAJA::KernelPolicy<
    RAJA::statement::CudaKernel<
      RAJA::statement::Tile<1, RAJA::tile_fixed<32>, RAJA::cuda_block_y_loop,
        RAJA::statement::Tile<0, RAJA::tile_fixed<32>, RAJA::cuda_block_x_loop,
          RAJA::statement::For<1, RAJA::cuda_thread_y_direct,
            RAJA::statement::For<0, RAJA::cuda_thread_x_direct,
              RAJA::statement::Lambda<0>
            >
          >
        >
      >
    > >;
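
  //
  // This policy launches one CUDA kernel: the Tile statements split the
  // n and m ranges into 32 x 32 tiles, each tile is assigned to a thread
  // block (cuda_block_y_loop / cuda_block_x_loop), and within a tile each
  // iteration maps directly to one thread (cuda_thread_y_direct /
  // cuda_thread_x_direct), i.e. 32 x 32 threads per block.
  //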

  resI2 = 1;
  iteration = 0;
  memset(I, 0, NN * sizeof(double));
  memset(Iold, 0, NN * sizeof(double));

  while (resI2 > tol * tol) {

    //
    // Jacobi Iteration
    //
    RAJA::kernel<jacobiCUDANestedPolicy>(
      RAJA::make_tuple(jacobiRange, jacobiRange),
      [=] RAJA_DEVICE (RAJA::Index_type m, RAJA::Index_type n) {

        double x = gridx.o + m * gridx.h;
        double y = gridx.o + n * gridx.h;

        double f = gridx.h * gridx.h
                   * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y));

        int id = n * (N + 2) + m;
        I[id] = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1]
                        + Iold[id + 1]);
      });

    //
    // Compute residual and update Iold
    //
    RAJA::ReduceSum<RAJA::cuda_reduce, double> RAJA_resI2(0.0);
    RAJA::forall<RAJA::cuda_exec<CUDA_BLOCK_SIZE>>(
      gridRange, [=] RAJA_DEVICE (RAJA::Index_type k) {

        RAJA_resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]);
        Iold[k] = I[k];
      });

    resI2 = RAJA_resI2;

    if (iteration > maxIter) {
      printf("RAJA: CUDA - Maxed out on iterations! \n");
      exit(-1);
    }
    iteration++;
  }
  cudaDeviceSynchronize();
  computeErr(I, gridx);
  printf("No of iterations: %d \n \n", iteration);
#endif

#if defined(RAJA_ENABLE_HIP)
  /*
   * HIP Jacobi Iteration.
   *
   * ----[RAJA Policies]-----------
   * RAJA::statement::Tile with RAJA::hip_block_y_loop / RAJA::hip_block_x_loop
   * and RAJA::hip_thread_y_direct / RAJA::hip_thread_x_direct -
   * define the mapping of loop iterations to GPU thread blocks and threads
   * (mirroring the CUDA policy above)
   *
   * Note that the HIP RAJA ReduceSum object performs the reduction
   * operation for the residual in a thread-safe manner on the GPU.
   */

  printf("RAJA: HIP Policy - Nested ForallN \n");

  using jacobiHIPNestedPolicy = RAJA::KernelPolicy<
    RAJA::statement::HipKernel<
      RAJA::statement::Tile<1, RAJA::tile_fixed<32>, RAJA::hip_block_y_loop,
        RAJA::statement::Tile<0, RAJA::tile_fixed<32>, RAJA::hip_block_x_loop,
          RAJA::statement::For<1, RAJA::hip_thread_y_direct,
            RAJA::statement::For<0, RAJA::hip_thread_x_direct,
              RAJA::statement::Lambda<0>
            >
          >
        >
      >
    > >;

  resI2 = 1;
  iteration = 0;
  memset(I, 0, NN * sizeof(double));
  memset(Iold, 0, NN * sizeof(double));

  double *d_I    = memoryManager::allocate_gpu<double>(NN);
  double *d_Iold = memoryManager::allocate_gpu<double>(NN);
  hipErrchk(hipMemcpy(d_I, I, NN * sizeof(double), hipMemcpyHostToDevice));
  hipErrchk(hipMemcpy(d_Iold, Iold, NN * sizeof(double), hipMemcpyHostToDevice));

  while (resI2 > tol * tol) {

    //
    // Jacobi Iteration
    //
    RAJA::kernel<jacobiHIPNestedPolicy>(
      RAJA::make_tuple(jacobiRange, jacobiRange),
      [=] RAJA_DEVICE (RAJA::Index_type m, RAJA::Index_type n) {

        double x = gridx.o + m * gridx.h;
        double y = gridx.o + n * gridx.h;

        double f = gridx.h * gridx.h
                   * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y));

        int id = n * (N + 2) + m;
        d_I[id] = 0.25 * (-f + d_Iold[id - N - 2] + d_Iold[id + N + 2]
                          + d_Iold[id - 1] + d_Iold[id + 1]);
      });

    //
    // Compute residual and update Iold
    //
    RAJA::ReduceSum<RAJA::hip_reduce, double> RAJA_resI2(0.0);
    RAJA::forall<RAJA::hip_exec<HIP_BLOCK_SIZE>>(
      gridRange, [=] RAJA_DEVICE (RAJA::Index_type k) {

        RAJA_resI2 += (d_I[k] - d_Iold[k]) * (d_I[k] - d_Iold[k]);
        d_Iold[k] = d_I[k];
      });

    resI2 = RAJA_resI2;

    if (iteration > maxIter) {
      printf("RAJA: HIP - Maxed out on iterations! \n");
      exit(-1);
    }
    iteration++;
  }
  hipDeviceSynchronize();
  hipErrchk(hipMemcpy(I, d_I, NN * sizeof(double), hipMemcpyDeviceToHost));
  computeErr(I, gridx);
  printf("No of iterations: %d \n \n", iteration);

  memoryManager::deallocate_gpu(d_I);
  memoryManager::deallocate_gpu(d_Iold);
#endif

  memoryManager::deallocate(I);
  memoryManager::deallocate(Iold);

  return 0;
}

//
// Function for the analytic solution
//
double solution(double x, double y)
{
  return x * y * exp(x - y) * (1 - x) * (1 - y);
}

//
// Error is computed via ||I_{approx}(:) - U_{analytic}(:)||_{inf}
//
void computeErr(double *I, grid_s grid)
{

  RAJA::RangeSegment gridRange(0, grid.n);
  RAJA::ReduceMax<RAJA::seq_reduce, double> tMax(-1.0);

  using jacobiSeqNestedPolicy = RAJA::KernelPolicy<
    RAJA::statement::For<1, RAJA::seq_exec,
      RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0> > > >;

  RAJA::kernel<jacobiSeqNestedPolicy>(RAJA::make_tuple(gridRange, gridRange),
    [=](RAJA::Index_type ty, RAJA::Index_type tx) {

      int id = tx + grid.n * ty;
      double x = grid.o + tx * grid.h;
      double y = grid.o + ty * grid.h;
      double myErr = std::abs(I[id] - solution(x, y));
      tMax.max(myErr);
    });

  double maxErr = tMax;
  printf("Max error = %lg, h = %f \n", maxErr, grid.h);
}

/*TEST

  test:
    requires: raja !cuda

  test:
    suffix: 2
    requires: raja cuda

TEST*/