Actual source code: sfcuda.cu

  1: #include <../src/vec/is/sf/impls/basic/sfpack.h>

  3: /* Map a thread id to an index in root/leaf space through a series of 3D subdomains. See PetscSFPackOpt. */
  4: __device__ static inline PetscInt MapTidToIndex(const PetscInt *opt,PetscInt tid)
  5: {
  6:   PetscInt        i,j,k,m,n,r;
  7:   const PetscInt  *offset,*start,*dx,*dy,*X,*Y;

  9:   n      = opt[0];
 10:   offset = opt + 1;
 11:   start  = opt + n + 2;
 12:   dx     = opt + 2*n + 2;
 13:   dy     = opt + 3*n + 2;
 14:   X      = opt + 5*n + 2;
 15:   Y      = opt + 6*n + 2;
 16:   for (r=0; r<n; r++) {if (tid < offset[r+1]) break;}
 17:   m = (tid - offset[r]);
 18:   k = m/(dx[r]*dy[r]);
 19:   j = (m - k*dx[r]*dy[r])/dx[r];
 20:   i = m - k*dx[r]*dy[r] - j*dx[r];

 22:   return (start[r] + k*X[r]*Y[r] + j*X[r] + i);
 23: }
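
/* Layout sketch of the opt array (an assumption inferred from the index arithmetic above, not the
   authoritative definition in sfpack.h): for n subdomains,

     opt[0]                = n
     opt[1    .. n+1 ]     = offset[]  (prefix sums of subdomain sizes, offset[0]=0)
     opt[n+2  .. 2n+1]     = start[]   (starting index of each subdomain)
     opt[2n+2 .. 3n+1]     = dx[]      (local x-extent of each subdomain)
     opt[3n+2 .. 4n+1]     = dy[]      (local y-extent of each subdomain)
     opt[5n+2 .. 6n+1]     = X[]       (global x-extent)
     opt[6n+2 .. 7n+1]     = Y[]       (global y-extent)

   The slot at 4n+2 (presumably dz[]) is not needed by this map. A thread id is first located in a
   subdomain r via offset[], decomposed into (i,j,k) within the dx[r] x dy[r] box, and mapped to
   start[r] + k*X[r]*Y[r] + j*X[r] + i.
*/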

 25: /*====================================================================================*/
 26: /*  Templated CUDA kernels for pack/unpack. The Op can be regular or atomic           */
 27: /*====================================================================================*/

 29: /* Suppose a user calls PetscSFReduce(sf,unit,...) and <unit> is an MPI datatype made of 16 PetscReals. Then
 30:    <Type> is PetscReal, which is the primitive type we operate on.
 31:    <bs>   is 16, which says <unit> contains 16 primitive types.
 32:    <BS>   is 8, which is the maximal SIMD width with which we try to vectorize operations on <unit>.
 33:    <EQ>   is 0, which is (bs == BS ? 1 : 0)

 35:   If instead <unit> has 8 PetscReals, then bs=8, BS=8, EQ=1, rendering MBS below a compile-time constant.
 36:   For the common case in VecScatter, bs=1, BS=1, EQ=1, MBS=1, and the inner for-loops below are fully unrolled.
 37: */
 38: template<class Type,PetscInt BS,PetscInt EQ>
 39: __global__ static void d_Pack(PetscInt bs,PetscInt count,PetscInt start,const PetscInt *opt,const PetscInt *idx,const Type *data,Type *buf)
 40: {
 41:   PetscInt        i,s,t,tid = blockIdx.x*blockDim.x + threadIdx.x;
 42:   const PetscInt  grid_size = gridDim.x * blockDim.x;
 43:   const PetscInt  M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */
 44:   const PetscInt  MBS = M*BS;  /* MBS=bs. We turn MBS into a compile-time const when EQ=1. */

 46:   for (; tid<count; tid += grid_size) {
 47:     /* opt != NULL ==> idx == NULL, i.e., the indices follow a pattern but are not contiguous;
 48:        opt == NULL && idx == NULL ==> the indices are contiguous;
 49:      */
 50:     t = (opt? MapTidToIndex(opt,tid) : (idx? idx[tid] : start+tid))*MBS;
 51:     s = tid*MBS;
 52:     for (i=0; i<MBS; i++) buf[s+i] = data[t+i];
 53:   }
 54: }
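
/* Example launch (illustrative only; nblocks, nthreads, stream, rootdata, idx and buf are placeholders):
   for the case described above, a unit of 16 PetscReals gives bs=16, BS=8, EQ=0, and packing 'count'
   units of rootdata into a contiguous buffer could look like

     d_Pack<PetscReal,8,0><<<nblocks,nthreads,0,stream>>>(16,count,start,NULL,idx,rootdata,buf);

   Inside the kernel MBS = (bs/BS)*BS = 16, so each thread copies one full 16-real unit per
   grid-stride iteration.
*/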

 56: template<class Type,class Op,PetscInt BS,PetscInt EQ>
 57: __global__ static void d_UnpackAndOp(PetscInt bs,PetscInt count,PetscInt start,const PetscInt *opt,const PetscInt *idx,Type *data,const Type *buf)
 58: {
 59:   PetscInt        i,s,t,tid = blockIdx.x*blockDim.x + threadIdx.x;
 60:   const PetscInt  grid_size = gridDim.x * blockDim.x;
 61:   const PetscInt  M = (EQ) ? 1 : bs/BS, MBS = M*BS;
 62:   Op              op;

 64:   for (; tid<count; tid += grid_size) {
 65:     t = (opt? MapTidToIndex(opt,tid) : (idx? idx[tid] : start+tid))*MBS;
 66:     s = tid*MBS;
 67:     for (i=0; i<MBS; i++) op(data[t+i],buf[s+i]);
 68:   }
 69: }

 71: template<class Type,class Op,PetscInt BS,PetscInt EQ>
 72: __global__ static void d_FetchAndOp(PetscInt bs,PetscInt count,PetscInt rootstart,const PetscInt *rootopt,const PetscInt *rootidx,Type *rootdata,Type *leafbuf)
 73: {
 74:   PetscInt        i,l,r,tid = blockIdx.x*blockDim.x + threadIdx.x;
 75:   const PetscInt  grid_size = gridDim.x * blockDim.x;
 76:   const PetscInt  M = (EQ) ? 1 : bs/BS, MBS = M*BS;
 77:   Op              op;

 79:   for (; tid<count; tid += grid_size) {
 80:     r = (rootopt? MapTidToIndex(rootopt,tid) : (rootidx? rootidx[tid] : rootstart+tid))*MBS;
 81:     l = tid*MBS;
 82:     for (i=0; i<MBS; i++) leafbuf[l+i] = op(rootdata[r+i],leafbuf[l+i]);
 83:   }
 84: }

 86: template<class Type,class Op,PetscInt BS,PetscInt EQ>
 87: __global__ static void d_ScatterAndOp(PetscInt bs,PetscInt count,PetscInt srcx,PetscInt srcy,PetscInt srcX,PetscInt srcY,PetscInt srcStart,const PetscInt* srcIdx,const Type *src,PetscInt dstx,PetscInt dsty,PetscInt dstX,PetscInt dstY,PetscInt dstStart,const PetscInt *dstIdx,Type *dst)
 88: {
 89:   PetscInt        i,j,k,s,t,tid = blockIdx.x*blockDim.x + threadIdx.x;
 90:   const PetscInt  grid_size = gridDim.x * blockDim.x;
 91:   const PetscInt  M = (EQ) ? 1 : bs/BS, MBS = M*BS;
 92:   Op              op;

 94:   for (; tid<count; tid += grid_size) {
 95:     if (!srcIdx) { /* src is either contiguous or 3D */
 96:       k = tid/(srcx*srcy);
 97:       j = (tid - k*srcx*srcy)/srcx;
 98:       i = tid - k*srcx*srcy - j*srcx;
 99:       s = srcStart + k*srcX*srcY + j*srcX + i;
100:     } else {
101:       s = srcIdx[tid];
102:     }

104:     if (!dstIdx) { /* dst is either contiguous or 3D */
105:       k = tid/(dstx*dsty);
106:       j = (tid - k*dstx*dsty)/dstx;
107:       i = tid - k*dstx*dsty - j*dstx;
108:       t = dstStart + k*dstX*dstY + j*dstX + i;
109:     } else {
110:       t = dstIdx[tid];
111:     }

113:     s *= MBS;
114:     t *= MBS;
115:     for (i=0; i<MBS; i++) op(dst[t+i],src[s+i]);
116:   }
117: }

119: template<class Type,class Op,PetscInt BS,PetscInt EQ>
120: __global__ static void d_FetchAndOpLocal(PetscInt bs,PetscInt count,PetscInt rootstart,const PetscInt *rootopt,const PetscInt *rootidx,Type *rootdata,PetscInt leafstart,const PetscInt *leafopt,const PetscInt *leafidx,const Type *leafdata,Type *leafupdate)
121: {
122:   PetscInt        i,l,r,tid = blockIdx.x*blockDim.x + threadIdx.x;
123:   const PetscInt  grid_size = gridDim.x * blockDim.x;
124:   const PetscInt  M = (EQ) ? 1 : bs/BS, MBS = M*BS;
125:   Op              op;

127:   for (; tid<count; tid += grid_size) {
128:     r = (rootopt? MapTidToIndex(rootopt,tid) : (rootidx? rootidx[tid] : rootstart+tid))*MBS;
129:     l = (leafopt? MapTidToIndex(leafopt,tid) : (leafidx? leafidx[tid] : leafstart+tid))*MBS;
130:     for (i=0; i<MBS; i++) leafupdate[l+i] = op(rootdata[r+i],leafdata[l+i]);
131:   }
132: }

134: /*====================================================================================*/
135: /*                             Regular operations on device                           */
136: /*====================================================================================*/
137: template<typename Type> struct Insert {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = y;             return old;}};
138: template<typename Type> struct Add    {__device__ Type operator() (Type& x,Type y) const {Type old = x; x += y;             return old;}};
139: template<typename Type> struct Mult   {__device__ Type operator() (Type& x,Type y) const {Type old = x; x *= y;             return old;}};
140: template<typename Type> struct Min    {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = PetscMin(x,y); return old;}};
141: template<typename Type> struct Max    {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = PetscMax(x,y); return old;}};
142: template<typename Type> struct LAND   {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = x && y;        return old;}};
143: template<typename Type> struct LOR    {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = x || y;        return old;}};
144: template<typename Type> struct LXOR   {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = !x != !y;      return old;}};
145: template<typename Type> struct BAND   {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = x & y;         return old;}};
146: template<typename Type> struct BOR    {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = x | y;         return old;}};
147: template<typename Type> struct BXOR   {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = x ^ y;         return old;}};
148: template<typename Type> struct Minloc {
149:   __device__ Type operator() (Type& x,Type y) const {
150:     Type old = x;
151:     if (y.a < x.a) x = y;
152:     else if (y.a == x.a) x.b = min(x.b,y.b);
153:     return old;
154:   }
155: };
156: template<typename Type> struct Maxloc {
157:   __device__ Type operator() (Type& x,Type y) const {
158:     Type old = x;
159:     if (y.a > x.a) x = y;
160:     else if (y.a == x.a) x.b = min(x.b,y.b); /* See MPI MAXLOC */
161:     return old;
162:   }
163: };
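
/* A minimal illustration of the functor convention (not used in the build): each Op updates x in
   place and returns the old value of x, which the FetchAndOp kernels store into the leaf buffer.

     Add<double> op;
     double x = 1.0;
     double old = op(x,2.0);   // now x == 3.0 and old == 1.0
*/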

165: /*====================================================================================*/
166: /*                             Atomic operations on device                            */
167: /*====================================================================================*/

169: /*
170:   Atomic Insert (exchange) operations

172:   CUDA C Programming Guide V10.1 Chapter B.12.1.3:

174:   int atomicExch(int* address, int val);
175:   unsigned int atomicExch(unsigned int* address, unsigned int val);
176:   unsigned long long int atomicExch(unsigned long long int* address, unsigned long long int val);
177:   float atomicExch(float* address, float val);

179:   reads the 32-bit or 64-bit word old located at the address address in global or shared
180:   memory and stores val back to memory at the same address. These two operations are
181:   performed in one atomic transaction. The function returns old.

183:   PETSc notes:

185:   It may be useful in PetscSFFetchAndOp with op = MPI_REPLACE.

187:   VecScatter with multiple entries scattered to the same location using INSERT_VALUES does not need
188:   atomic insertion, since it does not need the old value. A 32-bit or 64-bit store instruction should
189:   be atomic itself.

 191:   With bs>1 and a unit larger than 64 bits, the current element-wise atomic approach cannot guarantee that the whole
 192:   insertion is atomic. We hope no user code relies on that.
193: */
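
/* Hedged usage sketch (see the PetscSF manual pages for the authoritative API): a fetch-and-replace
   on GPU root data, which is where AtomicInsert/atomicExch comes into play, looks roughly like

     PetscSFFetchAndOpBegin(sf,unit,rootdata,leafdata,leafupdate,MPI_REPLACE);
     PetscSFFetchAndOpEnd  (sf,unit,rootdata,leafdata,leafupdate,MPI_REPLACE);

   where leafupdate receives the old root values and rootdata receives the leaf values.
*/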
194: __device__ static double atomicExch(double* address,double val) {return __longlong_as_double(atomicExch((ullint*)address,__double_as_longlong(val)));}

196: __device__ static llint atomicExch(llint* address,llint val) {return (llint)(atomicExch((ullint*)address,(ullint)val));}

198: template<typename Type> struct AtomicInsert {__device__ Type operator() (Type& x,Type y) const {return atomicExch(&x,y);}};

200: #if defined(PETSC_HAVE_COMPLEX)
201: #if defined(PETSC_USE_REAL_DOUBLE)
 202: /* CUDA does not support 128-bit atomics. Users should not concurrently insert different 128-bit PetscComplex values at the same location */
203: template<> struct AtomicInsert<PetscComplex> {
204:   __device__ PetscComplex operator() (PetscComplex& x,PetscComplex y) const {
205:     PetscComplex         old, *z = &old;
206:     double               *xp = (double*)&x,*yp = (double*)&y;
207:     AtomicInsert<double> op;
208:     z[0] = op(xp[0],yp[0]);
209:     z[1] = op(xp[1],yp[1]);
 210:     return old; /* The returned value may not be atomic. It can be a mix of two ops. The caller should discard it. */
211:   }
212: };
213: #elif defined(PETSC_USE_REAL_SINGLE)
214: template<> struct AtomicInsert<PetscComplex> {
215:   __device__ PetscComplex operator() (PetscComplex& x,PetscComplex y) const {
216:     double               *xp = (double*)&x,*yp = (double*)&y;
217:     AtomicInsert<double> op;
218:     return op(xp[0],yp[0]);
219:   }
220: };
221: #endif
222: #endif

224: /*
225:   Atomic add operations

227:   CUDA C Programming Guide V10.1 Chapter B.12.1.1:

229:   int atomicAdd(int* address, int val);
230:   unsigned int atomicAdd(unsigned int* address,unsigned int val);
231:   unsigned long long int atomicAdd(unsigned long long int* address,unsigned long long int val);
232:   float atomicAdd(float* address, float val);
233:   double atomicAdd(double* address, double val);
234:   __half2 atomicAdd(__half2 *address, __half2 val);
235:   __half atomicAdd(__half *address, __half val);

237:   reads the 16-bit, 32-bit or 64-bit word old located at the address address in global or shared memory, computes (old + val),
238:   and stores the result back to memory at the same address. These three operations are performed in one atomic transaction. The
239:   function returns old.

241:   The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher.
242:   The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher.
243:   The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and
244:   higher. The atomicity of the __half2 add operation is guaranteed separately for each of the two __half elements;
245:   the entire __half2 is not guaranteed to be atomic as a single 32-bit access.
246:   The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher.
247: */
248: __device__ static llint atomicAdd(llint* address,llint val) {return (llint)atomicAdd((ullint*)address,(ullint)val);}

250: template<typename Type> struct AtomicAdd {__device__ Type operator() (Type& x,Type y) const {return atomicAdd(&x,y);}};

252: template<> struct AtomicAdd<double> {
253:   __device__ double operator() (double& x,double y) const {
254: #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600)
255:     return atomicAdd(&x,y);
256: #else
257:     double *address = &x, val = y;
258:     ullint *address_as_ull = (ullint*)address;
259:     ullint old = *address_as_ull, assumed;
260:     do {
261:       assumed = old;
262:       old     = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
 263:       /* Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) */
264:     } while (assumed != old);
265:     return __longlong_as_double(old);
266: #endif
267:   }
268: };

270: template<> struct AtomicAdd<float> {
271:   __device__ float operator() (float& x,float y) const {
272: #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
273:     return atomicAdd(&x,y);
274: #else
275:     float *address = &x, val = y;
276:     int   *address_as_int = (int*)address;
277:     int   old = *address_as_int, assumed;
278:     do {
279:       assumed = old;
280:       old     = atomicCAS(address_as_int, assumed, __float_as_int(val + __int_as_float(assumed)));
 281:       /* Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) */
282:     } while (assumed != old);
283:     return __int_as_float(old);
284: #endif
285:   }
286: };

288: #if defined(PETSC_HAVE_COMPLEX)
289: template<> struct AtomicAdd<PetscComplex> {
290:  __device__ PetscComplex operator() (PetscComplex& x,PetscComplex y) const {
291:   PetscComplex         old, *z = &old;
292:   PetscReal            *xp = (PetscReal*)&x,*yp = (PetscReal*)&y;
293:   AtomicAdd<PetscReal> op;
294:   z[0] = op(xp[0],yp[0]);
295:   z[1] = op(xp[1],yp[1]);
 296:   return old; /* The returned value may not be atomic. It can be a mix of two ops. The caller should discard it. */
297:  }
298: };
299: #endif

301: /*
302:   Atomic Mult operations:

304:   CUDA has no atomicMult at all, so we build our own with atomicCAS
305:  */
306: #if defined(PETSC_USE_REAL_DOUBLE)
307: __device__ static double atomicMult(double* address, double val)
308: {
309:   ullint *address_as_ull = (ullint*)(address);
310:   ullint old = *address_as_ull, assumed;
311:   do {
312:     assumed = old;
 313:     /* Other threads can access and modify the value of *address_as_ull between the read above and the write below */
314:     old     = atomicCAS(address_as_ull, assumed, __double_as_longlong(val*__longlong_as_double(assumed)));
315:   } while (assumed != old);
316:   return __longlong_as_double(old);
317: }
318: #elif defined(PETSC_USE_REAL_SINGLE)
319: __device__ static float atomicMult(float* address,float val)
320: {
321:   int *address_as_int = (int*)(address);
322:   int old = *address_as_int, assumed;
323:   do {
324:     assumed  = old;
325:     old      = atomicCAS(address_as_int, assumed, __float_as_int(val*__int_as_float(assumed)));
326:   } while (assumed != old);
327:   return __int_as_float(old);
328: }
329: #endif

331: __device__ static int atomicMult(int* address,int val)
332: {
333:   int *address_as_int = (int*)(address);
334:   int old = *address_as_int, assumed;
335:   do {
336:     assumed = old;
337:     old     = atomicCAS(address_as_int, assumed, val*assumed);
338:   } while (assumed != old);
339:   return (int)old;
340: }

342: __device__ static llint atomicMult(llint* address,llint val)
343: {
344:   ullint *address_as_ull = (ullint*)(address);
345:   ullint old = *address_as_ull, assumed;
346:   do {
347:     assumed = old;
348:     old     = atomicCAS(address_as_ull, assumed, (ullint)(val*(llint)assumed));
349:   } while (assumed != old);
350:   return (llint)old;
351: }

353: template<typename Type> struct AtomicMult {__device__ Type operator() (Type& x,Type y) const {return atomicMult(&x,y);}};

355: /*
356:   Atomic Min/Max operations

358:   CUDA C Programming Guide V10.1 Chapter B.12.1.4~5:

360:   int atomicMin(int* address, int val);
361:   unsigned int atomicMin(unsigned int* address,unsigned int val);
362:   unsigned long long int atomicMin(unsigned long long int* address,unsigned long long int val);

364:   reads the 32-bit or 64-bit word old located at the address address in global or shared
365:   memory, computes the minimum of old and val, and stores the result back to memory
366:   at the same address. These three operations are performed in one atomic transaction.
367:   The function returns old.
368:   The 64-bit version of atomicMin() is only supported by devices of compute capability 3.5 and higher.

370:   atomicMax() is similar.
371:  */

373: #if defined(PETSC_USE_REAL_DOUBLE)
374: __device__ static double atomicMin(double* address, double val)
375: {
376:   ullint *address_as_ull = (ullint*)(address);
377:   ullint old = *address_as_ull, assumed;
378:   do {
379:     assumed = old;
380:     old     = atomicCAS(address_as_ull, assumed, __double_as_longlong(PetscMin(val,__longlong_as_double(assumed))));
381:   } while (assumed != old);
382:   return __longlong_as_double(old);
383: }

385: __device__ static double atomicMax(double* address, double val)
386: {
387:   ullint *address_as_ull = (ullint*)(address);
388:   ullint old = *address_as_ull, assumed;
389:   do {
390:     assumed  = old;
391:     old = atomicCAS(address_as_ull, assumed, __double_as_longlong(PetscMax(val,__longlong_as_double(assumed))));
392:   } while (assumed != old);
393:   return __longlong_as_double(old);
394: }
395: #elif defined(PETSC_USE_REAL_SINGLE)
396: __device__ static float atomicMin(float* address,float val)
397: {
398:   int *address_as_int = (int*)(address);
399:   int old = *address_as_int, assumed;
400:   do {
401:     assumed = old;
402:     old     = atomicCAS(address_as_int, assumed, __float_as_int(PetscMin(val,__int_as_float(assumed))));
403:   } while (assumed != old);
404:   return __int_as_float(old);
405: }

407: __device__ static float atomicMax(float* address,float val)
408: {
409:   int *address_as_int = (int*)(address);
410:   int old = *address_as_int, assumed;
411:   do {
412:     assumed = old;
413:     old     = atomicCAS(address_as_int, assumed, __float_as_int(PetscMax(val,__int_as_float(assumed))));
414:   } while (assumed != old);
415:   return __int_as_float(old);
416: }
417: #endif

419: /*
420:   atomicMin/Max(long long *, long long) are not in Nvidia's documentation. But on OLCF Summit we found
421:   atomicMin/Max/And/Or/Xor(long long *, long long) in /sw/summit/cuda/10.1.243/include/sm_32_atomic_functions.h.
422:   This causes compilation errors with pgi compilers and 64-bit indices:
423:       error: function "atomicMin(long long *, long long)" has already been defined

 425:   So we guard our definitions with the extra condition defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 320)
426: */
427: #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 320)
428: __device__ static llint atomicMin(llint* address,llint val)
429: {
430:   ullint *address_as_ull = (ullint*)(address);
431:   ullint old = *address_as_ull, assumed;
432:   do {
433:     assumed = old;
434:     old     = atomicCAS(address_as_ull, assumed, (ullint)(PetscMin(val,(llint)assumed)));
435:   } while (assumed != old);
436:   return (llint)old;
437: }

439: __device__ static llint atomicMax(llint* address,llint val)
440: {
441:   ullint *address_as_ull = (ullint*)(address);
442:   ullint old = *address_as_ull, assumed;
443:   do {
444:     assumed = old;
445:     old     = atomicCAS(address_as_ull, assumed, (ullint)(PetscMax(val,(llint)assumed)));
446:   } while (assumed != old);
447:   return (llint)old;
448: }
449: #endif

451: template<typename Type> struct AtomicMin {__device__ Type operator() (Type& x,Type y) const {return atomicMin(&x,y);}};
452: template<typename Type> struct AtomicMax {__device__ Type operator() (Type& x,Type y) const {return atomicMax(&x,y);}};

454: /*
455:   Atomic bitwise operations

457:   CUDA C Programming Guide V10.1 Chapter B.12.2.1 ~ B.12.2.3:

459:   int atomicAnd(int* address, int val);
460:   unsigned int atomicAnd(unsigned int* address,unsigned int val);
461:   unsigned long long int atomicAnd(unsigned long long int* address,unsigned long long int val);

463:   reads the 32-bit or 64-bit word old located at the address address in global or shared
464:   memory, computes (old & val), and stores the result back to memory at the same
465:   address. These three operations are performed in one atomic transaction.
466:   The function returns old.

468:   The 64-bit version of atomicAnd() is only supported by devices of compute capability 3.5 and higher.

470:   atomicOr() and atomicXor are similar.
471: */

473: #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 320) /* Why 320? see comments at atomicMin() above */
474: __device__ static llint atomicAnd(llint* address,llint val)
475: {
476:   ullint *address_as_ull = (ullint*)(address);
477:   ullint old = *address_as_ull, assumed;
478:   do {
479:     assumed = old;
480:     old     = atomicCAS(address_as_ull, assumed, (ullint)(val & (llint)assumed));
481:   } while (assumed != old);
482:   return (llint)old;
483: }
484: __device__ static llint atomicOr(llint* address,llint val)
485: {
486:   ullint *address_as_ull = (ullint*)(address);
487:   ullint old = *address_as_ull, assumed;
488:   do {
489:     assumed = old;
490:     old     = atomicCAS(address_as_ull, assumed, (ullint)(val | (llint)assumed));
491:   } while (assumed != old);
492:   return (llint)old;
493: }

495: __device__ static llint atomicXor(llint* address,llint val)
496: {
497:   ullint *address_as_ull = (ullint*)(address);
498:   ullint old = *address_as_ull, assumed;
499:   do {
500:     assumed = old;
501:     old     = atomicCAS(address_as_ull, assumed, (ullint)(val ^ (llint)assumed));
502:   } while (assumed != old);
503:   return (llint)old;
504: }
505: #endif

507: template<typename Type> struct AtomicBAND {__device__ Type operator() (Type& x,Type y) const {return atomicAnd(&x,y);}};
508: template<typename Type> struct AtomicBOR  {__device__ Type operator() (Type& x,Type y) const {return atomicOr (&x,y);}};
509: template<typename Type> struct AtomicBXOR {__device__ Type operator() (Type& x,Type y) const {return atomicXor(&x,y);}};

511: /*
512:   Atomic logical operations:

514:   CUDA has no atomic logical operations at all. We support them on integer types.
515: */

 517: /* Declaring the primary template without a definition makes any instantiation that does not match one of the given
 518:    specializations a compile-time error, which is what we want since we only support 32-bit and 64-bit integers.
519:  */
520: template<typename Type,class Op,int size/* sizeof(Type) */> struct AtomicLogical;

522: template<typename Type,class Op>
523: struct AtomicLogical<Type,Op,4> {
524:   __device__ Type operator()(Type& x,Type y) const {
525:     int *address_as_int = (int*)(&x);
526:     int old = *address_as_int, assumed;
527:     Op op;
528:     do {
529:       assumed = old;
530:       old     = atomicCAS(address_as_int, assumed, (int)(op((Type)assumed,y)));
531:     } while (assumed != old);
532:     return (Type)old;
533:   }
534: };

536: template<typename Type,class Op>
537: struct AtomicLogical<Type,Op,8> {
538:   __device__ Type operator()(Type& x,Type y) const {
539:     ullint *address_as_ull = (ullint*)(&x);
540:     ullint old = *address_as_ull, assumed;
541:     Op op;
542:     do {
543:       assumed = old;
544:       old     = atomicCAS(address_as_ull, assumed, (ullint)(op((Type)assumed,y)));
545:     } while (assumed != old);
546:     return (Type)old;
547:   }
548: };
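
/* Consequence of leaving the primary template undefined (illustration only):

     AtomicLogical<int,land<int>,4>    op32;  // OK, matches the 4-byte specialization
     AtomicLogical<llint,lor<llint>,8> op64;  // OK, matches the 8-byte specialization
     // AtomicLogical<char,land<char>,1> op8; // would not compile: no definition exists

   so unsupported types are rejected at compile time rather than silently miscompiled.
*/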

 550: /* Note that land/lor/lxor below differ from LAND etc. above: here we pass arguments by value and return the result of the op (not the old value) */
551: template<typename Type> struct land {__device__ Type operator()(Type x, Type y) {return x && y;}};
552: template<typename Type> struct lor  {__device__ Type operator()(Type x, Type y) {return x || y;}};
553: template<typename Type> struct lxor {__device__ Type operator()(Type x, Type y) {return (!x != !y);}};

555: template<typename Type> struct AtomicLAND {__device__ Type operator()(Type& x,Type y) const {AtomicLogical<Type,land<Type>,sizeof(Type)> op; return op(x,y);}};
556: template<typename Type> struct AtomicLOR  {__device__ Type operator()(Type& x,Type y) const {AtomicLogical<Type,lor<Type> ,sizeof(Type)> op; return op(x,y);}};
557: template<typename Type> struct AtomicLXOR {__device__ Type operator()(Type& x,Type y) const {AtomicLogical<Type,lxor<Type>,sizeof(Type)> op; return op(x,y);}};

559: /*====================================================================================*/
560: /*  Wrapper functions of cuda kernels. Function pointers are stored in 'link'         */
561: /*====================================================================================*/
562: template<typename Type,PetscInt BS,PetscInt EQ>
563: static PetscErrorCode Pack(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *data,void *buf)
564: {
565:   PetscInt           nthreads=256;
566:   PetscInt           nblocks=(count+nthreads-1)/nthreads;
567:   const PetscInt     *iarray=opt ? opt->array : NULL;

569:   if (!count) return 0;
570:   if (!opt && !idx) { /* It is a 'CUDA data to nvshmem buf' memory copy */
571:     cudaMemcpyAsync(buf,(char*)data+start*link->unitbytes,count*link->unitbytes,cudaMemcpyDeviceToDevice,link->stream);
572:   } else {
573:     nblocks = PetscMin(nblocks,link->maxResidentThreadsPerGPU/nthreads);
574:     d_Pack<Type,BS,EQ><<<nblocks,nthreads,0,link->stream>>>(link->bs,count,start,iarray,idx,(const Type*)data,(Type*)buf);
575:     cudaGetLastError();
576:   }
577:   return 0;
578: }
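
/* Sizing note (the numbers are only an example): with nthreads=256 and, say,
   maxResidentThreadsPerGPU = 2048*80 = 163840, nblocks is capped at 163840/256 = 640. If count
   exceeds 640*256 = 163840, the grid-stride loop 'tid += grid_size' in the kernels above lets each
   thread process multiple units, so the cap does not lose any work.
*/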

 580: /* This specializes UnpackAndOp for the cudaMemcpyAsync() below. Usually, if this is a contiguous memcpy, we use root/leafdirect and do
 581:    not need UnpackAndOp; only with nvshmem do we need this 'nvshmem buf to CUDA data' memory copy.
 582: */
583: template<typename Type,PetscInt BS,PetscInt EQ>
584: static PetscErrorCode Unpack(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *data,const void *buf)
585: {
586:   PetscInt           nthreads=256;
587:   PetscInt           nblocks=(count+nthreads-1)/nthreads;
588:   const PetscInt     *iarray=opt ? opt->array : NULL;

590:   if (!count) return 0;
591:   if (!opt && !idx) { /* It is a 'nvshmem buf to CUDA data' memory copy */
592:     cudaMemcpyAsync((char*)data+start*link->unitbytes,buf,count*link->unitbytes,cudaMemcpyDeviceToDevice,link->stream);
593:   } else {
594:     nblocks = PetscMin(nblocks,link->maxResidentThreadsPerGPU/nthreads);
595:     d_UnpackAndOp<Type,Insert<Type>,BS,EQ><<<nblocks,nthreads,0,link->stream>>>(link->bs,count,start,iarray,idx,(Type*)data,(const Type*)buf);
596:     cudaGetLastError();
597:   }
598:   return 0;
599: }

601: template<typename Type,class Op,PetscInt BS,PetscInt EQ>
602: static PetscErrorCode UnpackAndOp(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *data,const void *buf)
603: {
604:   PetscInt           nthreads=256;
605:   PetscInt           nblocks=(count+nthreads-1)/nthreads;
606:   const PetscInt     *iarray=opt ? opt->array : NULL;

608:   if (!count) return 0;
609:   nblocks = PetscMin(nblocks,link->maxResidentThreadsPerGPU/nthreads);
610:   d_UnpackAndOp<Type,Op,BS,EQ><<<nblocks,nthreads,0,link->stream>>>(link->bs,count,start,iarray,idx,(Type*)data,(const Type*)buf);
611:   cudaGetLastError();
612:   return 0;
613: }

615: template<typename Type,class Op,PetscInt BS,PetscInt EQ>
616: static PetscErrorCode FetchAndOp(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *data,void *buf)
617: {
618:   PetscInt           nthreads=256;
619:   PetscInt           nblocks=(count+nthreads-1)/nthreads;
620:   const PetscInt     *iarray=opt ? opt->array : NULL;

622:   if (!count) return 0;
623:   nblocks = PetscMin(nblocks,link->maxResidentThreadsPerGPU/nthreads);
624:   d_FetchAndOp<Type,Op,BS,EQ><<<nblocks,nthreads,0,link->stream>>>(link->bs,count,start,iarray,idx,(Type*)data,(Type*)buf);
625:   cudaGetLastError();
626:   return 0;
627: }

629: template<typename Type,class Op,PetscInt BS,PetscInt EQ>
630: static PetscErrorCode ScatterAndOp(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst)
631: {
632:   PetscInt           nthreads=256;
633:   PetscInt           nblocks=(count+nthreads-1)/nthreads;
634:   PetscInt           srcx=0,srcy=0,srcX=0,srcY=0,dstx=0,dsty=0,dstX=0,dstY=0;

636:   if (!count) return 0;
637:   nblocks = PetscMin(nblocks,link->maxResidentThreadsPerGPU/nthreads);

 639:   /* The 3D shape of the source subdomain may differ from that of the destination, which makes it difficult to use a CUDA 3D grid and block */
640:   if (srcOpt)       {srcx = srcOpt->dx[0]; srcy = srcOpt->dy[0]; srcX = srcOpt->X[0]; srcY = srcOpt->Y[0]; srcStart = srcOpt->start[0]; srcIdx = NULL;}
641:   else if (!srcIdx) {srcx = srcX = count; srcy = srcY = 1;}

643:   if (dstOpt)       {dstx = dstOpt->dx[0]; dsty = dstOpt->dy[0]; dstX = dstOpt->X[0]; dstY = dstOpt->Y[0]; dstStart = dstOpt->start[0]; dstIdx = NULL;}
644:   else if (!dstIdx) {dstx = dstX = count; dsty = dstY = 1;}

646:   d_ScatterAndOp<Type,Op,BS,EQ><<<nblocks,nthreads,0,link->stream>>>(link->bs,count,srcx,srcy,srcX,srcY,srcStart,srcIdx,(const Type*)src,dstx,dsty,dstX,dstY,dstStart,dstIdx,(Type*)dst);
647:   cudaGetLastError();
648:   return 0;
649: }

651: /* Specialization for Insert since we may use cudaMemcpyAsync */
652: template<typename Type,PetscInt BS,PetscInt EQ>
653: static PetscErrorCode ScatterAndInsert(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst)
654: {
655:   if (!count) return 0;
 656:   /* src and dst are contiguous */
657:   if ((!srcOpt && !srcIdx) && (!dstOpt && !dstIdx) && src != dst) {
658:     cudaMemcpyAsync((Type*)dst+dstStart*link->bs,(const Type*)src+srcStart*link->bs,count*link->unitbytes,cudaMemcpyDeviceToDevice,link->stream);
659:   } else {
660:     ScatterAndOp<Type,Insert<Type>,BS,EQ>(link,count,srcStart,srcOpt,srcIdx,src,dstStart,dstOpt,dstIdx,dst);
661:   }
662:   return 0;
663: }

665: template<typename Type,class Op,PetscInt BS,PetscInt EQ>
666: static PetscErrorCode FetchAndOpLocal(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate)
667: {
668:   PetscInt          nthreads=256;
669:   PetscInt          nblocks=(count+nthreads-1)/nthreads;
670:   const PetscInt    *rarray = rootopt ? rootopt->array : NULL;
671:   const PetscInt    *larray = leafopt ? leafopt->array : NULL;

673:   if (!count) return 0;
674:   nblocks = PetscMin(nblocks,link->maxResidentThreadsPerGPU/nthreads);
675:   d_FetchAndOpLocal<Type,Op,BS,EQ><<<nblocks,nthreads,0,link->stream>>>(link->bs,count,rootstart,rarray,rootidx,(Type*)rootdata,leafstart,larray,leafidx,(const Type*)leafdata,(Type*)leafupdate);
676:   cudaGetLastError();
677:   return 0;
678: }

680: /*====================================================================================*/
681: /*  Init various types and instantiate pack/unpack function pointers                  */
682: /*====================================================================================*/
683: template<typename Type,PetscInt BS,PetscInt EQ>
684: static void PackInit_RealType(PetscSFLink link)
685: {
686:   /* Pack/unpack for remote communication */
687:   link->d_Pack              = Pack<Type,BS,EQ>;
688:   link->d_UnpackAndInsert   = Unpack<Type,BS,EQ>;
689:   link->d_UnpackAndAdd      = UnpackAndOp     <Type,Add<Type>         ,BS,EQ>;
690:   link->d_UnpackAndMult     = UnpackAndOp     <Type,Mult<Type>        ,BS,EQ>;
691:   link->d_UnpackAndMin      = UnpackAndOp     <Type,Min<Type>         ,BS,EQ>;
692:   link->d_UnpackAndMax      = UnpackAndOp     <Type,Max<Type>         ,BS,EQ>;
693:   link->d_FetchAndAdd       = FetchAndOp      <Type,Add<Type>         ,BS,EQ>;

695:   /* Scatter for local communication */
696:   link->d_ScatterAndInsert  = ScatterAndInsert<Type                   ,BS,EQ>; /* Has special optimizations */
697:   link->d_ScatterAndAdd     = ScatterAndOp    <Type,Add<Type>         ,BS,EQ>;
698:   link->d_ScatterAndMult    = ScatterAndOp    <Type,Mult<Type>        ,BS,EQ>;
699:   link->d_ScatterAndMin     = ScatterAndOp    <Type,Min<Type>         ,BS,EQ>;
700:   link->d_ScatterAndMax     = ScatterAndOp    <Type,Max<Type>         ,BS,EQ>;
701:   link->d_FetchAndAddLocal  = FetchAndOpLocal <Type,Add <Type>        ,BS,EQ>;

703:   /* Atomic versions when there are data-race possibilities */
704:   link->da_UnpackAndInsert  = UnpackAndOp     <Type,AtomicInsert<Type>,BS,EQ>;
705:   link->da_UnpackAndAdd     = UnpackAndOp     <Type,AtomicAdd<Type>   ,BS,EQ>;
706:   link->da_UnpackAndMult    = UnpackAndOp     <Type,AtomicMult<Type>  ,BS,EQ>;
707:   link->da_UnpackAndMin     = UnpackAndOp     <Type,AtomicMin<Type>   ,BS,EQ>;
708:   link->da_UnpackAndMax     = UnpackAndOp     <Type,AtomicMax<Type>   ,BS,EQ>;
709:   link->da_FetchAndAdd      = FetchAndOp      <Type,AtomicAdd<Type>   ,BS,EQ>;

711:   link->da_ScatterAndInsert = ScatterAndOp    <Type,AtomicInsert<Type>,BS,EQ>;
712:   link->da_ScatterAndAdd    = ScatterAndOp    <Type,AtomicAdd<Type>   ,BS,EQ>;
713:   link->da_ScatterAndMult   = ScatterAndOp    <Type,AtomicMult<Type>  ,BS,EQ>;
714:   link->da_ScatterAndMin    = ScatterAndOp    <Type,AtomicMin<Type>   ,BS,EQ>;
715:   link->da_ScatterAndMax    = ScatterAndOp    <Type,AtomicMax<Type>   ,BS,EQ>;
716:   link->da_FetchAndAddLocal = FetchAndOpLocal <Type,AtomicAdd<Type>   ,BS,EQ>;
717: }
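
/* Hedged sketch of how these function pointers are consumed elsewhere in PetscSF (the exact call
   sites live in the SF pack/unpack layer, not in this file): after PackInit_RealType() has filled
   the table, unpacking received root contributions with MPI_SUM would look roughly like

     (*link->d_UnpackAndAdd)(link,count,start,opt,idx,rootdata,rootbuf);

   and the da_* (atomic) variants are selected instead when different leaves may update the same
   root entry concurrently.
*/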

 719: /* We use a templated class so that we can specialize it for one-byte (char) integers */
720: template<typename Type,PetscInt BS,PetscInt EQ,PetscInt size/*sizeof(Type)*/>
721: struct PackInit_IntegerType_Atomic {
722:   static void Init(PetscSFLink link)
723:   {
724:     link->da_UnpackAndInsert  = UnpackAndOp<Type,AtomicInsert<Type>,BS,EQ>;
725:     link->da_UnpackAndAdd     = UnpackAndOp<Type,AtomicAdd<Type>   ,BS,EQ>;
726:     link->da_UnpackAndMult    = UnpackAndOp<Type,AtomicMult<Type>  ,BS,EQ>;
727:     link->da_UnpackAndMin     = UnpackAndOp<Type,AtomicMin<Type>   ,BS,EQ>;
728:     link->da_UnpackAndMax     = UnpackAndOp<Type,AtomicMax<Type>   ,BS,EQ>;
729:     link->da_UnpackAndLAND    = UnpackAndOp<Type,AtomicLAND<Type>  ,BS,EQ>;
730:     link->da_UnpackAndLOR     = UnpackAndOp<Type,AtomicLOR<Type>   ,BS,EQ>;
731:     link->da_UnpackAndLXOR    = UnpackAndOp<Type,AtomicLXOR<Type>  ,BS,EQ>;
732:     link->da_UnpackAndBAND    = UnpackAndOp<Type,AtomicBAND<Type>  ,BS,EQ>;
733:     link->da_UnpackAndBOR     = UnpackAndOp<Type,AtomicBOR<Type>   ,BS,EQ>;
734:     link->da_UnpackAndBXOR    = UnpackAndOp<Type,AtomicBXOR<Type>  ,BS,EQ>;
735:     link->da_FetchAndAdd      = FetchAndOp <Type,AtomicAdd<Type>   ,BS,EQ>;

737:     link->da_ScatterAndInsert = ScatterAndOp<Type,AtomicInsert<Type>,BS,EQ>;
738:     link->da_ScatterAndAdd    = ScatterAndOp<Type,AtomicAdd<Type>   ,BS,EQ>;
739:     link->da_ScatterAndMult   = ScatterAndOp<Type,AtomicMult<Type>  ,BS,EQ>;
740:     link->da_ScatterAndMin    = ScatterAndOp<Type,AtomicMin<Type>   ,BS,EQ>;
741:     link->da_ScatterAndMax    = ScatterAndOp<Type,AtomicMax<Type>   ,BS,EQ>;
742:     link->da_ScatterAndLAND   = ScatterAndOp<Type,AtomicLAND<Type>  ,BS,EQ>;
743:     link->da_ScatterAndLOR    = ScatterAndOp<Type,AtomicLOR<Type>   ,BS,EQ>;
744:     link->da_ScatterAndLXOR   = ScatterAndOp<Type,AtomicLXOR<Type>  ,BS,EQ>;
745:     link->da_ScatterAndBAND   = ScatterAndOp<Type,AtomicBAND<Type>  ,BS,EQ>;
746:     link->da_ScatterAndBOR    = ScatterAndOp<Type,AtomicBOR<Type>   ,BS,EQ>;
747:     link->da_ScatterAndBXOR   = ScatterAndOp<Type,AtomicBXOR<Type>  ,BS,EQ>;
748:     link->da_FetchAndAddLocal = FetchAndOpLocal<Type,AtomicAdd<Type>,BS,EQ>;
749:   }
750: };

 752: /* CUDA does not support atomics on chars; their support in PETSc is TBD. */
753: template<typename Type,PetscInt BS,PetscInt EQ>
754: struct PackInit_IntegerType_Atomic<Type,BS,EQ,1> {
 755:   static void Init(PetscSFLink link) {/* Do nothing; leave the function pointers NULL */}
756: };
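
/* Dispatch illustration (not part of the build): PackInit_IntegerType() below instantiates
   PackInit_IntegerType_Atomic<Type,BS,EQ,sizeof(Type)>, so

     PackInit_IntegerType_Atomic<int,1,1,4>::Init(link);         // installs the atomic callbacks
     PackInit_IntegerType_Atomic<SignedChar,1,1,1>::Init(link);  // picks the empty specialization above

   leaving the da_* pointers NULL for one-byte types.
*/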

758: template<typename Type,PetscInt BS,PetscInt EQ>
759: static void PackInit_IntegerType(PetscSFLink link)
760: {
761:   link->d_Pack            = Pack<Type,BS,EQ>;
762:   link->d_UnpackAndInsert = Unpack<Type,BS,EQ>;
763:   link->d_UnpackAndAdd    = UnpackAndOp<Type,Add<Type>   ,BS,EQ>;
764:   link->d_UnpackAndMult   = UnpackAndOp<Type,Mult<Type>  ,BS,EQ>;
765:   link->d_UnpackAndMin    = UnpackAndOp<Type,Min<Type>   ,BS,EQ>;
766:   link->d_UnpackAndMax    = UnpackAndOp<Type,Max<Type>   ,BS,EQ>;
767:   link->d_UnpackAndLAND   = UnpackAndOp<Type,LAND<Type>  ,BS,EQ>;
768:   link->d_UnpackAndLOR    = UnpackAndOp<Type,LOR<Type>   ,BS,EQ>;
769:   link->d_UnpackAndLXOR   = UnpackAndOp<Type,LXOR<Type>  ,BS,EQ>;
770:   link->d_UnpackAndBAND   = UnpackAndOp<Type,BAND<Type>  ,BS,EQ>;
771:   link->d_UnpackAndBOR    = UnpackAndOp<Type,BOR<Type>   ,BS,EQ>;
772:   link->d_UnpackAndBXOR   = UnpackAndOp<Type,BXOR<Type>  ,BS,EQ>;
773:   link->d_FetchAndAdd     = FetchAndOp <Type,Add<Type>   ,BS,EQ>;

775:   link->d_ScatterAndInsert = ScatterAndInsert<Type,BS,EQ>;
776:   link->d_ScatterAndAdd    = ScatterAndOp<Type,Add<Type>   ,BS,EQ>;
777:   link->d_ScatterAndMult   = ScatterAndOp<Type,Mult<Type>  ,BS,EQ>;
778:   link->d_ScatterAndMin    = ScatterAndOp<Type,Min<Type>   ,BS,EQ>;
779:   link->d_ScatterAndMax    = ScatterAndOp<Type,Max<Type>   ,BS,EQ>;
780:   link->d_ScatterAndLAND   = ScatterAndOp<Type,LAND<Type>  ,BS,EQ>;
781:   link->d_ScatterAndLOR    = ScatterAndOp<Type,LOR<Type>   ,BS,EQ>;
782:   link->d_ScatterAndLXOR   = ScatterAndOp<Type,LXOR<Type>  ,BS,EQ>;
783:   link->d_ScatterAndBAND   = ScatterAndOp<Type,BAND<Type>  ,BS,EQ>;
784:   link->d_ScatterAndBOR    = ScatterAndOp<Type,BOR<Type>   ,BS,EQ>;
785:   link->d_ScatterAndBXOR   = ScatterAndOp<Type,BXOR<Type>  ,BS,EQ>;
786:   link->d_FetchAndAddLocal = FetchAndOpLocal<Type,Add<Type>,BS,EQ>;
787:   PackInit_IntegerType_Atomic<Type,BS,EQ,sizeof(Type)>::Init(link);
788: }

790: #if defined(PETSC_HAVE_COMPLEX)
791: template<typename Type,PetscInt BS,PetscInt EQ>
792: static void PackInit_ComplexType(PetscSFLink link)
793: {
794:   link->d_Pack             = Pack<Type,BS,EQ>;
795:   link->d_UnpackAndInsert  = Unpack<Type,BS,EQ>;
796:   link->d_UnpackAndAdd     = UnpackAndOp<Type,Add<Type>   ,BS,EQ>;
797:   link->d_UnpackAndMult    = UnpackAndOp<Type,Mult<Type>  ,BS,EQ>;
798:   link->d_FetchAndAdd      = FetchAndOp <Type,Add<Type>   ,BS,EQ>;

800:   link->d_ScatterAndInsert = ScatterAndInsert<Type,BS,EQ>;
801:   link->d_ScatterAndAdd    = ScatterAndOp<Type,Add<Type>   ,BS,EQ>;
802:   link->d_ScatterAndMult   = ScatterAndOp<Type,Mult<Type>  ,BS,EQ>;
803:   link->d_FetchAndAddLocal = FetchAndOpLocal<Type,Add<Type>,BS,EQ>;

805:   link->da_UnpackAndInsert = UnpackAndOp<Type,AtomicInsert<Type>,BS,EQ>;
806:   link->da_UnpackAndAdd    = UnpackAndOp<Type,AtomicAdd<Type>,BS,EQ>;
807:   link->da_UnpackAndMult   = NULL; /* Not implemented yet */
808:   link->da_FetchAndAdd     = NULL; /* Return value of atomicAdd on complex is not atomic */

810:   link->da_ScatterAndInsert = ScatterAndOp<Type,AtomicInsert<Type>,BS,EQ>;
811:   link->da_ScatterAndAdd    = ScatterAndOp<Type,AtomicAdd<Type>,BS,EQ>;
812: }
813: #endif

815: typedef signed char                      SignedChar;
816: typedef unsigned char                    UnsignedChar;
817: typedef struct {int a;      int b;     } PairInt;
818: typedef struct {PetscInt a; PetscInt b;} PairPetscInt;

820: template<typename Type>
821: static void PackInit_PairType(PetscSFLink link)
822: {
823:   link->d_Pack            = Pack<Type,1,1>;
824:   link->d_UnpackAndInsert = Unpack<Type,1,1>;
825:   link->d_UnpackAndMaxloc = UnpackAndOp<Type,Maxloc<Type>,1,1>;
826:   link->d_UnpackAndMinloc = UnpackAndOp<Type,Minloc<Type>,1,1>;

828:   link->d_ScatterAndInsert = ScatterAndOp<Type,Insert<Type>,1,1>;
829:   link->d_ScatterAndMaxloc = ScatterAndOp<Type,Maxloc<Type>,1,1>;
830:   link->d_ScatterAndMinloc = ScatterAndOp<Type,Minloc<Type>,1,1>;
831:   /* Atomics for pair types are not implemented yet */
832: }
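
/* Pair-type illustration (assumed mapping, consistent with the typedefs above): an MPI_2INT unit
   reduced with MPI_MAXLOC is handled by Maxloc<PairInt>, e.g.

     PairInt x = {5,3}, y = {5,1};
     Maxloc<PairInt> op;
     op(x,y);   // values tie on .a, so x.b becomes min(3,1) = 1, matching MPI_MAXLOC semantics
*/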

834: template<typename Type,PetscInt BS,PetscInt EQ>
835: static void PackInit_DumbType(PetscSFLink link)
836: {
837:   link->d_Pack             = Pack<Type,BS,EQ>;
838:   link->d_UnpackAndInsert  = Unpack<Type,BS,EQ>;
839:   link->d_ScatterAndInsert = ScatterAndInsert<Type,BS,EQ>;
840:   /* Atomics for dumb types are not implemented yet */
841: }

843: /* Some device-specific utilities */
844: static PetscErrorCode PetscSFLinkSyncDevice_CUDA(PetscSFLink link)
845: {
846:   cudaDeviceSynchronize();
847:   return 0;
848: }

850: static PetscErrorCode PetscSFLinkSyncStream_CUDA(PetscSFLink link)
851: {
852:   cudaStreamSynchronize(link->stream);
853:   return 0;
854: }

856: static PetscErrorCode PetscSFLinkMemcpy_CUDA(PetscSFLink link,PetscMemType dstmtype,void* dst,PetscMemType srcmtype,const void*src,size_t n)
857: {
858:   enum cudaMemcpyKind kinds[2][2] = {{cudaMemcpyHostToHost,cudaMemcpyHostToDevice},{cudaMemcpyDeviceToHost,cudaMemcpyDeviceToDevice}};

860:   if (n) {
861:     if (PetscMemTypeHost(dstmtype) && PetscMemTypeHost(srcmtype)) { /* Separate HostToHost so that pure-cpu code won't call cuda runtime */
862:       PetscMemcpy(dst,src,n);
863:     } else {
864:       int stype = PetscMemTypeDevice(srcmtype) ? 1 : 0;
865:       int dtype = PetscMemTypeDevice(dstmtype) ? 1 : 0;
866:       cudaMemcpyAsync(dst,src,n,kinds[stype][dtype],link->stream);
867:     }
868:   }
869:   return 0;
870: }

872: PetscErrorCode PetscSFMalloc_CUDA(PetscMemType mtype,size_t size,void** ptr)
873: {
874:   if (PetscMemTypeHost(mtype)) PetscMalloc(size,ptr);
875:   else if (PetscMemTypeDevice(mtype)) {
876:     PetscDeviceInitialize(PETSC_DEVICE_CUDA);
877:     cudaMalloc(ptr,size);
878:   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType %d", (int)mtype);
879:   return 0;
880: }

882: PetscErrorCode PetscSFFree_CUDA(PetscMemType mtype,void* ptr)
883: {
884:   if (PetscMemTypeHost(mtype)) PetscFree(ptr);
885:   else if (PetscMemTypeDevice(mtype)) cudaFree(ptr);
886:   else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType %d",(int)mtype);
887:   return 0;
888: }

 890: /* Destructor when the link uses MPI for communication on a CUDA device */
891: static PetscErrorCode PetscSFLinkDestroy_MPI_CUDA(PetscSF sf,PetscSFLink link)
892: {
893:   for (int i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
894:     cudaFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);
895:     cudaFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);
896:   }
897:   return 0;
898: }

 900: /* Some fields of link are initialized by PetscSFPackSetUp_Host. This routine only does what is needed on the device */
901: PetscErrorCode PetscSFLinkSetUp_CUDA(PetscSF sf,PetscSFLink link,MPI_Datatype unit)
902: {
903:   PetscInt       nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
904:   PetscBool      is2Int,is2PetscInt;
905: #if defined(PETSC_HAVE_COMPLEX)
906:   PetscInt       nPetscComplex=0;
907: #endif

909:   if (link->deviceinited) return 0;
910:   MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR,  &nSignedChar);
911:   MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);
 912:   /* MPI_CHAR is treated below as a dumb type that, according to the MPI standard, does not support reduction */
913:   MPIPetsc_Type_compare_contig(unit,MPI_INT,  &nInt);
914:   MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);
915:   MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);
916: #if defined(PETSC_HAVE_COMPLEX)
917:   MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);
918: #endif
919:   MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);
920:   MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);

922:   if (is2Int) {
923:     PackInit_PairType<PairInt>(link);
924:   } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
925:     PackInit_PairType<PairPetscInt>(link);
926:   } else if (nPetscReal) {
927:    #if !defined(PETSC_HAVE_DEVICE)
928:     if      (nPetscReal == 8) PackInit_RealType<PetscReal,8,1>(link); else if (nPetscReal%8 == 0) PackInit_RealType<PetscReal,8,0>(link);
929:     else if (nPetscReal == 4) PackInit_RealType<PetscReal,4,1>(link); else if (nPetscReal%4 == 0) PackInit_RealType<PetscReal,4,0>(link);
930:     else if (nPetscReal == 2) PackInit_RealType<PetscReal,2,1>(link); else if (nPetscReal%2 == 0) PackInit_RealType<PetscReal,2,0>(link);
931:     else if (nPetscReal == 1) PackInit_RealType<PetscReal,1,1>(link); else if (nPetscReal%1 == 0)
932:    #endif
933:     PackInit_RealType<PetscReal,1,0>(link);
934:   } else if (nPetscInt && sizeof(PetscInt) == sizeof(llint)) {
935:    #if !defined(PETSC_HAVE_DEVICE)
936:     if      (nPetscInt == 8) PackInit_IntegerType<llint,8,1>(link); else if (nPetscInt%8 == 0) PackInit_IntegerType<llint,8,0>(link);
937:     else if (nPetscInt == 4) PackInit_IntegerType<llint,4,1>(link); else if (nPetscInt%4 == 0) PackInit_IntegerType<llint,4,0>(link);
938:     else if (nPetscInt == 2) PackInit_IntegerType<llint,2,1>(link); else if (nPetscInt%2 == 0) PackInit_IntegerType<llint,2,0>(link);
939:     else if (nPetscInt == 1) PackInit_IntegerType<llint,1,1>(link); else if (nPetscInt%1 == 0)
940:    #endif
941:     PackInit_IntegerType<llint,1,0>(link);
942:   } else if (nInt) {
943:    #if !defined(PETSC_HAVE_DEVICE)
944:     if      (nInt == 8) PackInit_IntegerType<int,8,1>(link); else if (nInt%8 == 0) PackInit_IntegerType<int,8,0>(link);
945:     else if (nInt == 4) PackInit_IntegerType<int,4,1>(link); else if (nInt%4 == 0) PackInit_IntegerType<int,4,0>(link);
946:     else if (nInt == 2) PackInit_IntegerType<int,2,1>(link); else if (nInt%2 == 0) PackInit_IntegerType<int,2,0>(link);
947:     else if (nInt == 1) PackInit_IntegerType<int,1,1>(link); else if (nInt%1 == 0)
948:    #endif
949:     PackInit_IntegerType<int,1,0>(link);
950:   } else if (nSignedChar) {
951:    #if !defined(PETSC_HAVE_DEVICE)
952:     if      (nSignedChar == 8) PackInit_IntegerType<SignedChar,8,1>(link); else if (nSignedChar%8 == 0) PackInit_IntegerType<SignedChar,8,0>(link);
953:     else if (nSignedChar == 4) PackInit_IntegerType<SignedChar,4,1>(link); else if (nSignedChar%4 == 0) PackInit_IntegerType<SignedChar,4,0>(link);
954:     else if (nSignedChar == 2) PackInit_IntegerType<SignedChar,2,1>(link); else if (nSignedChar%2 == 0) PackInit_IntegerType<SignedChar,2,0>(link);
955:     else if (nSignedChar == 1) PackInit_IntegerType<SignedChar,1,1>(link); else if (nSignedChar%1 == 0)
956:    #endif
957:     PackInit_IntegerType<SignedChar,1,0>(link);
958:   }  else if (nUnsignedChar) {
959:    #if !defined(PETSC_HAVE_DEVICE)
960:     if      (nUnsignedChar == 8) PackInit_IntegerType<UnsignedChar,8,1>(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType<UnsignedChar,8,0>(link);
961:     else if (nUnsignedChar == 4) PackInit_IntegerType<UnsignedChar,4,1>(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType<UnsignedChar,4,0>(link);
962:     else if (nUnsignedChar == 2) PackInit_IntegerType<UnsignedChar,2,1>(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType<UnsignedChar,2,0>(link);
963:     else if (nUnsignedChar == 1) PackInit_IntegerType<UnsignedChar,1,1>(link); else if (nUnsignedChar%1 == 0)
964:    #endif
965:     PackInit_IntegerType<UnsignedChar,1,0>(link);
966: #if defined(PETSC_HAVE_COMPLEX)
967:   } else if (nPetscComplex) {
968:    #if !defined(PETSC_HAVE_DEVICE)
969:     if      (nPetscComplex == 8) PackInit_ComplexType<PetscComplex,8,1>(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType<PetscComplex,8,0>(link);
970:     else if (nPetscComplex == 4) PackInit_ComplexType<PetscComplex,4,1>(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType<PetscComplex,4,0>(link);
971:     else if (nPetscComplex == 2) PackInit_ComplexType<PetscComplex,2,1>(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType<PetscComplex,2,0>(link);
972:     else if (nPetscComplex == 1) PackInit_ComplexType<PetscComplex,1,1>(link); else if (nPetscComplex%1 == 0)
973:    #endif
974:     PackInit_ComplexType<PetscComplex,1,0>(link);
975: #endif
976:   } else {
977:     MPI_Aint lb,nbyte;
978:     MPI_Type_get_extent(unit,&lb,&nbyte);
 980:     if (nbyte % sizeof(int)) { /* If the type size is not a multiple of sizeof(int) */
981:      #if !defined(PETSC_HAVE_DEVICE)
982:       if      (nbyte == 4) PackInit_DumbType<char,4,1>(link); else if (nbyte%4 == 0) PackInit_DumbType<char,4,0>(link);
983:       else if (nbyte == 2) PackInit_DumbType<char,2,1>(link); else if (nbyte%2 == 0) PackInit_DumbType<char,2,0>(link);
984:       else if (nbyte == 1) PackInit_DumbType<char,1,1>(link); else if (nbyte%1 == 0)
985:      #endif
986:       PackInit_DumbType<char,1,0>(link);
987:     } else {
988:       nInt = nbyte / sizeof(int);
989:      #if !defined(PETSC_HAVE_DEVICE)
990:       if      (nInt == 8) PackInit_DumbType<int,8,1>(link); else if (nInt%8 == 0) PackInit_DumbType<int,8,0>(link);
991:       else if (nInt == 4) PackInit_DumbType<int,4,1>(link); else if (nInt%4 == 0) PackInit_DumbType<int,4,0>(link);
992:       else if (nInt == 2) PackInit_DumbType<int,2,1>(link); else if (nInt%2 == 0) PackInit_DumbType<int,2,0>(link);
993:       else if (nInt == 1) PackInit_DumbType<int,1,1>(link); else if (nInt%1 == 0)
994:      #endif
995:       PackInit_DumbType<int,1,0>(link);
996:     }
997:   }
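
  /* Dispatch example (illustrative): for a unit of 16 PetscReals, nPetscReal=16. Without
     PETSC_HAVE_DEVICE the chain above selects PackInit_RealType<PetscReal,8,0> (bs=16, BS=8, EQ=0);
     with PETSC_HAVE_DEVICE defined only PackInit_RealType<PetscReal,1,0> is instantiated, which we
     take to be a deliberate trade of vectorization width for fewer template instantiations and
     shorter compile times.
  */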

999:   if (!sf->maxResidentThreadsPerGPU) { /* Not initialized */
1000:     int                   device;
1001:     struct cudaDeviceProp props;
1002:     cudaGetDevice(&device);
1003:     cudaGetDeviceProperties(&props,device);
1004:     sf->maxResidentThreadsPerGPU = props.maxThreadsPerMultiProcessor*props.multiProcessorCount;
1005:   }
1006:   link->maxResidentThreadsPerGPU = sf->maxResidentThreadsPerGPU;

1008:   link->stream             = PetscDefaultCudaStream;
1009:   link->Destroy            = PetscSFLinkDestroy_MPI_CUDA;
1010:   link->SyncDevice         = PetscSFLinkSyncDevice_CUDA;
1011:   link->SyncStream         = PetscSFLinkSyncStream_CUDA;
1012:   link->Memcpy             = PetscSFLinkMemcpy_CUDA;
1013:   link->deviceinited       = PETSC_TRUE;
1014:   return 0;
1015: }