Actual source code: gs.c


  2: /***********************************gs.c***************************************

  4: Author: Henry M. Tufo III

  6: e-mail: hmt@cs.brown.edu

  8: snail-mail:
  9: Division of Applied Mathematics
 10: Brown University
 11: Providence, RI 02912

 13: Last Modification:
 14: 6.21.97
 15: ************************************gs.c**************************************/

 17: /***********************************gs.c***************************************
 18: File Description:
 19: -----------------

 21: ************************************gs.c**************************************/

 23: #include <../src/ksp/pc/impls/tfs/tfs.h>

 25: /* default length of number of items via tree - doubles if exceeded */
 26: #define TREE_BUF_SZ 2048;
 27: #define GS_VEC_SZ   1

 29: /***********************************gs.c***************************************
 30: Type: struct gather_scatter_id
 31: ------------------------------

 33: ************************************gs.c**************************************/
 34: typedef struct gather_scatter_id {
 35:   PetscInt    id;
 36:   PetscInt    nel_min;
 37:   PetscInt    nel_max;
 38:   PetscInt    nel_sum;
 39:   PetscInt    negl;
 40:   PetscInt    gl_max;
 41:   PetscInt    gl_min;
 42:   PetscInt    repeats;
 43:   PetscInt    ordered;
 44:   PetscInt    positive;
 45:   PetscScalar *vals;

 47:   /* bit mask info */
 48:   PetscInt *my_proc_mask;
 49:   PetscInt mask_sz;
 50:   PetscInt *ngh_buf;
 51:   PetscInt ngh_buf_sz;
 52:   PetscInt *nghs;
 53:   PetscInt num_nghs;
 54:   PetscInt max_nghs;
 55:   PetscInt *pw_nghs;
 56:   PetscInt num_pw_nghs;
 57:   PetscInt *tree_nghs;
 58:   PetscInt num_tree_nghs;

 60:   PetscInt num_loads;

 62:   /* repeats == true -> local info */
 63:   PetscInt nel;         /* number of unique elememts */
 64:   PetscInt *elms;       /* of size nel */
 65:   PetscInt nel_total;
 66:   PetscInt *local_elms; /* of size nel_total */
 67:   PetscInt *companion;  /* of size nel_total */

 69:   /* local info */
 70:   PetscInt num_local_total;
 71:   PetscInt local_strength;
 72:   PetscInt num_local;
 73:   PetscInt *num_local_reduce;
 74:   PetscInt **local_reduce;
 75:   PetscInt num_local_gop;
 76:   PetscInt *num_gop_local_reduce;
 77:   PetscInt **gop_local_reduce;

 79:   /* pairwise info */
 80:   PetscInt    level;
 81:   PetscInt    num_pairs;
 82:   PetscInt    max_pairs;
 83:   PetscInt    loc_node_pairs;
 84:   PetscInt    max_node_pairs;
 85:   PetscInt    min_node_pairs;
 86:   PetscInt    avg_node_pairs;
 87:   PetscInt    *pair_list;
 88:   PetscInt    *msg_sizes;
 89:   PetscInt    **node_list;
 90:   PetscInt    len_pw_list;
 91:   PetscInt    *pw_elm_list;
 92:   PetscScalar *pw_vals;

 94:   MPI_Request *msg_ids_in;
 95:   MPI_Request *msg_ids_out;

 97:   PetscScalar *out;
 98:   PetscScalar *in;
 99:   PetscInt    msg_total;

101:   /* tree - crystal accumulator info */
102:   PetscInt max_left_over;
103:   PetscInt *pre;
104:   PetscInt *in_num;
105:   PetscInt *out_num;
106:   PetscInt **in_list;
107:   PetscInt **out_list;

109:   /* new tree work*/
110:   PetscInt    tree_nel;
111:   PetscInt    *tree_elms;
112:   PetscScalar *tree_buf;
113:   PetscScalar *tree_work;

115:   PetscInt tree_map_sz;
116:   PetscInt *tree_map_in;
117:   PetscInt *tree_map_out;

119:   /* current memory status */
120:   PetscInt gl_bss_min;
121:   PetscInt gl_perm_min;

123:   /* max segment size for PCTFS_gs_gop_vec() */
124:   PetscInt vec_sz;

126:   /* hack to make paul happy */
127:   MPI_Comm PCTFS_gs_comm;

129: } PCTFS_gs_id;

131: static PCTFS_gs_id *gsi_check_args(PetscInt *elms, PetscInt nel, PetscInt level);
132: static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs);
133: static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs);
134: static PetscErrorCode set_pairwise(PCTFS_gs_id *gs);
135: static PCTFS_gs_id *gsi_new(void);
136: static PetscErrorCode set_tree(PCTFS_gs_id *gs);

138: /* same for all but vector flavor */
139: static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals);
140: /* vector flavor */
141: static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);

143: static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step);
144: static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step);
145: static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
146: static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
147: static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);

149: static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals);
150: static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals);

152: static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
153: static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
154: static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim);

156: /* global vars */
157: /* from comm.c module */

159: static PetscInt num_gs_ids = 0;

161: /* should make this dynamic ... later */
162: static PetscInt msg_buf    =MAX_MSG_BUF;
163: static PetscInt vec_sz     =GS_VEC_SZ;
164: static PetscInt *tree_buf  =NULL;
165: static PetscInt tree_buf_sz=0;
166: static PetscInt ntree      =0;

168: /***************************************************************************/
169: PetscErrorCode PCTFS_gs_init_vec_sz(PetscInt size)
170: {
171:   vec_sz = size;
172:   return 0;
173: }

175: /******************************************************************************/
176: PetscErrorCode PCTFS_gs_init_msg_buf_sz(PetscInt buf_size)
177: {
178:   msg_buf = buf_size;
179:   return 0;
180: }

182: /******************************************************************************/
183: PCTFS_gs_id *PCTFS_gs_init(PetscInt *elms, PetscInt nel, PetscInt level)
184: {
185:   PCTFS_gs_id    *gs;
186:   MPI_Group      PCTFS_gs_group;
187:   MPI_Comm       PCTFS_gs_comm;

189:   /* ensure that communication package has been initialized */
190:   PCTFS_comm_init();

192:   /* determines if we have enough dynamic/semi-static memory */
193:   /* checks input, allocs and sets gd_id template            */
194:   gs = gsi_check_args(elms,nel,level);

196:   /* only bit mask version up and working for the moment    */
197:   /* LATER :: get int list version working for sparse pblms */
198:   PETSC_COMM_WORLD,gsi_via_bit_mask(gs);

200:   PETSC_COMM_WORLD,MPI_Comm_group(MPI_COMM_WORLD,&PCTFS_gs_group);
201:   PETSC_COMM_WORLD,MPI_Comm_create(MPI_COMM_WORLD,PCTFS_gs_group,&PCTFS_gs_comm);
202:   PETSC_COMM_WORLD,MPI_Group_free(&PCTFS_gs_group);

204:   gs->PCTFS_gs_comm=PCTFS_gs_comm;

206:   return(gs);
207: }

209: /******************************************************************************/
210: static PCTFS_gs_id *gsi_new(void)
211: {
212:   PCTFS_gs_id    *gs;
213:   gs   = (PCTFS_gs_id*) malloc(sizeof(PCTFS_gs_id));
214:   PETSC_COMM_WORLD,PetscMemzero(gs,sizeof(PCTFS_gs_id));
215:   return(gs);
216: }

218: /******************************************************************************/
219: static PCTFS_gs_id *gsi_check_args(PetscInt *in_elms, PetscInt nel, PetscInt level)
220: {
221:   PetscInt       i, j, k, t2;
222:   PetscInt       *companion, *elms, *unique, *iptr;
223:   PetscInt       num_local=0, *num_to_reduce, **local_reduce;
224:   PetscInt       oprs[]   = {NON_UNIFORM,GL_MIN,GL_MAX,GL_ADD,GL_MIN,GL_MAX,GL_MIN,GL_B_AND};
225:   PetscInt       vals[sizeof(oprs)/sizeof(oprs[0])-1];
226:   PetscInt       work[sizeof(oprs)/sizeof(oprs[0])-1];
227:   PCTFS_gs_id    *gs;

229:   if (!in_elms) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"elms point to nothing!!!\n");
230:   if (nel<0)    SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"can't have fewer than 0 elms!!!\n");

232:   if (nel==0) PETSC_COMM_WORLD,PetscInfo(0,"I don't have any elements!!!\n");

234:   /* get space for gs template */
235:   gs     = gsi_new();
236:   gs->id = ++num_gs_ids;

238:   /* hmt 6.4.99                                            */
239:   /* caller can set global ids that don't participate to 0 */
240:   /* PCTFS_gs_init ignores all zeros in elm list                 */
241:   /* negative global ids are still invalid                 */
242:   for (i=j=0; i<nel; i++) {
243:     if (in_elms[i]!=0) j++;
244:   }

246:   k=nel; nel=j;

248:   /* copy over in_elms list and create inverse map */
249:   elms      = (PetscInt*) malloc((nel+1)*sizeof(PetscInt));
250:   companion = (PetscInt*) malloc(nel*sizeof(PetscInt));

252:   for (i=j=0; i<k; i++) {
253:     if (in_elms[i]!=0) { elms[j] = in_elms[i]; companion[j++] = i; }
254:   }

256:   if (j!=nel) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"nel j mismatch!\n");

258:   /* pre-pass ... check to see if sorted */
259:   elms[nel] = INT_MAX;
260:   iptr      = elms;
261:   unique    = elms+1;
262:   j         =0;
263:   while (*iptr!=INT_MAX) {
264:     if (*iptr++>*unique++) { j=1; break; }
265:   }

267:   /* set up inverse map */
268:   if (j) {
269:     PETSC_COMM_WORLD,PetscInfo(0,"gsi_check_args() :: elm list *not* sorted!\n");
270:     PETSC_COMM_WORLD,PCTFS_SMI_sort((void*)elms, (void*)companion, nel, SORT_INTEGER);
271:   } else PETSC_COMM_WORLD,PetscInfo(0,"gsi_check_args() :: elm list sorted!\n");
272:   elms[nel] = INT_MIN;

274:   /* first pass */
275:   /* determine number of unique elements, check pd */
276:   for (i=k=0; i<nel; i+=j) {
277:     t2 = elms[i];
278:     j  = ++i;

280:     /* clump 'em for now */
281:     while (elms[j]==t2) j++;

283:     /* how many together and num local */
284:     if (j-=i) { num_local++; k+=j; }
285:   }

287:   /* how many unique elements? */
288:   gs->repeats = k;
289:   gs->nel     = nel-k;

291:   /* number of repeats? */
292:   gs->num_local        = num_local;
293:   num_local           += 2;
294:   gs->local_reduce     = local_reduce=(PetscInt**)malloc(num_local*sizeof(PetscInt*));
295:   gs->num_local_reduce = num_to_reduce=(PetscInt*) malloc(num_local*sizeof(PetscInt));

297:   unique         = (PetscInt*) malloc((gs->nel+1)*sizeof(PetscInt));
298:   gs->elms       = unique;
299:   gs->nel_total  = nel;
300:   gs->local_elms = elms;
301:   gs->companion  = companion;

303:   /* compess map as well as keep track of local ops */
304:   for (num_local=i=j=0; i<gs->nel; i++) {
305:     k            = j;
306:     t2           = unique[i] = elms[j];
307:     companion[i] = companion[j];

309:     while (elms[j]==t2) j++;

311:     if ((t2=(j-k))>1) {
312:       /* number together */
313:       num_to_reduce[num_local] = t2++;

315:       iptr = local_reduce[num_local++] = (PetscInt*)malloc(t2*sizeof(PetscInt));

317:       /* to use binary searching don't remap until we check intersection */
318:       *iptr++ = i;

320:       /* note that we're skipping the first one */
321:       while (++k<j) *(iptr++) = companion[k];
322:       *iptr = -1;
323:     }
324:   }

326:   /* sentinel for ngh_buf */
327:   unique[gs->nel]=INT_MAX;

329:   /* for two partition sort hack */
330:   num_to_reduce[num_local]   = 0;
331:   local_reduce[num_local]    = NULL;
332:   num_to_reduce[++num_local] = 0;
333:   local_reduce[num_local]    = NULL;

335:   /* load 'em up */
336:   /* note one extra to hold NON_UNIFORM flag!!! */
337:   vals[2] = vals[1] = vals[0] = nel;
338:   if (gs->nel>0) {
339:     vals[3] = unique[0];
340:     vals[4] = unique[gs->nel-1];
341:   } else {
342:     vals[3] = INT_MAX;
343:     vals[4] = INT_MIN;
344:   }
345:   vals[5] = level;
346:   vals[6] = num_gs_ids;

348:   /* GLOBAL: send 'em out */
349:   PETSC_COMM_WORLD,PCTFS_giop(vals,work,sizeof(oprs)/sizeof(oprs[0])-1,oprs);

351:   /* must be semi-pos def - only pairwise depends on this */
352:   /* LATER - remove this restriction */
353:   if (vals[3]<0) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system not semi-pos def \n");
354:   if (vals[4]==INT_MAX) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system ub too large !\n");

356:   gs->nel_min = vals[0];
357:   gs->nel_max = vals[1];
358:   gs->nel_sum = vals[2];
359:   gs->gl_min  = vals[3];
360:   gs->gl_max  = vals[4];
361:   gs->negl    = vals[4]-vals[3]+1;

363:   if (gs->negl<=0) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system empty or neg :: %d\n");

365:   /* LATER :: add level == -1 -> program selects level */
366:   if (vals[5]<0) vals[5]=0;
367:   else if (vals[5]>PCTFS_num_nodes) vals[5]=PCTFS_num_nodes;
368:   gs->level = vals[5];

370:   return(gs);
371: }

373: /******************************************************************************/
374: static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs)
375: {
376:   PetscInt       i, nel, *elms;
377:   PetscInt       t1;
378:   PetscInt       **reduce;
379:   PetscInt       *map;

381:   /* totally local removes ... PCTFS_ct_bits == 0 */
382:   get_ngh_buf(gs);

384:   if (gs->level) set_pairwise(gs);
385:   if (gs->max_left_over) set_tree(gs);

387:   /* intersection local and pairwise/tree? */
388:   gs->num_local_total      = gs->num_local;
389:   gs->gop_local_reduce     = gs->local_reduce;
390:   gs->num_gop_local_reduce = gs->num_local_reduce;

392:   map = gs->companion;

394:   /* is there any local compression */
395:   if (!gs->num_local) {
396:     gs->local_strength = NONE;
397:     gs->num_local_gop  = 0;
398:   } else {
399:     /* ok find intersection */
400:     map    = gs->companion;
401:     reduce = gs->local_reduce;
402:     for (i=0, t1=0; i<gs->num_local; i++, reduce++) {
403:       if ((PCTFS_ivec_binary_search(**reduce,gs->pw_elm_list,gs->len_pw_list)>=0) || PCTFS_ivec_binary_search(**reduce,gs->tree_map_in,gs->tree_map_sz)>=0) {
404:         t1++;
406:         gs->num_local_reduce[i] *= -1;
407:       }
408:       **reduce=map[**reduce];
409:     }

411:     /* intersection is empty */
412:     if (!t1) {
413:       gs->local_strength = FULL;
414:       gs->num_local_gop  = 0;
415:     } else { /* intersection not empty */
416:       gs->local_strength = PARTIAL;

418:       PCTFS_SMI_sort((void*)gs->num_local_reduce, (void*)gs->local_reduce, gs->num_local + 1, SORT_INT_PTR);

420:       gs->num_local_gop        = t1;
421:       gs->num_local_total      =  gs->num_local;
422:       gs->num_local           -= t1;
423:       gs->gop_local_reduce     = gs->local_reduce;
424:       gs->num_gop_local_reduce = gs->num_local_reduce;

426:       for (i=0; i<t1; i++) {
428:         gs->num_gop_local_reduce[i] *= -1;
429:         gs->local_reduce++;
430:         gs->num_local_reduce++;
431:       }
432:       gs->local_reduce++;
433:       gs->num_local_reduce++;
434:     }
435:   }

437:   elms = gs->pw_elm_list;
438:   nel  = gs->len_pw_list;
439:   for (i=0; i<nel; i++) elms[i] = map[elms[i]];

441:   elms = gs->tree_map_in;
442:   nel  = gs->tree_map_sz;
443:   for (i=0; i<nel; i++) elms[i] = map[elms[i]];

445:   /* clean up */
446:   free((void*) gs->local_elms);
447:   free((void*) gs->companion);
448:   free((void*) gs->elms);
449:   free((void*) gs->ngh_buf);
450:   gs->local_elms = gs->companion = gs->elms = gs->ngh_buf = NULL;
451:   return 0;
452: }

454: /******************************************************************************/
455: static PetscErrorCode place_in_tree(PetscInt elm)
456: {
457:   PetscInt *tp, n;

459:   if (ntree==tree_buf_sz) {
460:     if (tree_buf_sz) {
461:       tp           = tree_buf;
462:       n            = tree_buf_sz;
463:       tree_buf_sz<<=1;
464:       tree_buf     = (PetscInt*)malloc(tree_buf_sz*sizeof(PetscInt));
465:       PCTFS_ivec_copy(tree_buf,tp,n);
466:       free(tp);
467:     } else {
468:       tree_buf_sz = TREE_BUF_SZ;
469:       tree_buf    = (PetscInt*)malloc(tree_buf_sz*sizeof(PetscInt));
470:     }
471:   }

473:   tree_buf[ntree++] = elm;
474:   return 0;
475: }

477: /******************************************************************************/
478: static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs)
479: {
480:   PetscInt       i, j, npw=0, ntree_map=0;
481:   PetscInt       p_mask_size, ngh_buf_size, buf_size;
482:   PetscInt       *p_mask, *sh_proc_mask, *pw_sh_proc_mask;
483:   PetscInt       *ngh_buf, *buf1, *buf2;
484:   PetscInt       offset, per_load, num_loads, or_ct, start, end;
485:   PetscInt       *ptr1, *ptr2, i_start, negl, nel, *elms;
486:   PetscInt       oper=GL_B_OR;
487:   PetscInt       *ptr3, *t_mask, level, ct1, ct2;

489:   /* to make life easier */
490:   nel   = gs->nel;
491:   elms  = gs->elms;
492:   level = gs->level;

494:   /* det #bytes needed for processor bit masks and init w/mask cor. to PCTFS_my_id */
495:   p_mask = (PetscInt*) malloc(p_mask_size=PCTFS_len_bit_mask(PCTFS_num_nodes));
496:   PCTFS_set_bit_mask(p_mask,p_mask_size,PCTFS_my_id);

498:   /* allocate space for masks and info bufs */
499:   gs->nghs       = sh_proc_mask = (PetscInt*) malloc(p_mask_size);
500:   gs->pw_nghs    = pw_sh_proc_mask = (PetscInt*) malloc(p_mask_size);
501:   gs->ngh_buf_sz = ngh_buf_size = p_mask_size*nel;
502:   t_mask         = (PetscInt*) malloc(p_mask_size);
503:   gs->ngh_buf    = ngh_buf = (PetscInt*) malloc(ngh_buf_size);

505:   /* comm buffer size ... memory usage bounded by ~2*msg_buf */
506:   /* had thought I could exploit rendezvous threshold */

508:   /* default is one pass */
509:   per_load      = negl  = gs->negl;
510:   gs->num_loads = num_loads = 1;
511:   i             = p_mask_size*negl;

513:   /* possible overflow on buffer size */
514:   /* overflow hack                    */
515:   if (i<0) i=INT_MAX;

517:   buf_size = PetscMin(msg_buf,i);

519:   /* can we do it? */

522:   /* get PCTFS_giop buf space ... make *only* one malloc */
523:   buf1 = (PetscInt*) malloc(buf_size<<1);

525:   /* more than one gior exchange needed? */
526:   if (buf_size!=i) {
527:     per_load      = buf_size/p_mask_size;
528:     buf_size      = per_load*p_mask_size;
529:     gs->num_loads = num_loads = negl/per_load + (negl%per_load>0);
530:   }

532:   /* convert buf sizes from #bytes to #ints - 32 bit only! */
533:   p_mask_size/=sizeof(PetscInt); ngh_buf_size/=sizeof(PetscInt); buf_size/=sizeof(PetscInt);

535:   /* find PCTFS_giop work space */
536:   buf2 = buf1+buf_size;

538:   /* hold #ints needed for processor masks */
539:   gs->mask_sz=p_mask_size;

541:   /* init buffers */
542:   PCTFS_ivec_zero(sh_proc_mask,p_mask_size);
543:   PCTFS_ivec_zero(pw_sh_proc_mask,p_mask_size);
544:   PCTFS_ivec_zero(ngh_buf,ngh_buf_size);

546:   /* HACK reset tree info */
547:   tree_buf    = NULL;
548:   tree_buf_sz = ntree = 0;

550:   /* ok do it */
551:   for (ptr1=ngh_buf,ptr2=elms,end=gs->gl_min,or_ct=i=0; or_ct<num_loads; or_ct++) {
552:     /* identity for bitwise or is 000...000 */
553:     PCTFS_ivec_zero(buf1,buf_size);

555:     /* load msg buffer */
556:     for (start=end,end+=per_load,i_start=i; (offset=*ptr2)<end; i++, ptr2++) {
557:       offset = (offset-start)*p_mask_size;
558:       PCTFS_ivec_copy(buf1+offset,p_mask,p_mask_size);
559:     }

561:     /* GLOBAL: pass buffer */
562:     PCTFS_giop(buf1,buf2,buf_size,&oper);

564:     /* unload buffer into ngh_buf */
565:     ptr2=(elms+i_start);
566:     for (ptr3=buf1,j=start; j<end; ptr3+=p_mask_size,j++) {
567:       /* I own it ... may have to pairwise it */
568:       if (j==*ptr2) {
569:         /* do i share it w/anyone? */
570:         ct1 = PCTFS_ct_bits((char*)ptr3,p_mask_size*sizeof(PetscInt));
571:         /* guess not */
572:         if (ct1<2) { ptr2++; ptr1+=p_mask_size; continue; }

574:         /* i do ... so keep info and turn off my bit */
575:         PCTFS_ivec_copy(ptr1,ptr3,p_mask_size);
576:         PCTFS_ivec_xor(ptr1,p_mask,p_mask_size);
577:         PCTFS_ivec_or(sh_proc_mask,ptr1,p_mask_size);

579:         /* is it to be done pairwise? */
580:         if (--ct1<=level) {
581:           npw++;

583:           /* turn on high bit to indicate pw need to process */
584:           *ptr2++ |= TOP_BIT;
585:           PCTFS_ivec_or(pw_sh_proc_mask,ptr1,p_mask_size);
586:           ptr1    += p_mask_size;
587:           continue;
588:         }

590:         /* get set for next and note that I have a tree contribution */
591:         /* could save exact elm index for tree here -> save a search */
592:         ptr2++; ptr1+=p_mask_size; ntree_map++;
593:       } else { /* i don't but still might be involved in tree */

595:         /* shared by how many? */
596:         ct1 = PCTFS_ct_bits((char*)ptr3,p_mask_size*sizeof(PetscInt));

598:         /* none! */
599:         if (ct1<2) continue;

601:         /* is it going to be done pairwise? but not by me of course!*/
602:         if (--ct1<=level) continue;
603:       }
604:       /* LATER we're going to have to process it NOW */
605:       /* nope ... tree it */
606:       place_in_tree(j);
607:     }
608:   }

610:   free((void*)t_mask);
611:   free((void*)buf1);

613:   gs->len_pw_list = npw;
614:   gs->num_nghs    = PCTFS_ct_bits((char*)sh_proc_mask,p_mask_size*sizeof(PetscInt));

616:   /* expand from bit mask list to int list and save ngh list */
617:   gs->nghs = (PetscInt*) malloc(gs->num_nghs * sizeof(PetscInt));
618:   PCTFS_bm_to_proc((char*)sh_proc_mask,p_mask_size*sizeof(PetscInt),gs->nghs);

620:   gs->num_pw_nghs = PCTFS_ct_bits((char*)pw_sh_proc_mask,p_mask_size*sizeof(PetscInt));

622:   oper         = GL_MAX;
623:   ct1          = gs->num_nghs;
624:   PCTFS_giop(&ct1,&ct2,1,&oper);
625:   gs->max_nghs = ct1;

627:   gs->tree_map_sz  = ntree_map;
628:   gs->max_left_over=ntree;

630:   free((void*)p_mask);
631:   free((void*)sh_proc_mask);
632:   return 0;
633: }

635: /******************************************************************************/
636: static PetscErrorCode set_pairwise(PCTFS_gs_id *gs)
637: {
638:   PetscInt       i, j;
639:   PetscInt       p_mask_size;
640:   PetscInt       *p_mask, *sh_proc_mask, *tmp_proc_mask;
641:   PetscInt       *ngh_buf, *buf2;
642:   PetscInt       offset;
643:   PetscInt       *msg_list, *msg_size, **msg_nodes, nprs;
644:   PetscInt       *pairwise_elm_list, len_pair_list=0;
645:   PetscInt       *iptr, t1, i_start, nel, *elms;
646:   PetscInt       ct;

648:   /* to make life easier */
649:   nel          = gs->nel;
650:   elms         = gs->elms;
651:   ngh_buf      = gs->ngh_buf;
652:   sh_proc_mask = gs->pw_nghs;

654:   /* need a few temp masks */
655:   p_mask_size   = PCTFS_len_bit_mask(PCTFS_num_nodes);
656:   p_mask        = (PetscInt*) malloc(p_mask_size);
657:   tmp_proc_mask = (PetscInt*) malloc(p_mask_size);

659:   /* set mask to my PCTFS_my_id's bit mask */
660:   PCTFS_set_bit_mask(p_mask,p_mask_size,PCTFS_my_id);

662:   p_mask_size /= sizeof(PetscInt);

664:   len_pair_list   = gs->len_pw_list;
665:   gs->pw_elm_list = pairwise_elm_list=(PetscInt*)malloc((len_pair_list+1)*sizeof(PetscInt));

667:   /* how many processors (nghs) do we have to exchange with? */
668:   nprs = gs->num_pairs = PCTFS_ct_bits((char*)sh_proc_mask,p_mask_size*sizeof(PetscInt));

670:   /* allocate space for PCTFS_gs_gop() info */
671:   gs->pair_list = msg_list  = (PetscInt*)  malloc(sizeof(PetscInt)*nprs);
672:   gs->msg_sizes = msg_size  = (PetscInt*)  malloc(sizeof(PetscInt)*nprs);
673:   gs->node_list = msg_nodes = (PetscInt**) malloc(sizeof(PetscInt*)*(nprs+1));

675:   /* init msg_size list */
676:   PCTFS_ivec_zero(msg_size,nprs);

678:   /* expand from bit mask list to int list */
679:   PCTFS_bm_to_proc((char*)sh_proc_mask,p_mask_size*sizeof(PetscInt),msg_list);

681:   /* keep list of elements being handled pairwise */
682:   for (i=j=0; i<nel; i++) {
683:     if (elms[i] & TOP_BIT) { elms[i] ^= TOP_BIT; pairwise_elm_list[j++] = i; }
684:   }
685:   pairwise_elm_list[j] = -1;

687:   gs->msg_ids_out       = (MPI_Request*)  malloc(sizeof(MPI_Request)*(nprs+1));
688:   gs->msg_ids_out[nprs] = MPI_REQUEST_NULL;
689:   gs->msg_ids_in        = (MPI_Request*)  malloc(sizeof(MPI_Request)*(nprs+1));
690:   gs->msg_ids_in[nprs]  = MPI_REQUEST_NULL;
691:   gs->pw_vals           = (PetscScalar*) malloc(sizeof(PetscScalar)*len_pair_list*vec_sz);

693:   /* find who goes to each processor */
694:   for (i_start=i=0; i<nprs; i++) {
695:     /* processor i's mask */
696:     PCTFS_set_bit_mask(p_mask,p_mask_size*sizeof(PetscInt),msg_list[i]);

698:     /* det # going to processor i */
699:     for (ct=j=0; j<len_pair_list; j++) {
700:       buf2 = ngh_buf+(pairwise_elm_list[j]*p_mask_size);
701:       PCTFS_ivec_and3(tmp_proc_mask,p_mask,buf2,p_mask_size);
702:       if (PCTFS_ct_bits((char*)tmp_proc_mask,p_mask_size*sizeof(PetscInt))) ct++;
703:     }
704:     msg_size[i] = ct;
705:     i_start     = PetscMax(i_start,ct);

707:     /*space to hold nodes in message to first neighbor */
708:     msg_nodes[i] = iptr = (PetscInt*) malloc(sizeof(PetscInt)*(ct+1));

710:     for (j=0;j<len_pair_list;j++) {
711:       buf2 = ngh_buf+(pairwise_elm_list[j]*p_mask_size);
712:       PCTFS_ivec_and3(tmp_proc_mask,p_mask,buf2,p_mask_size);
713:       if (PCTFS_ct_bits((char*)tmp_proc_mask,p_mask_size*sizeof(PetscInt))) *iptr++ = j;
714:     }
715:     *iptr = -1;
716:   }
717:   msg_nodes[nprs] = NULL;

719:   j                  = gs->loc_node_pairs=i_start;
720:   t1                 = GL_MAX;
721:   PCTFS_giop(&i_start,&offset,1,&t1);
722:   gs->max_node_pairs = i_start;

724:   i_start            = j;
725:   t1                 = GL_MIN;
726:   PCTFS_giop(&i_start,&offset,1,&t1);
727:   gs->min_node_pairs = i_start;

729:   i_start            = j;
730:   t1                 = GL_ADD;
731:   PCTFS_giop(&i_start,&offset,1,&t1);
732:   gs->avg_node_pairs = i_start/PCTFS_num_nodes + 1;

734:   i_start = nprs;
735:   t1      = GL_MAX;
736:   PCTFS_giop(&i_start,&offset,1,&t1);
737:   gs->max_pairs = i_start;

739:   /* remap pairwise in tail of gsi_via_bit_mask() */
740:   gs->msg_total = PCTFS_ivec_sum(gs->msg_sizes,nprs);
741:   gs->out       = (PetscScalar*) malloc(sizeof(PetscScalar)*gs->msg_total*vec_sz);
742:   gs->in        = (PetscScalar*) malloc(sizeof(PetscScalar)*gs->msg_total*vec_sz);

744:   /* reset malloc pool */
745:   free((void*)p_mask);
746:   free((void*)tmp_proc_mask);
747:   return 0;
748: }

750: /* to do pruned tree just save ngh buf copy for each one and decode here!
751: ******************************************************************************/
752: static PetscErrorCode set_tree(PCTFS_gs_id *gs)
753: {
754:   PetscInt i, j, n, nel;
755:   PetscInt *iptr_in, *iptr_out, *tree_elms, *elms;

757:   /* local work ptrs */
758:   elms = gs->elms;
759:   nel  = gs->nel;

761:   /* how many via tree */
762:   gs->tree_nel     = n = ntree;
763:   gs->tree_elms    = tree_elms = iptr_in = tree_buf;
764:   gs->tree_buf     = (PetscScalar*) malloc(sizeof(PetscScalar)*n*vec_sz);
765:   gs->tree_work    = (PetscScalar*) malloc(sizeof(PetscScalar)*n*vec_sz);
766:   j                = gs->tree_map_sz;
767:   gs->tree_map_in  = iptr_in  = (PetscInt*) malloc(sizeof(PetscInt)*(j+1));
768:   gs->tree_map_out = iptr_out = (PetscInt*) malloc(sizeof(PetscInt)*(j+1));

770:   /* search the longer of the two lists */
771:   /* note ... could save this info in get_ngh_buf and save searches */
772:   if (n<=nel) {
773:     /* bijective fct w/remap - search elm list */
774:     for (i=0; i<n; i++) {
775:       if ((j=PCTFS_ivec_binary_search(*tree_elms++,elms,nel))>=0) {*iptr_in++ = j; *iptr_out++ = i;}
776:     }
777:   } else {
778:     for (i=0; i<nel; i++) {
779:       if ((j=PCTFS_ivec_binary_search(*elms++,tree_elms,n))>=0) {*iptr_in++ = i; *iptr_out++ = j;}
780:     }
781:   }

783:   /* sentinel */
784:   *iptr_in = *iptr_out = -1;
785:   return 0;
786: }

788: /******************************************************************************/
789: static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs,  PetscScalar *vals)
790: {
791:   PetscInt    *num, *map, **reduce;
792:   PetscScalar tmp;

794:   num    = gs->num_gop_local_reduce;
795:   reduce = gs->gop_local_reduce;
796:   while ((map = *reduce++)) {
797:     /* wall */
798:     if (*num == 2) {
799:       num++;
800:       vals[map[1]] = vals[map[0]];
801:     } else if (*num == 3) { /* corner shared by three elements */
802:       num++;
803:       vals[map[2]] = vals[map[1]] = vals[map[0]];
804:     } else if (*num == 4) { /* corner shared by four elements */
805:       num++;
806:       vals[map[3]] = vals[map[2]] = vals[map[1]] = vals[map[0]];
807:     } else { /* general case ... odd geoms ... 3D*/
808:       num++;
809:       tmp = *(vals + *map++);
810:       while (*map >= 0) *(vals + *map++) = tmp;
811:     }
812:   }
813:   return 0;
814: }

816: /******************************************************************************/
817: static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs,  PetscScalar *vals)
818: {
819:   PetscInt    *num, *map, **reduce;
820:   PetscScalar tmp;

822:   num    = gs->num_local_reduce;
823:   reduce = gs->local_reduce;
824:   while ((map = *reduce)) {
825:     /* wall */
826:     if (*num == 2) {
827:       num++; reduce++;
828:       vals[map[1]] = vals[map[0]] += vals[map[1]];
829:     } else if (*num == 3) { /* corner shared by three elements */
830:       num++; reduce++;
831:       vals[map[2]]=vals[map[1]]=vals[map[0]]+=(vals[map[1]]+vals[map[2]]);
832:     } else if (*num == 4) { /* corner shared by four elements */
833:       num++; reduce++;
834:       vals[map[1]]=vals[map[2]]=vals[map[3]]=vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
835:     } else { /* general case ... odd geoms ... 3D*/
836:       num++;
837:       tmp = 0.0;
838:       while (*map >= 0) tmp += *(vals + *map++);

840:       map = *reduce++;
841:       while (*map >= 0) *(vals + *map++) = tmp;
842:     }
843:   }
844:   return 0;
845: }

847: /******************************************************************************/
848: static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs,  PetscScalar *vals)
849: {
850:   PetscInt    *num, *map, **reduce;
851:   PetscScalar *base;

853:   num    = gs->num_gop_local_reduce;
854:   reduce = gs->gop_local_reduce;
855:   while ((map = *reduce++)) {
856:     /* wall */
857:     if (*num == 2) {
858:       num++;
859:       vals[map[0]] += vals[map[1]];
860:     } else if (*num == 3) { /* corner shared by three elements */
861:       num++;
862:       vals[map[0]] += (vals[map[1]] + vals[map[2]]);
863:     } else if (*num == 4) { /* corner shared by four elements */
864:       num++;
865:       vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
866:     } else { /* general case ... odd geoms ... 3D*/
867:       num++;
868:       base = vals + *map++;
869:       while (*map >= 0) *base += *(vals + *map++);
870:     }
871:   }
872:   return 0;
873: }

875: /******************************************************************************/
876: PetscErrorCode PCTFS_gs_free(PCTFS_gs_id *gs)
877: {
878:   PetscInt       i;

880:   MPI_Comm_free(&gs->PCTFS_gs_comm);
881:   if (gs->nghs) free((void*) gs->nghs);
882:   if (gs->pw_nghs) free((void*) gs->pw_nghs);

884:   /* tree */
885:   if (gs->max_left_over) {
886:     if (gs->tree_elms) free((void*) gs->tree_elms);
887:     if (gs->tree_buf) free((void*) gs->tree_buf);
888:     if (gs->tree_work) free((void*) gs->tree_work);
889:     if (gs->tree_map_in) free((void*) gs->tree_map_in);
890:     if (gs->tree_map_out) free((void*) gs->tree_map_out);
891:   }

893:   /* pairwise info */
894:   if (gs->num_pairs) {
895:     /* should be NULL already */
896:     if (gs->ngh_buf) free((void*) gs->ngh_buf);
897:     if (gs->elms) free((void*) gs->elms);
898:     if (gs->local_elms) free((void*) gs->local_elms);
899:     if (gs->companion) free((void*) gs->companion);

901:     /* only set if pairwise */
902:     if (gs->vals) free((void*) gs->vals);
903:     if (gs->in) free((void*) gs->in);
904:     if (gs->out) free((void*) gs->out);
905:     if (gs->msg_ids_in) free((void*) gs->msg_ids_in);
906:     if (gs->msg_ids_out) free((void*) gs->msg_ids_out);
907:     if (gs->pw_vals) free((void*) gs->pw_vals);
908:     if (gs->pw_elm_list) free((void*) gs->pw_elm_list);
909:     if (gs->node_list) {
910:       for (i=0;i<gs->num_pairs;i++) {
911:         if (gs->node_list[i])  {
912:           free((void*) gs->node_list[i]);
913:         }
914:       }
915:       free((void*) gs->node_list);
916:     }
917:     if (gs->msg_sizes) free((void*) gs->msg_sizes);
918:     if (gs->pair_list) free((void*) gs->pair_list);
919:   }

921:   /* local info */
922:   if (gs->num_local_total>=0) {
923:     for (i=0;i<gs->num_local_total+1;i++) {
924:       if (gs->num_gop_local_reduce[i]) free((void*) gs->gop_local_reduce[i]);
925:     }
926:   }

928:   /* if intersection tree/pairwise and local isn't empty */
929:   if (gs->gop_local_reduce) free((void*) gs->gop_local_reduce);
930:   if (gs->num_gop_local_reduce) free((void*) gs->num_gop_local_reduce);

932:   free((void*) gs);
933:   return 0;
934: }

936: /******************************************************************************/
937: PetscErrorCode PCTFS_gs_gop_vec(PCTFS_gs_id *gs,  PetscScalar *vals,  const char *op,  PetscInt step)
938: {
939:   switch (*op) {
940:   case '+':
941:     PCTFS_gs_gop_vec_plus(gs,vals,step);
942:     break;
943:   default:
944:     PetscInfo(0,"PCTFS_gs_gop_vec() :: %c is not a valid op\n",op[0]);
945:     PetscInfo(0,"PCTFS_gs_gop_vec() :: default :: plus\n");
946:     PCTFS_gs_gop_vec_plus(gs,vals,step);
947:     break;
948:   }
949:   return 0;
950: }

952: /******************************************************************************/
953: static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs,  PetscScalar *vals,  PetscInt step)
954: {

957:   /* local only operations!!! */
958:   if (gs->num_local) PCTFS_gs_gop_vec_local_plus(gs,vals,step);

960:   /* if intersection tree/pairwise and local isn't empty */
961:   if (gs->num_local_gop) {
962:     PCTFS_gs_gop_vec_local_in_plus(gs,vals,step);

964:     /* pairwise */
965:     if (gs->num_pairs) PCTFS_gs_gop_vec_pairwise_plus(gs,vals,step);

967:     /* tree */
968:     else if (gs->max_left_over) PCTFS_gs_gop_vec_tree_plus(gs,vals,step);

970:     PCTFS_gs_gop_vec_local_out(gs,vals,step);
971:   } else { /* if intersection tree/pairwise and local is empty */
972:     /* pairwise */
973:     if (gs->num_pairs) PCTFS_gs_gop_vec_pairwise_plus(gs,vals,step);

975:     /* tree */
976:     else if (gs->max_left_over) PCTFS_gs_gop_vec_tree_plus(gs,vals,step);
977:   }
978:   return 0;
979: }

981: /******************************************************************************/
982: static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs,  PetscScalar *vals, PetscInt step)
983: {
984:   PetscInt    *num, *map, **reduce;
985:   PetscScalar *base;

987:   num    = gs->num_local_reduce;
988:   reduce = gs->local_reduce;
989:   while ((map = *reduce)) {
990:     base = vals + map[0] * step;

992:     /* wall */
993:     if (*num == 2) {
994:       num++; reduce++;
995:       PCTFS_rvec_add (base,vals+map[1]*step,step);
996:       PCTFS_rvec_copy(vals+map[1]*step,base,step);
997:     } else if (*num == 3) { /* corner shared by three elements */
998:       num++; reduce++;
999:       PCTFS_rvec_add (base,vals+map[1]*step,step);
1000:       PCTFS_rvec_add (base,vals+map[2]*step,step);
1001:       PCTFS_rvec_copy(vals+map[2]*step,base,step);
1002:       PCTFS_rvec_copy(vals+map[1]*step,base,step);
1003:     } else if (*num == 4) { /* corner shared by four elements */
1004:       num++; reduce++;
1005:       PCTFS_rvec_add (base,vals+map[1]*step,step);
1006:       PCTFS_rvec_add (base,vals+map[2]*step,step);
1007:       PCTFS_rvec_add (base,vals+map[3]*step,step);
1008:       PCTFS_rvec_copy(vals+map[3]*step,base,step);
1009:       PCTFS_rvec_copy(vals+map[2]*step,base,step);
1010:       PCTFS_rvec_copy(vals+map[1]*step,base,step);
1011:     } else { /* general case ... odd geoms ... 3D */
1012:       num++;
1013:       while (*++map >= 0) PCTFS_rvec_add (base,vals+*map*step,step);

1015:       map = *reduce;
1016:       while (*++map >= 0) PCTFS_rvec_copy(vals+*map*step,base,step);

1018:       reduce++;
1019:     }
1020:   }
1021:   return 0;
1022: }

1024: /******************************************************************************/
1025: static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs,  PetscScalar *vals, PetscInt step)
1026: {
1027:   PetscInt    *num, *map, **reduce;
1028:   PetscScalar *base;

1030:   num    = gs->num_gop_local_reduce;
1031:   reduce = gs->gop_local_reduce;
1032:   while ((map = *reduce++)) {
1033:     base = vals + map[0] * step;

1035:     /* wall */
1036:     if (*num == 2) {
1037:       num++;
1038:       PCTFS_rvec_add(base,vals+map[1]*step,step);
1039:     } else if (*num == 3) { /* corner shared by three elements */
1040:       num++;
1041:       PCTFS_rvec_add(base,vals+map[1]*step,step);
1042:       PCTFS_rvec_add(base,vals+map[2]*step,step);
1043:     } else if (*num == 4) { /* corner shared by four elements */
1044:       num++;
1045:       PCTFS_rvec_add(base,vals+map[1]*step,step);
1046:       PCTFS_rvec_add(base,vals+map[2]*step,step);
1047:       PCTFS_rvec_add(base,vals+map[3]*step,step);
1048:     } else { /* general case ... odd geoms ... 3D*/
1049:       num++;
1050:       while (*++map >= 0) PCTFS_rvec_add(base,vals+*map*step,step);
1051:     }
1052:   }
1053:   return 0;
1054: }

1056: /******************************************************************************/
1057: static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs,  PetscScalar *vals, PetscInt step)
1058: {
1059:   PetscInt    *num, *map, **reduce;
1060:   PetscScalar *base;

1062:   num    = gs->num_gop_local_reduce;
1063:   reduce = gs->gop_local_reduce;
1064:   while ((map = *reduce++)) {
1065:     base = vals + map[0] * step;

1067:     /* wall */
1068:     if (*num == 2) {
1069:       num++;
1070:       PCTFS_rvec_copy(vals+map[1]*step,base,step);
1071:     } else if (*num == 3) { /* corner shared by three elements */
1072:       num++;
1073:       PCTFS_rvec_copy(vals+map[1]*step,base,step);
1074:       PCTFS_rvec_copy(vals+map[2]*step,base,step);
1075:     } else if (*num == 4) { /* corner shared by four elements */
1076:       num++;
1077:       PCTFS_rvec_copy(vals+map[1]*step,base,step);
1078:       PCTFS_rvec_copy(vals+map[2]*step,base,step);
1079:       PCTFS_rvec_copy(vals+map[3]*step,base,step);
1080:     } else { /* general case ... odd geoms ... 3D*/
1081:       num++;
1082:       while (*++map >= 0) PCTFS_rvec_copy(vals+*map*step,base,step);
1083:     }
1084:   }
1085:   return 0;
1086: }

1088: /******************************************************************************/
1089: static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs,  PetscScalar *in_vals, PetscInt step)
1090: {
1091:   PetscScalar    *dptr1, *dptr2, *dptr3, *in1, *in2;
1092:   PetscInt       *iptr, *msg_list, *msg_size, **msg_nodes;
1093:   PetscInt       *pw, *list, *size, **nodes;
1094:   MPI_Request    *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
1095:   MPI_Status     status;
1096:   PetscBLASInt   i1 = 1,dstep;

1098:   /* strip and load s */
1099:   msg_list    = list     = gs->pair_list;
1100:   msg_size    = size     = gs->msg_sizes;
1101:   msg_nodes   = nodes    = gs->node_list;
1102:   iptr        = pw       = gs->pw_elm_list;
1103:   dptr1       = dptr3    = gs->pw_vals;
1104:   msg_ids_in  = ids_in   = gs->msg_ids_in;
1105:   msg_ids_out = ids_out  = gs->msg_ids_out;
1106:   dptr2                  = gs->out;
1107:   in1=in2                = gs->in;

1109:   /* post the receives */
1110:   /*  msg_nodes=nodes; */
1111:   do {
1112:     /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
1113:         second one *list and do list++ afterwards */
1114:     MPI_Irecv(in1, *size *step, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in);
1115:     list++;msg_ids_in++;
1116:     in1 += *size++ *step;
1117:   } while (*++msg_nodes);
1118:   msg_nodes=nodes;

1120:   /* load gs values into in out gs buffers */
1121:   while (*iptr >= 0) {
1122:     PCTFS_rvec_copy(dptr3,in_vals + *iptr*step,step);
1123:     dptr3+=step;
1124:     iptr++;
1125:   }

1127:   /* load out buffers and post the sends */
1128:   while ((iptr = *msg_nodes++)) {
1129:     dptr3 = dptr2;
1130:     while (*iptr >= 0) {
1131:       PCTFS_rvec_copy(dptr2,dptr1 + *iptr*step,step);
1132:       dptr2+=step;
1133:       iptr++;
1134:     }
1135:     MPI_Isend(dptr3, *msg_size *step, MPIU_SCALAR, *msg_list, MSGTAG1+PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out);
1136:     msg_size++; msg_list++;msg_ids_out++;
1137:   }

1139:   /* tree */
1140:   if (gs->max_left_over) PCTFS_gs_gop_vec_tree_plus(gs,in_vals,step);

1142:   /* process the received data */
1143:   msg_nodes=nodes;
1144:   while ((iptr = *nodes++)) {
1145:     PetscScalar d1 = 1.0;

1147:     /* Should I check the return value of MPI_Wait() or status? */
1148:     /* Can this loop be replaced by a call to MPI_Waitall()? */
1149:     MPI_Wait(ids_in, &status);
1150:     ids_in++;
1151:     while (*iptr >= 0) {
1152:       PetscBLASIntCast(step,&dstep);
1153:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&dstep,&d1,in2,&i1,dptr1 + *iptr*step,&i1));
1154:       in2+=step;
1155:       iptr++;
1156:     }
1157:   }

1159:   /* replace vals */
1160:   while (*pw >= 0) {
1161:     PCTFS_rvec_copy(in_vals + *pw*step,dptr1,step);
1162:     dptr1+=step;
1163:     pw++;
1164:   }

1166:   /* clear isend message handles */
1167:   /* This changed for clarity though it could be the same */

1169:   /* Should I check the return value of MPI_Wait() or status? */
1170:   /* Can this loop be replaced by a call to MPI_Waitall()? */
1171:   while (*msg_nodes++) {
1172:     MPI_Wait(ids_out, &status);
1173:     ids_out++;
1174:   }
1175:   return 0;
1176: }

1178: /******************************************************************************/
1179: static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs,  PetscScalar *vals,  PetscInt step)
1180: {
1181:   PetscInt       size, *in, *out;
1182:   PetscScalar    *buf, *work;
1183:   PetscInt       op[] = {GL_ADD,0};
1184:   PetscBLASInt   i1   = 1;
1185:   PetscBLASInt   dstep;

1187:   /* copy over to local variables */
1188:   in   = gs->tree_map_in;
1189:   out  = gs->tree_map_out;
1190:   buf  = gs->tree_buf;
1191:   work = gs->tree_work;
1192:   size = gs->tree_nel*step;

1194:   /* zero out collection buffer */
1195:   PCTFS_rvec_zero(buf,size);

1197:   /* copy over my contributions */
1198:   while (*in >= 0) {
1199:     PetscBLASIntCast(step,&dstep);
1200:     PetscStackCallBLAS("BLAScopy",BLAScopy_(&dstep,vals + *in++ * step,&i1,buf + *out++ * step,&i1));
1201:   }

1203:   /* perform fan in/out on full buffer */
1204:   /* must change PCTFS_grop to handle the blas */
1205:   PCTFS_grop(buf,work,size,op);

1207:   /* reset */
1208:   in  = gs->tree_map_in;
1209:   out = gs->tree_map_out;

1211:   /* get the portion of the results I need */
1212:   while (*in >= 0) {
1213:     PetscBLASIntCast(step,&dstep);
1214:     PetscStackCallBLAS("BLAScopy",BLAScopy_(&dstep,buf + *out++ * step,&i1,vals + *in++ * step,&i1));
1215:   }
1216:   return 0;
1217: }

1219: /******************************************************************************/
1220: PetscErrorCode PCTFS_gs_gop_hc(PCTFS_gs_id *gs,  PetscScalar *vals,  const char *op,  PetscInt dim)
1221: {
1222:   switch (*op) {
1223:   case '+':
1224:     PCTFS_gs_gop_plus_hc(gs,vals,dim);
1225:     break;
1226:   default:
1227:     PetscInfo(0,"PCTFS_gs_gop_hc() :: %c is not a valid op\n",op[0]);
1228:     PetscInfo(0,"PCTFS_gs_gop_hc() :: default :: plus\n");
1229:     PCTFS_gs_gop_plus_hc(gs,vals,dim);
1230:     break;
1231:   }
1232:   return 0;
1233: }

1235: /******************************************************************************/
1236: static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs,  PetscScalar *vals, PetscInt dim)
1237: {
1238:   /* if there's nothing to do return */
1239:   if (dim<=0) return 0;

1241:   /* can't do more dimensions then exist */
1242:   dim = PetscMin(dim,PCTFS_i_log2_num_nodes);

1244:   /* local only operations!!! */
1245:   if (gs->num_local) PCTFS_gs_gop_local_plus(gs,vals);

1247:   /* if intersection tree/pairwise and local isn't empty */
1248:   if (gs->num_local_gop) {
1249:     PCTFS_gs_gop_local_in_plus(gs,vals);

1251:     /* pairwise will do tree inside ... */
1252:     if (gs->num_pairs) PCTFS_gs_gop_pairwise_plus_hc(gs,vals,dim); /* tree only */
1253:     else if (gs->max_left_over) PCTFS_gs_gop_tree_plus_hc(gs,vals,dim);

1255:     PCTFS_gs_gop_local_out(gs,vals);
1256:   } else { /* if intersection tree/pairwise and local is empty */
1257:     /* pairwise will do tree inside */
1258:     if (gs->num_pairs) PCTFS_gs_gop_pairwise_plus_hc(gs,vals,dim); /* tree */
1259:     else if (gs->max_left_over) PCTFS_gs_gop_tree_plus_hc(gs,vals,dim);
1260:   }
1261:   return 0;
1262: }

1264: /******************************************************************************/
1265: static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs,  PetscScalar *in_vals, PetscInt dim)
1266: {
1267:   PetscScalar    *dptr1, *dptr2, *dptr3, *in1, *in2;
1268:   PetscInt       *iptr, *msg_list, *msg_size, **msg_nodes;
1269:   PetscInt       *pw, *list, *size, **nodes;
1270:   MPI_Request    *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
1271:   MPI_Status     status;
1272:   PetscInt       i, mask=1;

1274:   for (i=1; i<dim; i++) { mask<<=1; mask++; }

1276:   /* strip and load s */
1277:   msg_list    = list     = gs->pair_list;
1278:   msg_size    = size     = gs->msg_sizes;
1279:   msg_nodes   = nodes    = gs->node_list;
1280:   iptr        = pw       = gs->pw_elm_list;
1281:   dptr1       = dptr3    = gs->pw_vals;
1282:   msg_ids_in  = ids_in   = gs->msg_ids_in;
1283:   msg_ids_out = ids_out  = gs->msg_ids_out;
1284:   dptr2       = gs->out;
1285:   in1         = in2      = gs->in;

1287:   /* post the receives */
1288:   /*  msg_nodes=nodes; */
1289:   do {
1290:     /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
1291:         second one *list and do list++ afterwards */
1292:     if ((PCTFS_my_id|mask)==(*list|mask)) {
1293:       MPI_Irecv(in1, *size, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in);
1294:       list++; msg_ids_in++;in1 += *size++;
1295:     } else { list++; size++; }
1296:   } while (*++msg_nodes);

1298:   /* load gs values into in out gs buffers */
1299:   while (*iptr >= 0) *dptr3++ = *(in_vals + *iptr++);

1301:   /* load out buffers and post the sends */
1302:   msg_nodes=nodes;
1303:   list     = msg_list;
1304:   while ((iptr = *msg_nodes++)) {
1305:     if ((PCTFS_my_id|mask)==(*list|mask)) {
1306:       dptr3 = dptr2;
1307:       while (*iptr >= 0) *dptr2++ = *(dptr1 + *iptr++);
1308:       /* CHECK PERSISTENT COMMS MODE FOR ALL THIS STUFF */
1309:       /* is msg_ids_out++ correct? */
1310:       MPI_Isend(dptr3, *msg_size, MPIU_SCALAR, *list, MSGTAG1+PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out);
1311:       msg_size++;list++;msg_ids_out++;
1312:     } else {list++; msg_size++;}
1313:   }

1315:   /* do the tree while we're waiting */
1316:   if (gs->max_left_over) PCTFS_gs_gop_tree_plus_hc(gs,in_vals,dim);

1318:   /* process the received data */
1319:   msg_nodes=nodes;
1320:   list     = msg_list;
1321:   while ((iptr = *nodes++)) {
1322:     if ((PCTFS_my_id|mask)==(*list|mask)) {
1323:       /* Should I check the return value of MPI_Wait() or status? */
1324:       /* Can this loop be replaced by a call to MPI_Waitall()? */
1325:       MPI_Wait(ids_in, &status);
1326:       ids_in++;
1327:       while (*iptr >= 0) *(dptr1 + *iptr++) += *in2++;
1328:     }
1329:     list++;
1330:   }

1332:   /* replace vals */
1333:   while (*pw >= 0) *(in_vals + *pw++) = *dptr1++;

1335:   /* clear isend message handles */
1336:   /* This changed for clarity though it could be the same */
1337:   while (*msg_nodes++) {
1338:     if ((PCTFS_my_id|mask)==(*msg_list|mask)) {
1339:       /* Should I check the return value of MPI_Wait() or status? */
1340:       /* Can this loop be replaced by a call to MPI_Waitall()? */
1341:       MPI_Wait(ids_out, &status);
1342:       ids_out++;
1343:     }
1344:     msg_list++;
1345:   }
1346:   return 0;
1347: }

1349: /******************************************************************************/
1350: static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim)
1351: {
1352:   PetscInt    size;
1353:   PetscInt    *in, *out;
1354:   PetscScalar *buf, *work;
1355:   PetscInt    op[] = {GL_ADD,0};

1357:   in   = gs->tree_map_in;
1358:   out  = gs->tree_map_out;
1359:   buf  = gs->tree_buf;
1360:   work = gs->tree_work;
1361:   size = gs->tree_nel;

1363:   PCTFS_rvec_zero(buf,size);

1365:   while (*in >= 0) *(buf + *out++) = *(vals + *in++);

1367:   in  = gs->tree_map_in;
1368:   out = gs->tree_map_out;

1370:   PCTFS_grop_hc(buf,work,size,op,dim);

1372:   while (*in >= 0) *(vals + *in++) = *(buf + *out++);
1373:   return 0;
1374: }