Actual source code: baijsolv.c

  1: #include <../src/mat/impls/baij/seq/baij.h>
  2: #include <petsc/private/kernels/blockinvert.h>

  4: PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
  5: {
  6:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
  7:   IS                iscol=a->col,isrow=a->row;
  8:   const PetscInt    *r,*c,*rout,*cout;
  9:   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*vi;
 10:   PetscInt          i,nz;
 11:   const PetscInt    bs =A->rmap->bs,bs2=a->bs2;
 12:   const MatScalar   *aa=a->a,*v;
 13:   PetscScalar       *x,*s,*t,*ls;
 14:   const PetscScalar *b;

 16:   VecGetArrayRead(bb,&b);
 17:   VecGetArray(xx,&x);
 18:   t    = a->solve_work;

 20:   ISGetIndices(isrow,&rout); r = rout;
 21:   ISGetIndices(iscol,&cout)); c = cout + (n-1;

 23:   /* forward solve the lower triangular */
 24:   PetscArraycpy(t,b+bs*(*r++),bs);
 25:   for (i=1; i<n; i++) {
 26:     v    = aa + bs2*ai[i];
 27:     vi   = aj + ai[i];
 28:     nz   = a->diag[i] - ai[i];
 29:     s    = t + bs*i;
 30:     PetscArraycpy(s,b+bs*(*r++),bs);
 31:     while (nz--) {
 32:       PetscKernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
 33:       v += bs2;
 34:     }
 35:   }
 36:   /* backward solve the upper triangular */
 37:   ls = a->solve_work + A->cmap->n;
 38:   for (i=n-1; i>=0; i--) {
 39:     v    = aa + bs2*(a->diag[i] + 1);
 40:     vi   = aj + a->diag[i] + 1;
 41:     nz   = ai[i+1] - a->diag[i] - 1;
 42:     PetscArraycpy(ls,t+i*bs,bs);
 43:     while (nz--) {
 44:       PetscKernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
 45:       v += bs2;
 46:     }
 47:     PetscKernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
 48:     PetscArraycpy(x + bs*(*c--),t+i*bs,bs);
 49:   }

 51:   ISRestoreIndices(isrow,&rout);
 52:   ISRestoreIndices(iscol,&cout);
 53:   VecRestoreArrayRead(bb,&b);
 54:   VecRestoreArray(xx,&x);
 55:   PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);
 56:   return 0;
 57: }

 59: PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
 60: {
 61:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
 62:   IS                iscol=a->col,isrow=a->row;
 63:   const PetscInt    *r,*c,*ai=a->i,*aj=a->j;
 64:   const PetscInt    *rout,*cout,*diag = a->diag,*vi,n=a->mbs;
 65:   PetscInt          i,nz,idx,idt,idc;
 66:   const MatScalar   *aa=a->a,*v;
 67:   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
 68:   const PetscScalar *b;

 70:   VecGetArrayRead(bb,&b);
 71:   VecGetArray(xx,&x);
 72:   t    = a->solve_work;

 74:   ISGetIndices(isrow,&rout); r = rout;
 75:   ISGetIndices(iscol,&cout)); c = cout + (n-1;

 77:   /* forward solve the lower triangular */
 78:   idx  = 7*(*r++);
 79:   t[0] = b[idx];   t[1] = b[1+idx];
 80:   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
 81:   t[5] = b[5+idx]; t[6] = b[6+idx];

 83:   for (i=1; i<n; i++) {
 84:     v   = aa + 49*ai[i];
 85:     vi  = aj + ai[i];
 86:     nz  = diag[i] - ai[i];
 87:     idx = 7*(*r++);
 88:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
 89:     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
 90:     while (nz--) {
 91:       idx = 7*(*vi++);
 92:       x1  = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
 93:       x4  = t[3+idx];x5 = t[4+idx];
 94:       x6  = t[5+idx];x7 = t[6+idx];
 95:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
 96:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
 97:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
 98:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
 99:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
100:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
101:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
102:       v  += 49;
103:     }
104:     idx      = 7*i;
105:     t[idx]   = s1;t[1+idx] = s2;
106:     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
107:     t[5+idx] = s6;t[6+idx] = s7;
108:   }
109:   /* backward solve the upper triangular */
110:   for (i=n-1; i>=0; i--) {
111:     v   = aa + 49*diag[i] + 49;
112:     vi  = aj + diag[i] + 1;
113:     nz  = ai[i+1] - diag[i] - 1;
114:     idt = 7*i;
115:     s1  = t[idt];  s2 = t[1+idt];
116:     s3  = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
117:     s6  = t[5+idt];s7 = t[6+idt];
118:     while (nz--) {
119:       idx = 7*(*vi++);
120:       x1  = t[idx];   x2 = t[1+idx];
121:       x3  = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
122:       x6  = t[5+idx]; x7 = t[6+idx];
123:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
124:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
125:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
126:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
127:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
128:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
129:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
130:       v  += 49;
131:     }
132:     idc    = 7*(*c--);
133:     v      = aa + 49*diag[i];
134:     x[idc] = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
135:                         v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
136:     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
137:                           v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
138:     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
139:                           v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
140:     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
141:                           v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
142:     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
143:                           v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
144:     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
145:                           v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
146:     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
147:                           v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
148:   }

150:   ISRestoreIndices(isrow,&rout);
151:   ISRestoreIndices(iscol,&cout);
152:   VecRestoreArrayRead(bb,&b);
153:   VecRestoreArray(xx,&x);
154:   PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);
155:   return 0;
156: }

158: PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
159: {
160:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
161:   IS                iscol=a->col,isrow=a->row;
162:   const PetscInt    *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag;
163:   const PetscInt    n=a->mbs,*rout,*cout,*vi;
164:   PetscInt          i,nz,idx,idt,idc,m;
165:   const MatScalar   *aa=a->a,*v;
166:   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
167:   const PetscScalar *b;

169:   VecGetArrayRead(bb,&b);
170:   VecGetArray(xx,&x);
171:   t    = a->solve_work;

173:   ISGetIndices(isrow,&rout); r = rout;
174:   ISGetIndices(iscol,&cout); c = cout;

176:   /* forward solve the lower triangular */
177:   idx  = 7*r[0];
178:   t[0] = b[idx];   t[1] = b[1+idx];
179:   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
180:   t[5] = b[5+idx]; t[6] = b[6+idx];

182:   for (i=1; i<n; i++) {
183:     v   = aa + 49*ai[i];
184:     vi  = aj + ai[i];
185:     nz  = ai[i+1] - ai[i];
186:     idx = 7*r[i];
187:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
188:     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
189:     for (m=0; m<nz; m++) {
190:       idx = 7*vi[m];
191:       x1  = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
192:       x4  = t[3+idx];x5 = t[4+idx];
193:       x6  = t[5+idx];x7 = t[6+idx];
194:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
195:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
196:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
197:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
198:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
199:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
200:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
201:       v  += 49;
202:     }
203:     idx      = 7*i;
204:     t[idx]   = s1;t[1+idx] = s2;
205:     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
206:     t[5+idx] = s6;t[6+idx] = s7;
207:   }
208:   /* backward solve the upper triangular */
209:   for (i=n-1; i>=0; i--) {
210:     v   = aa + 49*(adiag[i+1]+1);
211:     vi  = aj + adiag[i+1]+1;
212:     nz  = adiag[i] - adiag[i+1] - 1;
213:     idt = 7*i;
214:     s1  = t[idt];  s2 = t[1+idt];
215:     s3  = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
216:     s6  = t[5+idt];s7 = t[6+idt];
217:     for (m=0; m<nz; m++) {
218:       idx = 7*vi[m];
219:       x1  = t[idx];   x2 = t[1+idx];
220:       x3  = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
221:       x6  = t[5+idx]; x7 = t[6+idx];
222:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
223:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
224:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
225:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
226:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
227:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
228:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
229:       v  += 49;
230:     }
231:     idc    = 7*c[i];
232:     x[idc] = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
233:                         v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
234:     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
235:                           v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
236:     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
237:                           v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
238:     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
239:                           v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
240:     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
241:                           v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
242:     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
243:                           v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
244:     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
245:                           v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
246:   }

248:   ISRestoreIndices(isrow,&rout);
249:   ISRestoreIndices(iscol,&cout);
250:   VecRestoreArrayRead(bb,&b);
251:   VecRestoreArray(xx,&x);
252:   PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);
253:   return 0;
254: }

256: PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
257: {
258:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
259:   IS                iscol=a->col,isrow=a->row;
260:   const PetscInt    *r,*c,*rout,*cout;
261:   const PetscInt    *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
262:   PetscInt          i,nz,idx,idt,idc;
263:   const MatScalar   *aa=a->a,*v;
264:   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
265:   const PetscScalar *b;

267:   VecGetArrayRead(bb,&b);
268:   VecGetArray(xx,&x);
269:   t    = a->solve_work;

271:   ISGetIndices(isrow,&rout); r = rout;
272:   ISGetIndices(iscol,&cout)); c = cout + (n-1;

274:   /* forward solve the lower triangular */
275:   idx  = 6*(*r++);
276:   t[0] = b[idx];   t[1] = b[1+idx];
277:   t[2] = b[2+idx]; t[3] = b[3+idx];
278:   t[4] = b[4+idx]; t[5] = b[5+idx];
279:   for (i=1; i<n; i++) {
280:     v   = aa + 36*ai[i];
281:     vi  = aj + ai[i];
282:     nz  = diag[i] - ai[i];
283:     idx = 6*(*r++);
284:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
285:     s5  = b[4+idx]; s6 = b[5+idx];
286:     while (nz--) {
287:       idx = 6*(*vi++);
288:       x1  = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
289:       x4  = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
290:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
291:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
292:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
293:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
294:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
295:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
296:       v  += 36;
297:     }
298:     idx      = 6*i;
299:     t[idx]   = s1;t[1+idx] = s2;
300:     t[2+idx] = s3;t[3+idx] = s4;
301:     t[4+idx] = s5;t[5+idx] = s6;
302:   }
303:   /* backward solve the upper triangular */
304:   for (i=n-1; i>=0; i--) {
305:     v   = aa + 36*diag[i] + 36;
306:     vi  = aj + diag[i] + 1;
307:     nz  = ai[i+1] - diag[i] - 1;
308:     idt = 6*i;
309:     s1  = t[idt];  s2 = t[1+idt];
310:     s3  = t[2+idt];s4 = t[3+idt];
311:     s5  = t[4+idt];s6 = t[5+idt];
312:     while (nz--) {
313:       idx = 6*(*vi++);
314:       x1  = t[idx];   x2 = t[1+idx];
315:       x3  = t[2+idx]; x4 = t[3+idx];
316:       x5  = t[4+idx]; x6 = t[5+idx];
317:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
318:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
319:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
320:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
321:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
322:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
323:       v  += 36;
324:     }
325:     idc    = 6*(*c--);
326:     v      = aa + 36*diag[i];
327:     x[idc] = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
328:                         v[18]*s4+v[24]*s5+v[30]*s6;
329:     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
330:                           v[19]*s4+v[25]*s5+v[31]*s6;
331:     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
332:                           v[20]*s4+v[26]*s5+v[32]*s6;
333:     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
334:                           v[21]*s4+v[27]*s5+v[33]*s6;
335:     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
336:                           v[22]*s4+v[28]*s5+v[34]*s6;
337:     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
338:                           v[23]*s4+v[29]*s5+v[35]*s6;
339:   }

341:   ISRestoreIndices(isrow,&rout);
342:   ISRestoreIndices(iscol,&cout);
343:   VecRestoreArrayRead(bb,&b);
344:   VecRestoreArray(xx,&x);
345:   PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
346:   return 0;
347: }

349: PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
350: {
351:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
352:   IS                iscol=a->col,isrow=a->row;
353:   const PetscInt    *r,*c,*rout,*cout;
354:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
355:   PetscInt          i,nz,idx,idt,idc,m;
356:   const MatScalar   *aa=a->a,*v;
357:   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
358:   const PetscScalar *b;

360:   VecGetArrayRead(bb,&b);
361:   VecGetArray(xx,&x);
362:   t    = a->solve_work;

364:   ISGetIndices(isrow,&rout); r = rout;
365:   ISGetIndices(iscol,&cout); c = cout;

367:   /* forward solve the lower triangular */
368:   idx  = 6*r[0];
369:   t[0] = b[idx];   t[1] = b[1+idx];
370:   t[2] = b[2+idx]; t[3] = b[3+idx];
371:   t[4] = b[4+idx]; t[5] = b[5+idx];
372:   for (i=1; i<n; i++) {
373:     v   = aa + 36*ai[i];
374:     vi  = aj + ai[i];
375:     nz  = ai[i+1] - ai[i];
376:     idx = 6*r[i];
377:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
378:     s5  = b[4+idx]; s6 = b[5+idx];
379:     for (m=0; m<nz; m++) {
380:       idx = 6*vi[m];
381:       x1  = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
382:       x4  = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
383:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
384:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
385:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
386:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
387:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
388:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
389:       v  += 36;
390:     }
391:     idx      = 6*i;
392:     t[idx]   = s1;t[1+idx] = s2;
393:     t[2+idx] = s3;t[3+idx] = s4;
394:     t[4+idx] = s5;t[5+idx] = s6;
395:   }
396:   /* backward solve the upper triangular */
397:   for (i=n-1; i>=0; i--) {
398:     v   = aa + 36*(adiag[i+1]+1);
399:     vi  = aj + adiag[i+1]+1;
400:     nz  = adiag[i] - adiag[i+1] - 1;
401:     idt = 6*i;
402:     s1  = t[idt];  s2 = t[1+idt];
403:     s3  = t[2+idt];s4 = t[3+idt];
404:     s5  = t[4+idt];s6 = t[5+idt];
405:     for (m=0; m<nz; m++) {
406:       idx = 6*vi[m];
407:       x1  = t[idx];   x2 = t[1+idx];
408:       x3  = t[2+idx]; x4 = t[3+idx];
409:       x5  = t[4+idx]; x6 = t[5+idx];
410:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
411:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
412:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
413:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
414:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
415:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
416:       v  += 36;
417:     }
418:     idc    = 6*c[i];
419:     x[idc] = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
420:                         v[18]*s4+v[24]*s5+v[30]*s6;
421:     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
422:                           v[19]*s4+v[25]*s5+v[31]*s6;
423:     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
424:                           v[20]*s4+v[26]*s5+v[32]*s6;
425:     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
426:                           v[21]*s4+v[27]*s5+v[33]*s6;
427:     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
428:                           v[22]*s4+v[28]*s5+v[34]*s6;
429:     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
430:                           v[23]*s4+v[29]*s5+v[35]*s6;
431:   }

433:   ISRestoreIndices(isrow,&rout);
434:   ISRestoreIndices(iscol,&cout);
435:   VecRestoreArrayRead(bb,&b);
436:   VecRestoreArray(xx,&x);
437:   PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
438:   return 0;
439: }

441: PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
442: {
443:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
444:   IS                iscol=a->col,isrow=a->row;
445:   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
446:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
447:   PetscInt          i,nz,idx,idt,idc;
448:   const MatScalar   *aa=a->a,*v;
449:   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
450:   const PetscScalar *b;

452:   VecGetArrayRead(bb,&b);
453:   VecGetArray(xx,&x);
454:   t    = a->solve_work;

456:   ISGetIndices(isrow,&rout); r = rout;
457:   ISGetIndices(iscol,&cout)); c = cout + (n-1;

459:   /* forward solve the lower triangular */
460:   idx  = 5*(*r++);
461:   t[0] = b[idx];   t[1] = b[1+idx];
462:   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
463:   for (i=1; i<n; i++) {
464:     v   = aa + 25*ai[i];
465:     vi  = aj + ai[i];
466:     nz  = diag[i] - ai[i];
467:     idx = 5*(*r++);
468:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
469:     s5  = b[4+idx];
470:     while (nz--) {
471:       idx = 5*(*vi++);
472:       x1  = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
473:       x4  = t[3+idx];x5 = t[4+idx];
474:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
475:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
476:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
477:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
478:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
479:       v  += 25;
480:     }
481:     idx      = 5*i;
482:     t[idx]   = s1;t[1+idx] = s2;
483:     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
484:   }
485:   /* backward solve the upper triangular */
486:   for (i=n-1; i>=0; i--) {
487:     v   = aa + 25*diag[i] + 25;
488:     vi  = aj + diag[i] + 1;
489:     nz  = ai[i+1] - diag[i] - 1;
490:     idt = 5*i;
491:     s1  = t[idt];  s2 = t[1+idt];
492:     s3  = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
493:     while (nz--) {
494:       idx = 5*(*vi++);
495:       x1  = t[idx];   x2 = t[1+idx];
496:       x3  = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
497:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
498:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
499:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
500:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
501:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
502:       v  += 25;
503:     }
504:     idc    = 5*(*c--);
505:     v      = aa + 25*diag[i];
506:     x[idc] = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
507:                         v[15]*s4+v[20]*s5;
508:     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
509:                           v[16]*s4+v[21]*s5;
510:     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
511:                           v[17]*s4+v[22]*s5;
512:     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
513:                           v[18]*s4+v[23]*s5;
514:     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
515:                           v[19]*s4+v[24]*s5;
516:   }

518:   ISRestoreIndices(isrow,&rout);
519:   ISRestoreIndices(iscol,&cout);
520:   VecRestoreArrayRead(bb,&b);
521:   VecRestoreArray(xx,&x);
522:   PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
523:   return 0;
524: }

526: PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
527: {
528:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
529:   IS                iscol=a->col,isrow=a->row;
530:   const PetscInt    *r,*c,*rout,*cout;
531:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
532:   PetscInt          i,nz,idx,idt,idc,m;
533:   const MatScalar   *aa=a->a,*v;
534:   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
535:   const PetscScalar *b;

537:   VecGetArrayRead(bb,&b);
538:   VecGetArray(xx,&x);
539:   t    = a->solve_work;

541:   ISGetIndices(isrow,&rout); r = rout;
542:   ISGetIndices(iscol,&cout); c = cout;

544:   /* forward solve the lower triangular */
545:   idx  = 5*r[0];
546:   t[0] = b[idx];   t[1] = b[1+idx];
547:   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
548:   for (i=1; i<n; i++) {
549:     v   = aa + 25*ai[i];
550:     vi  = aj + ai[i];
551:     nz  = ai[i+1] - ai[i];
552:     idx = 5*r[i];
553:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
554:     s5  = b[4+idx];
555:     for (m=0; m<nz; m++) {
556:       idx = 5*vi[m];
557:       x1  = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
558:       x4  = t[3+idx];x5 = t[4+idx];
559:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
560:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
561:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
562:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
563:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
564:       v  += 25;
565:     }
566:     idx      = 5*i;
567:     t[idx]   = s1;t[1+idx] = s2;
568:     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
569:   }
570:   /* backward solve the upper triangular */
571:   for (i=n-1; i>=0; i--) {
572:     v   = aa + 25*(adiag[i+1]+1);
573:     vi  = aj + adiag[i+1]+1;
574:     nz  = adiag[i] - adiag[i+1] - 1;
575:     idt = 5*i;
576:     s1  = t[idt];  s2 = t[1+idt];
577:     s3  = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
578:     for (m=0; m<nz; m++) {
579:       idx = 5*vi[m];
580:       x1  = t[idx];   x2 = t[1+idx];
581:       x3  = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
582:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
583:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
584:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
585:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
586:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
587:       v  += 25;
588:     }
589:     idc    = 5*c[i];
590:     x[idc] = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
591:                         v[15]*s4+v[20]*s5;
592:     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
593:                           v[16]*s4+v[21]*s5;
594:     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
595:                           v[17]*s4+v[22]*s5;
596:     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
597:                           v[18]*s4+v[23]*s5;
598:     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
599:                           v[19]*s4+v[24]*s5;
600:   }

602:   ISRestoreIndices(isrow,&rout);
603:   ISRestoreIndices(iscol,&cout);
604:   VecRestoreArrayRead(bb,&b);
605:   VecRestoreArray(xx,&x);
606:   PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
607:   return 0;
608: }

610: PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
611: {
612:   Mat_SeqBAIJ       *a   = (Mat_SeqBAIJ*)A->data;
613:   IS                iscol=a->col,isrow=a->row;
614:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
615:   PetscInt          i,nz,idx,idt,idc;
616:   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
617:   const MatScalar   *aa=a->a,*v;
618:   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
619:   const PetscScalar *b;

621:   VecGetArrayRead(bb,&b);
622:   VecGetArray(xx,&x);
623:   t    = a->solve_work;

625:   ISGetIndices(isrow,&rout); r = rout;
626:   ISGetIndices(iscol,&cout)); c = cout + (n-1;

628:   /* forward solve the lower triangular */
629:   idx  = 4*(*r++);
630:   t[0] = b[idx];   t[1] = b[1+idx];
631:   t[2] = b[2+idx]; t[3] = b[3+idx];
632:   for (i=1; i<n; i++) {
633:     v   = aa + 16*ai[i];
634:     vi  = aj + ai[i];
635:     nz  = diag[i] - ai[i];
636:     idx = 4*(*r++);
637:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
638:     while (nz--) {
639:       idx = 4*(*vi++);
640:       x1  = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
641:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
642:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
643:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
644:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
645:       v  += 16;
646:     }
647:     idx      = 4*i;
648:     t[idx]   = s1;t[1+idx] = s2;
649:     t[2+idx] = s3;t[3+idx] = s4;
650:   }
651:   /* backward solve the upper triangular */
652:   for (i=n-1; i>=0; i--) {
653:     v   = aa + 16*diag[i] + 16;
654:     vi  = aj + diag[i] + 1;
655:     nz  = ai[i+1] - diag[i] - 1;
656:     idt = 4*i;
657:     s1  = t[idt];  s2 = t[1+idt];
658:     s3  = t[2+idt];s4 = t[3+idt];
659:     while (nz--) {
660:       idx = 4*(*vi++);
661:       x1  = t[idx];   x2 = t[1+idx];
662:       x3  = t[2+idx]; x4 = t[3+idx];
663:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
664:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
665:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
666:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
667:       v  += 16;
668:     }
669:     idc      = 4*(*c--);
670:     v        = aa + 16*diag[i];
671:     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
672:     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
673:     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
674:     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
675:   }

677:   ISRestoreIndices(isrow,&rout);
678:   ISRestoreIndices(iscol,&cout);
679:   VecRestoreArrayRead(bb,&b);
680:   VecRestoreArray(xx,&x);
681:   PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
682:   return 0;
683: }

685: PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
686: {
687:   Mat_SeqBAIJ       *a   = (Mat_SeqBAIJ*)A->data;
688:   IS                iscol=a->col,isrow=a->row;
689:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
690:   PetscInt          i,nz,idx,idt,idc,m;
691:   const PetscInt    *r,*c,*rout,*cout;
692:   const MatScalar   *aa=a->a,*v;
693:   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
694:   const PetscScalar *b;

696:   VecGetArrayRead(bb,&b);
697:   VecGetArray(xx,&x);
698:   t    = a->solve_work;

700:   ISGetIndices(isrow,&rout); r = rout;
701:   ISGetIndices(iscol,&cout); c = cout;

703:   /* forward solve the lower triangular */
704:   idx  = 4*r[0];
705:   t[0] = b[idx];   t[1] = b[1+idx];
706:   t[2] = b[2+idx]; t[3] = b[3+idx];
707:   for (i=1; i<n; i++) {
708:     v   = aa + 16*ai[i];
709:     vi  = aj + ai[i];
710:     nz  = ai[i+1] - ai[i];
711:     idx = 4*r[i];
712:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
713:     for (m=0; m<nz; m++) {
714:       idx = 4*vi[m];
715:       x1  = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
716:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
717:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
718:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
719:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
720:       v  += 16;
721:     }
722:     idx      = 4*i;
723:     t[idx]   = s1;t[1+idx] = s2;
724:     t[2+idx] = s3;t[3+idx] = s4;
725:   }
726:   /* backward solve the upper triangular */
727:   for (i=n-1; i>=0; i--) {
728:     v   = aa + 16*(adiag[i+1]+1);
729:     vi  = aj + adiag[i+1]+1;
730:     nz  = adiag[i] - adiag[i+1] - 1;
731:     idt = 4*i;
732:     s1  = t[idt];  s2 = t[1+idt];
733:     s3  = t[2+idt];s4 = t[3+idt];
734:     for (m=0; m<nz; m++) {
735:       idx = 4*vi[m];
736:       x1  = t[idx];   x2 = t[1+idx];
737:       x3  = t[2+idx]; x4 = t[3+idx];
738:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
739:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
740:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
741:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
742:       v  += 16;
743:     }
744:     idc      = 4*c[i];
745:     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
746:     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
747:     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
748:     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
749:   }

751:   ISRestoreIndices(isrow,&rout);
752:   ISRestoreIndices(iscol,&cout);
753:   VecRestoreArrayRead(bb,&b);
754:   VecRestoreArray(xx,&x);
755:   PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
756:   return 0;
757: }

759: PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
760: {
761:   Mat_SeqBAIJ       *a   = (Mat_SeqBAIJ*)A->data;
762:   IS                iscol=a->col,isrow=a->row;
763:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
764:   PetscInt          i,nz,idx,idt,idc;
765:   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
766:   const MatScalar   *aa=a->a,*v;
767:   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
768:   PetscScalar       *x;
769:   const PetscScalar *b;

771:   VecGetArrayRead(bb,&b);
772:   VecGetArray(xx,&x);
773:   t    = (MatScalar*)a->solve_work;

775:   ISGetIndices(isrow,&rout); r = rout;
776:   ISGetIndices(iscol,&cout)); c = cout + (n-1;

778:   /* forward solve the lower triangular */
779:   idx  = 4*(*r++);
780:   t[0] = (MatScalar)b[idx];
781:   t[1] = (MatScalar)b[1+idx];
782:   t[2] = (MatScalar)b[2+idx];
783:   t[3] = (MatScalar)b[3+idx];
784:   for (i=1; i<n; i++) {
785:     v   = aa + 16*ai[i];
786:     vi  = aj + ai[i];
787:     nz  = diag[i] - ai[i];
788:     idx = 4*(*r++);
789:     s1  = (MatScalar)b[idx];
790:     s2  = (MatScalar)b[1+idx];
791:     s3  = (MatScalar)b[2+idx];
792:     s4  = (MatScalar)b[3+idx];
793:     while (nz--) {
794:       idx = 4*(*vi++);
795:       x1  = t[idx];
796:       x2  = t[1+idx];
797:       x3  = t[2+idx];
798:       x4  = t[3+idx];
799:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
800:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
801:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
802:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
803:       v  += 16;
804:     }
805:     idx      = 4*i;
806:     t[idx]   = s1;
807:     t[1+idx] = s2;
808:     t[2+idx] = s3;
809:     t[3+idx] = s4;
810:   }
811:   /* backward solve the upper triangular */
812:   for (i=n-1; i>=0; i--) {
813:     v   = aa + 16*diag[i] + 16;
814:     vi  = aj + diag[i] + 1;
815:     nz  = ai[i+1] - diag[i] - 1;
816:     idt = 4*i;
817:     s1  = t[idt];
818:     s2  = t[1+idt];
819:     s3  = t[2+idt];
820:     s4  = t[3+idt];
821:     while (nz--) {
822:       idx = 4*(*vi++);
823:       x1  = t[idx];
824:       x2  = t[1+idx];
825:       x3  = t[2+idx];
826:       x4  = t[3+idx];
827:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
828:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
829:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
830:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
831:       v  += 16;
832:     }
833:     idc      = 4*(*c--);
834:     v        = aa + 16*diag[i];
835:     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
836:     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
837:     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
838:     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
839:     x[idc]   = (PetscScalar)t[idt];
840:     x[1+idc] = (PetscScalar)t[1+idt];
841:     x[2+idc] = (PetscScalar)t[2+idt];
842:     x[3+idc] = (PetscScalar)t[3+idt];
843:   }

845:   ISRestoreIndices(isrow,&rout);
846:   ISRestoreIndices(iscol,&cout);
847:   VecRestoreArrayRead(bb,&b);
848:   VecRestoreArray(xx,&x);
849:   PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
850:   return 0;
851: }

853: #if defined(PETSC_HAVE_SSE)

855: #include PETSC_HAVE_SSE

857: PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
858: {
859:   /*
860:      Note: This code uses demotion of double
861:      to float when performing the mixed-mode computation.
862:      This may not be numerically reasonable for all applications.
863:   */
864:   Mat_SeqBAIJ    *a   = (Mat_SeqBAIJ*)A->data;
865:   IS             iscol=a->col,isrow=a->row;
866:   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
867:   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
868:   MatScalar      *aa=a->a,*v;
869:   PetscScalar    *x,*b,*t;

871:   /* Make space in temp stack for 16 Byte Aligned arrays */
872:   float         ssealignedspace[11],*tmps,*tmpx;
873:   unsigned long offset;

875:   SSE_SCOPE_BEGIN;

877:   offset = (unsigned long)ssealignedspace % 16;
878:   if (offset) offset = (16 - offset)/4;
879:   tmps = &ssealignedspace[offset];
880:   tmpx = &ssealignedspace[offset+4];
881:   PREFETCH_NTA(aa+16*ai[1]);

883:   VecGetArray(bb,&b);
884:   VecGetArray(xx,&x);
885:   t    = a->solve_work;

887:   ISGetIndices(isrow,&rout); r = rout;
888:   ISGetIndices(iscol,&cout)); c = cout + (n-1;

890:   /* forward solve the lower triangular */
891:   idx  = 4*(*r++);
892:   t[0] = b[idx];   t[1] = b[1+idx];
893:   t[2] = b[2+idx]; t[3] = b[3+idx];
894:   v    =  aa + 16*ai[1];

896:   for (i=1; i<n;) {
897:     PREFETCH_NTA(&v[8]);
898:     vi  =  aj      + ai[i];
899:     nz  =  diag[i] - ai[i];
900:     idx =  4*(*r++);

902:     /* Demote sum from double to float */
903:     CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
904:     LOAD_PS(tmps,XMM7);

906:     while (nz--) {
907:       PREFETCH_NTA(&v[16]);
908:       idx = 4*(*vi++);

910:       /* Demote solution (so far) from double to float */
911:       CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);

913:       /* 4x4 Matrix-Vector product with negative accumulation: */
914:       SSE_INLINE_BEGIN_2(tmpx,v)
915:       SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

917:       /* First Column */
918:       SSE_COPY_PS(XMM0,XMM6)
919:       SSE_SHUFFLE(XMM0,XMM0,0x00)
920:       SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
921:       SSE_SUB_PS(XMM7,XMM0)

923:       /* Second Column */
924:       SSE_COPY_PS(XMM1,XMM6)
925:       SSE_SHUFFLE(XMM1,XMM1,0x55)
926:       SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
927:       SSE_SUB_PS(XMM7,XMM1)

929:       SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

931:       /* Third Column */
932:       SSE_COPY_PS(XMM2,XMM6)
933:       SSE_SHUFFLE(XMM2,XMM2,0xAA)
934:       SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
935:       SSE_SUB_PS(XMM7,XMM2)

937:       /* Fourth Column */
938:       SSE_COPY_PS(XMM3,XMM6)
939:       SSE_SHUFFLE(XMM3,XMM3,0xFF)
940:       SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
941:       SSE_SUB_PS(XMM7,XMM3)
942:       SSE_INLINE_END_2

944:       v += 16;
945:     }
946:     idx = 4*i;
947:     v   = aa + 16*ai[++i];
948:     PREFETCH_NTA(v);
949:     STORE_PS(tmps,XMM7);

951:     /* Promote result from float to double */
952:     CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
953:   }
954:   /* backward solve the upper triangular */
955:   idt  = 4*(n-1);
956:   ai16 = 16*diag[n-1];
957:   v    = aa + ai16 + 16;
958:   for (i=n-1; i>=0;) {
959:     PREFETCH_NTA(&v[8]);
960:     vi = aj + diag[i] + 1;
961:     nz = ai[i+1] - diag[i] - 1;

963:     /* Demote accumulator from double to float */
964:     CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
965:     LOAD_PS(tmps,XMM7);

967:     while (nz--) {
968:       PREFETCH_NTA(&v[16]);
969:       idx = 4*(*vi++);

971:       /* Demote solution (so far) from double to float */
972:       CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);

974:       /* 4x4 Matrix-Vector Product with negative accumulation: */
975:       SSE_INLINE_BEGIN_2(tmpx,v)
976:       SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

978:       /* First Column */
979:       SSE_COPY_PS(XMM0,XMM6)
980:       SSE_SHUFFLE(XMM0,XMM0,0x00)
981:       SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
982:       SSE_SUB_PS(XMM7,XMM0)

984:       /* Second Column */
985:       SSE_COPY_PS(XMM1,XMM6)
986:       SSE_SHUFFLE(XMM1,XMM1,0x55)
987:       SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
988:       SSE_SUB_PS(XMM7,XMM1)

990:       SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

992:       /* Third Column */
993:       SSE_COPY_PS(XMM2,XMM6)
994:       SSE_SHUFFLE(XMM2,XMM2,0xAA)
995:       SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
996:       SSE_SUB_PS(XMM7,XMM2)

998:       /* Fourth Column */
999:       SSE_COPY_PS(XMM3,XMM6)
1000:       SSE_SHUFFLE(XMM3,XMM3,0xFF)
1001:       SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1002:       SSE_SUB_PS(XMM7,XMM3)
1003:       SSE_INLINE_END_2
1004:       v += 16;
1005:     }
1006:     v    = aa + ai16;
1007:     ai16 = 16*diag[--i];
1008:     PREFETCH_NTA(aa+ai16+16);
1009:     /*
1010:        Scale the result by the diagonal 4x4 block,
1011:        which was inverted as part of the factorization
1012:     */
1013:     SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
1014:     /* First Column */
1015:     SSE_COPY_PS(XMM0,XMM7)
1016:     SSE_SHUFFLE(XMM0,XMM0,0x00)
1017:     SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

1019:     /* Second Column */
1020:     SSE_COPY_PS(XMM1,XMM7)
1021:     SSE_SHUFFLE(XMM1,XMM1,0x55)
1022:     SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
1023:     SSE_ADD_PS(XMM0,XMM1)

1025:     SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)

1027:     /* Third Column */
1028:     SSE_COPY_PS(XMM2,XMM7)
1029:     SSE_SHUFFLE(XMM2,XMM2,0xAA)
1030:     SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
1031:     SSE_ADD_PS(XMM0,XMM2)

1033:     /* Fourth Column */
1034:     SSE_COPY_PS(XMM3,XMM7)
1035:     SSE_SHUFFLE(XMM3,XMM3,0xFF)
1036:     SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
1037:     SSE_ADD_PS(XMM0,XMM3)

1039:     SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
1040:     SSE_INLINE_END_3

1042:     /* Promote solution from float to double */
1043:     CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);

1045:     /* Apply reordering to t and stream into x.    */
1046:     /* This way, x doesn't pollute the cache.      */
1047:     /* Be careful with size: 2 doubles = 4 floats! */
1048:     idc = 4*(*c--);
1049:     SSE_INLINE_BEGIN_2((float*)&t[idt],(float*)&x[idc])
1050:     /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
1051:     SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
1052:     SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
1053:     /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
1054:     SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
1055:     SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
1056:     SSE_INLINE_END_2
1057:     v    = aa + ai16 + 16;
1058:     idt -= 4;
1059:   }

1061:   ISRestoreIndices(isrow,&rout);
1062:   ISRestoreIndices(iscol,&cout);
1063:   VecRestoreArray(bb,&b);
1064:   VecRestoreArray(xx,&x);
1065:   PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
1066:   SSE_SCOPE_END;
1067:   return 0;
1068: }

1070: #endif

1072: PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1073: {
1074:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1075:   IS                iscol=a->col,isrow=a->row;
1076:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1077:   PetscInt          i,nz,idx,idt,idc;
1078:   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
1079:   const MatScalar   *aa=a->a,*v;
1080:   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
1081:   const PetscScalar *b;

1083:   VecGetArrayRead(bb,&b);
1084:   VecGetArray(xx,&x);
1085:   t    = a->solve_work;

1087:   ISGetIndices(isrow,&rout); r = rout;
1088:   ISGetIndices(iscol,&cout)); c = cout + (n-1;

1090:   /* forward solve the lower triangular */
1091:   idx  = 3*(*r++);
1092:   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
1093:   for (i=1; i<n; i++) {
1094:     v   = aa + 9*ai[i];
1095:     vi  = aj + ai[i];
1096:     nz  = diag[i] - ai[i];
1097:     idx = 3*(*r++);
1098:     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
1099:     while (nz--) {
1100:       idx = 3*(*vi++);
1101:       x1  = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1102:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1103:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1104:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1105:       v  += 9;
1106:     }
1107:     idx    = 3*i;
1108:     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
1109:   }
1110:   /* backward solve the upper triangular */
1111:   for (i=n-1; i>=0; i--) {
1112:     v   = aa + 9*diag[i] + 9;
1113:     vi  = aj + diag[i] + 1;
1114:     nz  = ai[i+1] - diag[i] - 1;
1115:     idt = 3*i;
1116:     s1  = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
1117:     while (nz--) {
1118:       idx = 3*(*vi++);
1119:       x1  = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1120:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1121:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1122:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1123:       v  += 9;
1124:     }
1125:     idc      = 3*(*c--);
1126:     v        = aa + 9*diag[i];
1127:     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
1128:     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
1129:     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
1130:   }
1131:   ISRestoreIndices(isrow,&rout);
1132:   ISRestoreIndices(iscol,&cout);
1133:   VecRestoreArrayRead(bb,&b);
1134:   VecRestoreArray(xx,&x);
1135:   PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);
1136:   return 0;
1137: }

1139: PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
1140: {
1141:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1142:   IS                iscol=a->col,isrow=a->row;
1143:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
1144:   PetscInt          i,nz,idx,idt,idc,m;
1145:   const PetscInt    *r,*c,*rout,*cout;
1146:   const MatScalar   *aa=a->a,*v;
1147:   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
1148:   const PetscScalar *b;

1150:   VecGetArrayRead(bb,&b);
1151:   VecGetArray(xx,&x);
1152:   t    = a->solve_work;

1154:   ISGetIndices(isrow,&rout); r = rout;
1155:   ISGetIndices(iscol,&cout); c = cout;

1157:   /* forward solve the lower triangular */
1158:   idx  = 3*r[0];
1159:   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
1160:   for (i=1; i<n; i++) {
1161:     v   = aa + 9*ai[i];
1162:     vi  = aj + ai[i];
1163:     nz  = ai[i+1] - ai[i];
1164:     idx = 3*r[i];
1165:     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
1166:     for (m=0; m<nz; m++) {
1167:       idx = 3*vi[m];
1168:       x1  = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1169:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1170:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1171:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1172:       v  += 9;
1173:     }
1174:     idx    = 3*i;
1175:     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
1176:   }
1177:   /* backward solve the upper triangular */
1178:   for (i=n-1; i>=0; i--) {
1179:     v   = aa + 9*(adiag[i+1]+1);
1180:     vi  = aj + adiag[i+1]+1;
1181:     nz  = adiag[i] - adiag[i+1] - 1;
1182:     idt = 3*i;
1183:     s1  = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
1184:     for (m=0; m<nz; m++) {
1185:       idx = 3*vi[m];
1186:       x1  = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1187:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1188:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1189:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1190:       v  += 9;
1191:     }
1192:     idc      = 3*c[i];
1193:     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
1194:     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
1195:     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
1196:   }
1197:   ISRestoreIndices(isrow,&rout);
1198:   ISRestoreIndices(iscol,&cout);
1199:   VecRestoreArrayRead(bb,&b);
1200:   VecRestoreArray(xx,&x);
1201:   PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);
1202:   return 0;
1203: }

1205: PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
1206: {
1207:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1208:   IS                iscol=a->col,isrow=a->row;
1209:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1210:   PetscInt          i,nz,idx,idt,idc;
1211:   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
1212:   const MatScalar   *aa=a->a,*v;
1213:   PetscScalar       *x,s1,s2,x1,x2,*t;
1214:   const PetscScalar *b;

1216:   VecGetArrayRead(bb,&b);
1217:   VecGetArray(xx,&x);
1218:   t    = a->solve_work;

1220:   ISGetIndices(isrow,&rout); r = rout;
1221:   ISGetIndices(iscol,&cout)); c = cout + (n-1;

1223:   /* forward solve the lower triangular */
1224:   idx  = 2*(*r++);
1225:   t[0] = b[idx]; t[1] = b[1+idx];
1226:   for (i=1; i<n; i++) {
1227:     v   = aa + 4*ai[i];
1228:     vi  = aj + ai[i];
1229:     nz  = diag[i] - ai[i];
1230:     idx = 2*(*r++);
1231:     s1  = b[idx]; s2 = b[1+idx];
1232:     while (nz--) {
1233:       idx = 2*(*vi++);
1234:       x1  = t[idx]; x2 = t[1+idx];
1235:       s1 -= v[0]*x1 + v[2]*x2;
1236:       s2 -= v[1]*x1 + v[3]*x2;
1237:       v  += 4;
1238:     }
1239:     idx    = 2*i;
1240:     t[idx] = s1; t[1+idx] = s2;
1241:   }
1242:   /* backward solve the upper triangular */
1243:   for (i=n-1; i>=0; i--) {
1244:     v   = aa + 4*diag[i] + 4;
1245:     vi  = aj + diag[i] + 1;
1246:     nz  = ai[i+1] - diag[i] - 1;
1247:     idt = 2*i;
1248:     s1  = t[idt]; s2 = t[1+idt];
1249:     while (nz--) {
1250:       idx = 2*(*vi++);
1251:       x1  = t[idx]; x2 = t[1+idx];
1252:       s1 -= v[0]*x1 + v[2]*x2;
1253:       s2 -= v[1]*x1 + v[3]*x2;
1254:       v  += 4;
1255:     }
1256:     idc      = 2*(*c--);
1257:     v        = aa + 4*diag[i];
1258:     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
1259:     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
1260:   }
1261:   ISRestoreIndices(isrow,&rout);
1262:   ISRestoreIndices(iscol,&cout);
1263:   VecRestoreArrayRead(bb,&b);
1264:   VecRestoreArray(xx,&x);
1265:   PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
1266:   return 0;
1267: }

1269: PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
1270: {
1271:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1272:   IS                iscol=a->col,isrow=a->row;
1273:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
1274:   PetscInt          i,nz,idx,jdx,idt,idc,m;
1275:   const PetscInt    *r,*c,*rout,*cout;
1276:   const MatScalar   *aa=a->a,*v;
1277:   PetscScalar       *x,s1,s2,x1,x2,*t;
1278:   const PetscScalar *b;

1280:   VecGetArrayRead(bb,&b);
1281:   VecGetArray(xx,&x);
1282:   t    = a->solve_work;

1284:   ISGetIndices(isrow,&rout); r = rout;
1285:   ISGetIndices(iscol,&cout); c = cout;

1287:   /* forward solve the lower triangular */
1288:   idx  = 2*r[0];
1289:   t[0] = b[idx]; t[1] = b[1+idx];
1290:   for (i=1; i<n; i++) {
1291:     v   = aa + 4*ai[i];
1292:     vi  = aj + ai[i];
1293:     nz  = ai[i+1] - ai[i];
1294:     idx = 2*r[i];
1295:     s1  = b[idx]; s2 = b[1+idx];
1296:     for (m=0; m<nz; m++) {
1297:       jdx = 2*vi[m];
1298:       x1  = t[jdx]; x2 = t[1+jdx];
1299:       s1 -= v[0]*x1 + v[2]*x2;
1300:       s2 -= v[1]*x1 + v[3]*x2;
1301:       v  += 4;
1302:     }
1303:     idx    = 2*i;
1304:     t[idx] = s1; t[1+idx] = s2;
1305:   }
1306:   /* backward solve the upper triangular */
1307:   for (i=n-1; i>=0; i--) {
1308:     v   = aa + 4*(adiag[i+1]+1);
1309:     vi  = aj + adiag[i+1]+1;
1310:     nz  = adiag[i] - adiag[i+1] - 1;
1311:     idt = 2*i;
1312:     s1  = t[idt]; s2 = t[1+idt];
1313:     for (m=0; m<nz; m++) {
1314:       idx = 2*vi[m];
1315:       x1  = t[idx]; x2 = t[1+idx];
1316:       s1 -= v[0]*x1 + v[2]*x2;
1317:       s2 -= v[1]*x1 + v[3]*x2;
1318:       v  += 4;
1319:     }
1320:     idc      = 2*c[i];
1321:     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
1322:     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
1323:   }
1324:   ISRestoreIndices(isrow,&rout);
1325:   ISRestoreIndices(iscol,&cout);
1326:   VecRestoreArrayRead(bb,&b);
1327:   VecRestoreArray(xx,&x);
1328:   PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
1329:   return 0;
1330: }

1332: PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1333: {
1334:   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
1335:   IS                iscol=a->col,isrow=a->row;
1336:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1337:   PetscInt          i,nz;
1338:   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
1339:   const MatScalar   *aa=a->a,*v;
1340:   PetscScalar       *x,s1,*t;
1341:   const PetscScalar *b;

1343:   if (!n) return 0;

1345:   VecGetArrayRead(bb,&b);
1346:   VecGetArray(xx,&x);
1347:   t    = a->solve_work;

1349:   ISGetIndices(isrow,&rout); r = rout;
1350:   ISGetIndices(iscol,&cout)); c = cout + (n-1;

1352:   /* forward solve the lower triangular */
1353:   t[0] = b[*r++];
1354:   for (i=1; i<n; i++) {
1355:     v  = aa + ai[i];
1356:     vi = aj + ai[i];
1357:     nz = diag[i] - ai[i];
1358:     s1 = b[*r++];
1359:     while (nz--) {
1360:       s1 -= (*v++)*t[*vi++];
1361:     }
1362:     t[i] = s1;
1363:   }
1364:   /* backward solve the upper triangular */
1365:   for (i=n-1; i>=0; i--) {
1366:     v  = aa + diag[i] + 1;
1367:     vi = aj + diag[i] + 1;
1368:     nz = ai[i+1] - diag[i] - 1;
1369:     s1 = t[i];
1370:     while (nz--) {
1371:       s1 -= (*v++)*t[*vi++];
1372:     }
1373:     x[*c--] = t[i] = aa[diag[i]]*s1;
1374:   }

1376:   ISRestoreIndices(isrow,&rout);
1377:   ISRestoreIndices(iscol,&cout);
1378:   VecRestoreArrayRead(bb,&b);
1379:   VecRestoreArray(xx,&x);
1380:   PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);
1381:   return 0;
1382: }

1384: PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
1385: {
1386:   Mat_SeqBAIJ       *a    = (Mat_SeqBAIJ*)A->data;
1387:   IS                iscol = a->col,isrow = a->row;
1388:   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag = a->diag,nz;
1389:   const PetscInt    *rout,*cout,*r,*c;
1390:   PetscScalar       *x,*tmp,sum;
1391:   const PetscScalar *b;
1392:   const MatScalar   *aa = a->a,*v;

1394:   if (!n) return 0;

1396:   VecGetArrayRead(bb,&b);
1397:   VecGetArray(xx,&x);
1398:   tmp  = a->solve_work;

1400:   ISGetIndices(isrow,&rout); r = rout;
1401:   ISGetIndices(iscol,&cout); c = cout;

1403:   /* forward solve the lower triangular */
1404:   tmp[0] = b[r[0]];
1405:   v      = aa;
1406:   vi     = aj;
1407:   for (i=1; i<n; i++) {
1408:     nz  = ai[i+1] - ai[i];
1409:     sum = b[r[i]];
1410:     PetscSparseDenseMinusDot(sum,tmp,v,vi,nz);
1411:     tmp[i] = sum;
1412:     v     += nz; vi += nz;
1413:   }

1415:   /* backward solve the upper triangular */
1416:   for (i=n-1; i>=0; i--) {
1417:     v   = aa + adiag[i+1]+1;
1418:     vi  = aj + adiag[i+1]+1;
1419:     nz  = adiag[i]-adiag[i+1]-1;
1420:     sum = tmp[i];
1421:     PetscSparseDenseMinusDot(sum,tmp,v,vi,nz);
1422:     x[c[i]] = tmp[i] = sum*v[nz]; /* v[nz] = aa[adiag[i]] */
1423:   }

1425:   ISRestoreIndices(isrow,&rout);
1426:   ISRestoreIndices(iscol,&cout);
1427:   VecRestoreArrayRead(bb,&b);
1428:   VecRestoreArray(xx,&x);
1429:   PetscLogFlops(2.0*a->nz - A->cmap->n);
1430:   return 0;
1431: }