multigrid.cpp 128 KB
Newer Older
Christopher Subich's avatar
Christopher Subich committed
1
2
3
4
5
6
7
8
#include "multigrid.hpp"
#include "Par_util.hpp"

#include <blitz/array.h>
#include <iostream>
#include <blitz/tinyvec-et.h>

#include "umfpack.h"
9
#include "timing.hpp"
Christopher Subich's avatar
Christopher Subich committed
10
11
12
13

using namespace blitz;
using namespace std;

14

Christopher Subich's avatar
Christopher Subich committed
15
//#define COARSE_GRID_SIZE 512
16
#define COARSE_GRID_SIZE 8
Christopher Subich's avatar
Christopher Subich committed
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#define SYNC(__x__) { for (int _i = 0; _i < nproc; _i++) { \
                     if (myrank == _i) { \
                        __x__; \
                     } MPI_Barrier(my_comm); } MPI_Barrier(my_comm);}

/* Prototype the DGESV LAPACK routine */
extern "C" {
   void dgesv_(int * n, int * nrhs, double * a, int * lda, 
         int * ipiv, double * b, int * ldb, int * info);
   void dgbsv_(int * n, int * kl, int * ku, int * nrhs,
         double * ab, int * ldab, int * ipiv, double * b,
         int * ldb, int * info);
}

void get_fd_operator(Array<double,1> & x, Array<double,2> & Dx, Array<double,2> & Dxx) {
   /* Builds FD derivative operators (to second order, save at boundaries)
      for the first and second derivatives, given an arbitrary grid x */
   /* For output, Dx(i,j) will contain the operator local to x(i).
      So Dx*f @ x=j == Dx(i,0)*f(j-1) + Dx(i,1)*f(j) + Dx(i,2)*f(j+1) */

   assert(x.lbound(firstDim) == 0 && x.extent(firstDim) >= 2);
   
   /* Assert that x and operator extents are the same */
   assert(x.lbound(firstDim) == Dx.lbound(firstDim) &&
         x.ubound(firstDim) == Dx.ubound(firstDim));
   assert(x.lbound(firstDim) == Dxx.lbound(firstDim) &&
         x.ubound(firstDim) == Dxx.ubound(firstDim));

   /* Assert that the operator bounds are proper */
   assert(Dx.lbound(secondDim) == 0 && Dx.ubound(secondDim) == 2 &&
         Dxx.lbound(secondDim) == 0 && Dxx.ubound(secondDim) == 2);

   /* The first point (boundary) is easy, since it's one-sided differencing,
      and Dxx does not exist */
   int lbound = 0; int ubound = x.ubound(firstDim);

   Dx(lbound,0) = 0; // No reference to the left of the leftmost boundary
   Dx(lbound,1) = -1/(x(lbound+1)-x(lbound));
   Dx(lbound,2) = 1/(x(lbound+1)-x(lbound));
   Dx(ubound,2) = 0; // And none to the right of the rightmost boundary
   Dx(ubound,1) = 1/(x(ubound)-x(ubound-1));
   Dx(ubound,0) = -1/(x(ubound)-x(ubound-1));
   // No Dxx at the boundaries
   Dxx(lbound,0) = Dxx(lbound,1) = Dxx(lbound,2) = 0;
   Dxx(ubound,0) = Dxx(ubound,1) = Dxx(ubound,2) = 0;
   
   /* Now, loop over interior points */
   /* Matrix operator to solve for FD coefficients */
   Array<double,2> M(3,3,blitz::columnMajorArray);
   /* "vector" for Dx, Dxx coefficients.  Input is [0 1 0;0 0 1] and output is
      the coefficients for Dx/Dxx */
   Array<double,2> vec(3,2,blitz::columnMajorArray);
   int IPIV[3]; // Pivot array for DGESV
   /* Other constants for DGESV */
   int N=3, NRHS=2, LDA=3, LDB=3, INFO=-1;
   for (int j = lbound+1; j<= ubound-1; j++) {
      /* First, build M using a Taylor series approach.  We'll define
         the constant, dx, and dxx terms via differences on x, and then
         solve for the weights that give [0 1 0] for Dx and [0 0 1] for Dxx */
      /* Left and right spacings */
      double xl = x(j-1)-x(j);
      double xr = x(j+1)-x(j);
      /* Now, build the Taylor series about x=x(j), and evaluate
         at j-1, j, and j+1 */
      /* f(x) = f(x_j) + f_x(x_j)*(x-x_j) + f_xx(x_j)*(x-x_j)^2/2 */
      /* First row of matrix -- constant terms */
      M(0,0) = M(0,1) = M(0,2) = 1;
      /* Second row -- Dx term */
      M(1,0) = xl; M(1,1) = 0; M(1,2) = xr;
      /* Third row, Dxx term */
      M(2,0) = xl*xl/2, M(2,1) = 0; M(2,2) = xr*xr/2;

      /* Now, the rhs vector */
      vec(0,0) = vec(0,1) = 0;
      vec(1,0) = 1; vec(1,1) = 0;
      vec(2,0) = 0; vec(2,1) = 1;

      /* Now, M\vec, AKA an ugly DGESV call */
//      std::cerr << M;
      dgesv_(&N, &NRHS, M.data(), &LDA, IPIV, vec.data(), &LDB, &INFO);
      assert(INFO == 0);

//      std::cerr << vec;

      /* And assign results to Dx/Dxx */
      Dx(j,0) = vec(0,0); Dxx(j,0) = vec(0,1);
      Dx(j,1) = vec(1,0); Dxx(j,1) = vec(1,1);
      Dx(j,2) = vec(2,0); Dxx(j,2) = vec(2,1);
   }
}

void get_local_split(int extent, int rank, int nproc, int & l_lbound, int & l_ubound) {
   /* Allocate evenly-split pairs to as many processors as possible.
      An odd number of points means that the last processor gets an extra */
   int npairs = extent/2;
   bool odd = extent % 2;
   // Each processor will get at least base_number pairs
   int base_number = npairs / nproc;
   int extra_pairs = npairs % nproc;
   // If extent < 2*nproc, not every processor will get points
   int last_processor = -1;
   // If there are more than enough pairs to go around, each processor gets some
   if (npairs >= nproc) {
      last_processor = nproc - 1;
   // Otherwise, only extra pairs are distributed
   } else {
      last_processor = extra_pairs - 1;
   }
   // Processors 0-(extra_pairs-1) will have a bonus pairing
   l_lbound = 2*(base_number*rank + (extra_pairs > rank ? rank : extra_pairs));
   l_ubound = l_lbound + 2*(base_number + (extra_pairs > rank))-1;
   if (odd && rank == last_processor)
      l_ubound += 1;
}

/* Rebalance an array such that the global structure is preserved, but
   orig and balance might have different splittings.  The assumptions here
   are fairly general, but we will require that orig/balance both have
   increasing lbound/ubounds over the processors and are non-overlapping */

/* As a performance note, this can be split into two functions -- the first
   constructs the mapping of communication (who talks to whom), and the
   second actually performs the communication from this mapping.  For any
   given case, we're probably going to re-use the balancing several times */
#define MIN(x,y) ((x)<(y) ? (x) : (y))
#define MAX(x,y) ((x)<(y) ? (y) : (x))
      
144
void MG_Solver::rebalance_line(Array<double,1> & orig, Array<double,1> & balance, MPI_Comm c, BalanceGroup btype) {
Christopher Subich's avatar
Christopher Subich committed
145
146
147
148
149
150
151
152
153
154
155
   TinyVector<int,2> o_base, o_extent;
   o_base(0) = orig.lbound(firstDim); o_extent(0) = orig.extent(firstDim);
   o_base(1) = 0; o_extent(1) = 1;
//   Array<double,2> o_2d(Range(orig.lbound(firstDim),orig.ubound(firstDim)),Range(0,0));   
   Array<double,2> o_2d(o_base,o_extent);
   TinyVector<int,2> b_base, b_extent;
   b_base(0) = balance.lbound(firstDim); b_extent(0) = balance.extent(firstDim);
   b_base(1) = 0; b_extent(1) = 1;
//   Array<double,2> b_2d(Range(balance.lbound(firstDim),balance.ubound(firstDim)),Range(0,0));
   Array<double,2> b_2d(b_base,b_extent);
   o_2d(Range::all(),0) = orig;
156
   rebalance_array(o_2d,b_2d,c,btype);
Christopher Subich's avatar
Christopher Subich committed
157
158
   if (balance.extent(firstDim) > 0) balance = b_2d(Range::all(),0);
}
159
void MG_Solver::rebalance_array(Array<double,2> & orig, Array<double,2> & balance, MPI_Comm c, BalanceGroup btype) {
160
   timing_push("rebalance");
Christopher Subich's avatar
Christopher Subich committed
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
   int o_lbound = orig.lbound(firstDim);
   int o_ubound = orig.ubound(firstDim);
   int b_lbound = balance.lbound(firstDim);
   int b_ubound = balance.ubound(firstDim);
   int size_z = orig.extent(secondDim);
   int myrank, nproc;
   double * orig_data; // Pointer to the original data
   static Array<double,2> * temp_array = 0;
   // Now, we need to ensure that orig and balance have the same storage ordering
   if (!all(orig.ordering() == balance.ordering())) {
      GeneralArrayStorage<2> bal_storage(balance.ordering(),true);
      if (temp_array && (!all(temp_array->lbound() == o_lbound) ||
               !all(temp_array->ubound() == o_ubound) ||
               !all(temp_array->ordering() == balance.ordering()))) {
         delete temp_array;
         temp_array = 0;
      }
      if (!temp_array)
         temp_array = new Array<double,2> (orig.lbound(),orig.extent(),bal_storage);
      *temp_array = orig;
      orig_data = temp_array->data();
   } else {
      orig_data = orig.data();
   }
      
   MPI_Comm_size(c,&nproc);
   MPI_Comm_rank(c,&myrank);
188
   //fprintf(stderr,"Rebalancing array, %d/%d\n",myrank+1,nproc);
Christopher Subich's avatar
Christopher Subich committed
189
   /* Construct a view of the array bounds shared by all processors */
190
191
192
193
194
195
196
197
198
199
200
201
202
   int *o_lbounds, *o_ubounds, *b_lbounds, *b_ubounds;
   if (!rebalance_setup[btype]) {
      MPI_Allgather(&o_lbound,1,MPI_INT,to_lbounds[btype],1,MPI_INT,c);
      MPI_Allgather(&o_ubound,1,MPI_INT,to_ubounds[btype],1,MPI_INT,c);
      MPI_Allgather(&b_lbound,1,MPI_INT,tb_lbounds[btype],1,MPI_INT,c);
      MPI_Allgather(&b_ubound,1,MPI_INT,tb_ubounds[btype],1,MPI_INT,c);
      rebalance_setup[btype] = true;
   }
   o_lbounds = to_lbounds[btype];
   o_ubounds = to_ubounds[btype];
   b_lbounds = tb_lbounds[btype];
   b_ubounds = tb_ubounds[btype];
   //fprintf(stderr,"Rebalancng array with global dimensions %d-%d\n",o_lbounds[0],o_ubounds[nproc-1]);
Christopher Subich's avatar
Christopher Subich committed
203
204
205

   /* Now, construct the mapping of counts and displacements by processor */
   int s_counts[nproc], s_displs[nproc], r_counts[nproc], r_displs[nproc];
206
   int s_total = 0, r_total = 0;
Christopher Subich's avatar
Christopher Subich committed
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
   for (int k = 0; k < nproc; k++) {
      /* If our orig array overlaps their balance array... */
      if (o_lbound <= b_ubounds[k] && o_ubound >= b_lbounds[k]) {
         /* We want to send the region from the lowest to highest points
            of overlap */
         /* The lowest possible index we can send is our lower bound, and
            the highest possible we want to start sending is the recipient's
            lower bound */
         int send_lb = MAX(b_lbounds[k],o_lbound);
         /* Reversing the logic of above, the highest possible index to
            send is our upper bound, and the highest we want to send is
            the recipient's upper bound */
         int send_ub = MIN(o_ubound,b_ubounds[k]);
         s_counts[k] = size_z*(1+send_ub-send_lb);
         s_displs[k] = (send_lb-o_lbound)*size_z;
222
         s_total += s_counts[k];
Christopher Subich's avatar
Christopher Subich committed
223
224
225
226
227
228
229
230
231
      } else {
         s_counts[k] = s_displs[k] = 0;
      }
      /* And for receive, if our destination array overlaps their orig array... */
      if (b_lbound <= o_ubounds[k] && b_ubound >= o_lbounds[k]) {
         int rec_lb = MAX(b_lbound,o_lbounds[k]);
         int rec_ub = MIN(b_ubound,o_ubounds[k]);
         r_counts[k] = size_z*(1+rec_ub-rec_lb);
         r_displs[k] = (rec_lb-b_lbound)*size_z;
232
         r_total += r_counts[k];
Christopher Subich's avatar
Christopher Subich committed
233
234
235
236
237
238
239
240
241
242
243
244
245
      } else {
         r_counts[k] = r_displs[k] = 0;
      }
   }

   // Finally, call MPI_Alltoallv to perform the balancing
#if 0 // debug output
   for (int qq = 0; qq < nproc; qq++) {
      if (qq == myrank) {
         fprintf(stderr,"%d: ",myrank);
         for (int k = 0; k < nproc; k++) {
            fprintf(stderr,"%d/%d ",s_counts[k],r_counts[k]);
         }
246
         fprintf(stderr," -- %d+%d // %d+%d\n",orig.size(),s_total,balance.size(),r_total);
Christopher Subich's avatar
Christopher Subich committed
247
      }
248
      MPI_Barrier(c);
Christopher Subich's avatar
Christopher Subich committed
249
250
   }
#endif
251
252

   assert(orig.size() == s_total && balance.size() == r_total);
Christopher Subich's avatar
Christopher Subich committed
253
   
254
   timing_push("rebalance_mpi");
Christopher Subich's avatar
Christopher Subich committed
255
256
   MPI_Alltoallv(orig_data,s_counts,s_displs,MPI_DOUBLE,
         balance.data(),r_counts,r_displs,MPI_DOUBLE,c);
257
258
   timing_pop(); // rebalance_mpi
   timing_pop(); // rebalance
Christopher Subich's avatar
Christopher Subich committed
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
   // And done
}
void MG_Solver::set_x_symmetry(SYM_TYPE s) {
   // If previous symmetry type was different, then the numeric factor is bad
   if (symmetry_type != s) coarse_numeric_ok = false;
   symmetry_type = s;
   // Propagate change to the coarse operator
   if (coarse_solver) coarse_solver->set_x_symmetry(s);

   // And recheck BC consistency, since an odd->even change can make this
   // an indefinite problem
   if (coarsest_level)
      check_bc_consistency();
}
/* Construct local arrays, resizing the blitz::Arrays to their proper size */
MG_Solver::MG_Solver(Array<double,1> xvals, blitz::Array<double,1> zvals,
      SYM_TYPE sym, MPI_Comm c):
   coarse_solver(0), coarse_x_lbound(0), coarse_x_ubound(-1),
   coarse_symbolic_ok(false), coarse_numeric_ok(false), any_dxz(false),
   bc_tangent(false), bc_normal(false), numeric_factor(0), symbolic_factor(0),
   sparse_size(0), A_rows(), A_cols(), A_double(),
280
   coarsest_level(false), indefinite_problem(false)
Christopher Subich's avatar
Christopher Subich committed
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
{
   my_comm = c;
   symmetry_type = sym;
   MPI_Comm_size(c,&nproc); MPI_Comm_rank(c,&myrank);
   /* Get the local sizings, and find out what section of the array we're
      actually responsible for */
   size_x = xvals.extent(firstDim) - (sym?2:0);
   size_z = zvals.extent(firstDim);
   get_local_split(size_x,myrank,nproc,local_x_lbound,local_x_ubound);
   local_size_x = local_x_ubound - local_x_lbound +1;

   /* Resize and re-index the arrays, so that they point to the local section
      of the global operator space */
   TinyVector<int,2> base_vector; 
   base_vector(0) = local_x_lbound;
   base_vector(1) = 0;
   uxx.resize(local_size_x,size_z); uxx.reindexSelf(base_vector);
   uzz.resize(local_size_x,size_z); uzz.reindexSelf(base_vector);
   uxz.resize(local_size_x,size_z); uxz.reindexSelf(base_vector);
   ux.resize(local_size_x,size_z); ux.reindexSelf(base_vector);
   uz.resize(local_size_x,size_z); uz.reindexSelf(base_vector);

   /* Resize Dx/Dxx to include only the local part of the tensor product */
   Dx.resize(local_size_x,3); Dx.reindexSelf(base_vector);
   Dxx.resize(local_size_x,3); Dxx.reindexSelf(base_vector);

   /* Top and bottom BCs need resized in the same manner */
   TinyVector<int,1> base_1d; base_1d(0) = local_x_lbound;
309
310
311
312
313
314
   u_bot.resize(local_size_x); u_bot.reindexSelf(base_1d); u_bot=0;
   ux_bot.resize(local_size_x); ux_bot.reindexSelf(base_1d); ux_bot=0;
   uz_bot.resize(local_size_x); uz_bot.reindexSelf(base_1d); uz_bot=0;
   u_top.resize(local_size_x); u_top.reindexSelf(base_1d); u_top=0;
   ux_top.resize(local_size_x); ux_top.reindexSelf(base_1d); ux_top=0;
   uz_top.resize(local_size_x); uz_top.reindexSelf(base_1d); uz_top=0;
Christopher Subich's avatar
Christopher Subich committed
315
316

   u_left.resize(size_z); u_right.resize(size_z);
317
   u_left=0; u_right=0;
Christopher Subich's avatar
Christopher Subich committed
318
   ux_left.resize(size_z); ux_right.resize(size_z);
319
   ux_left=0; ux_right=0;
Christopher Subich's avatar
Christopher Subich committed
320
   uz_left.resize(size_z); uz_right.resize(size_z);
321
   uz_left=0; uz_right=0;
Christopher Subich's avatar
Christopher Subich committed
322
323
324
325

   /* Dz/Dzz don't have any local/global pairing */
   Dz.resize(size_z,3); Dzz.resize(size_z,3);

326
327
328
329
330
331
332
333
334
   // And arrays for rebalancing
   for (int i = 0; i < 4; i++) {
      to_lbounds[i] = new int[nproc];
      to_ubounds[i] = new int[nproc];
      tb_lbounds[i] = new int[nproc];
      tb_ubounds[i] = new int[nproc];
      rebalance_setup[i] = false;
   }

Christopher Subich's avatar
Christopher Subich committed
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
   /* Compute global Dx/Dxx operators */
   Array<double,2> global_Dx(Range(xvals.lbound(firstDim),xvals.ubound(firstDim)),Range(0,2));
   Array<double,2> global_Dxx(Range(xvals.lbound(firstDim),xvals.ubound(firstDim)),Range(0,2));
   get_fd_operator(xvals,global_Dx,global_Dxx);

   /* Assign a slice of the global FD operator to our local view.  If there is symmetry,
      the relevant indicies shift by one because x is assumed to include ghost points
      reflecting the boundaries */
   Dx = global_Dx(Range(local_x_lbound +(sym?1:0) ,local_x_ubound + (sym?1:0)),Range(0,2));
   Dxx = global_Dxx(Range(local_x_lbound + (sym?1:0),local_x_ubound + (sym?1:0)),Range(0,2));

   /* There aren't any such complexities for z */
   get_fd_operator(zvals,Dz,Dzz);

   // To coarsen, we'll about halve the number of interior points, keeping boundaries.
   // Because of the ghost point deal, we will always have "boundaries" in the x_line.
   // So, the number of interior points is equal to x_line.extent() - 2
   int interior_points = xvals.extent(firstDim)-2;
   
   // If there are only three x-points, we're at the coarsest level.  This represents
   // one interior point and two boundaries.
//   if (interior_points <= 3) return;
   if ((nproc == 1) && sym != SYM_NONE && interior_points <= COARSE_GRID_SIZE) {
      coarsest_level = true;
      if (myrank == 0) {
         A_cols = new int[size_x*size_z+2];
         A_rows = new int[size_x*size_z*11];
         A_double = new double[size_x*size_z*11];
//         A_rows = new int[10*((size_x-2)*(size_z-2))+5*(2*(size_x-2)+2*(size_z-2))+16+size_x*size_z];
//         A_double = new double[10*((size_x-2)*(size_z-2))+5*(2*(size_x-2)+2*(size_z-2))+16+size_x*size_z];
      }
      return;
   }
   else if ((nproc == 1) && interior_points <= (COARSE_GRID_SIZE-2)) {
      coarsest_level = true;
      if (myrank == 0) {
         A_cols = new int[size_x*size_z+2];
         A_rows = new int[11*(size_x*size_z)];
         A_double = new double[11*(size_x*size_z)];
      }
      return;
   }

   // Otherwise, we want to build a coarse-grid operator
   // The number of interior points is halved, rounding down 
   int size_coarse_x = 2+(interior_points)/2;
   if (sym != SYM_NONE) size_coarse_x = 2+(interior_points+1)/2;
   Array<double,1> coarse_x(size_coarse_x);

//   fprintf(stderr,"Coarse x has size %d\n",size_coarse_x);

   if (sym != SYM_NONE || interior_points % 2 == 1) {
      /* If there's an odd number of interior points, we're set -- just include
         every other one, and we're done modulo smmetry and ghost points */
      if (sym == SYM_NONE) {
         for (int i = 0; i < size_coarse_x; i++)
            coarse_x(i) = xvals(2*i);
      } else {
         for (int i = 0; i < size_coarse_x-2; i++)
            coarse_x(i+1) = xvals(2*i+1);
      }
      kept_interval = -1;
   } else {
      /* If there's an even number of interior points, we have a finer decision
         to make.  We can't evenly halve an odd number of intervals, so we
         get to pick the location where we break our "every other point"
         coarsening.  EG, go from
         x-x-x-x-x (where x is kept, n==9 here) to
         x-x-xx-x-x (n == 10, n==8 interior)
      */
      double dx = 0;
      for (int i = 0; i < size_x-1 + (sym?2:0); i+=2) {
         if (fabs(xvals(i+1)-xvals(i)) > dx) {
            dx = fabs(xvals(i+1)-xvals(i));
            kept_interval = i;
         }
      }
//      fprintf(stderr,"Kept interval is %d\n",kept_interval);
      /* Now, with the kept interval in mind, keep every 2nd point until the
         magic interval, then the point right after said interval, then
         every 2nd point thereafter */
      if (sym == SYM_NONE) {
         for (int i = 0; i < size_coarse_x; i++) 
            coarse_x(i) = xvals(2*i - (2*i > kept_interval));
      } else {
//         for (int i = 0; i < size_coarse_x-2; i++) {
//            coarse_x(i+1) = xvals(2*i + 1 - ((2*i) > kept_interval));
//         }
//         kept_interval -= 1;
      }
         
   }
   /* If we have symmetry, we need to correct the left and right points
      (the "ghost boundary points") to reflect our new interiors.  If the problem
      has symmetry, then the first and last points in xvals reflect the location
      of the second points, mirrored about the true boundary.  We want to keep
      this new mirroring */
   if (sym!= SYM_NONE) {
      if (sym != SYM_PERIODIC) {
         double left_bdy = 0.5*(xvals(0)+xvals(1));
         double right_bdy = 0.5*(xvals(size_x)+xvals(size_x+1));
         coarse_x(0) = coarse_x(1)-2*(coarse_x(1)-left_bdy);
         coarse_x(size_coarse_x-1) = 2*right_bdy - coarse_x(size_coarse_x-2);
         // Also, tweak the "kept interval", since we defined it in the
         // x-including-ghost-point frame and that is no longer appropriate
//         kept_interval -= 1;
      } else { // Periodic
         // In the periodic case, it's kosher to shift the domain boundaries
         // by a fixed amount.  This ensures accuracy in derivatives, since
         // the right ghost point can reflect the true location of the left-most
         // point and vice versa.  Otherwise, differentiation would be inconsistent
         double left_bdy = 0.5*(xvals(0)+xvals(1));
         double right_bdy = 0.5*(xvals(size_x)+xvals(size_x+1));
         double mind_gap = (coarse_x(1)-left_bdy) + (right_bdy - coarse_x(size_coarse_x-2));
         coarse_x(0) = coarse_x(1)-mind_gap;
         coarse_x(size_coarse_x-1) = coarse_x(size_coarse_x-2)+mind_gap;
//         kept_interval -= 1;
      }
   }
         
//   if (myrank == 0) {
//      fprintf(stdout,"x:\n");
//      cout << xvals;
//      fprintf(stdout,"Coarse x:\n");
//      cout << coarse_x;
//   }
//   MPI_Barrier(my_comm);
   // Now, find out what distribution of points we'll have on the coarse array
   int c_extent = coarse_x.extent(firstDim) - (sym ? 2 : 0);
   get_local_split(c_extent,myrank,nproc,coarse_x_lbound,coarse_x_ubound);
   // If the number of coarse points is less than the coarse-grid-size,
   // dump it all on the first processor so we can go ahead with the
   // sparse solve
   if (c_extent <= COARSE_GRID_SIZE) {
      if (myrank == 0) {
         coarse_x_lbound = 0;
         coarse_x_ubound = c_extent - 1;
      } else {
         coarse_x_lbound = c_extent;
         coarse_x_ubound = c_extent - 1;
      }
   }

   // Rebase the coarse_x and coarse_f arrays
   coarse_u.resize(coarse_x_ubound-coarse_x_lbound+1,size_z);
   coarse_u.reindexSelf(TinyVector<int,2>(coarse_x_lbound,0));
   coarse_f.resize(coarse_x_ubound-coarse_x_lbound+1,size_z);
   coarse_f.reindexSelf(TinyVector<int,2>(coarse_x_lbound,0));

   // And resize/rebase the local coarse arrays

   // Need to know whether we have the kept point, if any
   bool have_kept_interval = (local_x_lbound <= kept_interval) && (local_x_ubound >= kept_interval);
   
   // Define the local coarse extent
   int local_coarse_extent = local_size_x/2 + // Half the local size, plus...
                           have_kept_interval + // if we have the kept point, plus..
                           // whether we're the last proc with an odd number of points
                           ((myrank == nproc-1) && (local_size_x%2)); 
                            // (both of these won't be true at once)
   int local_coarse_lbound = local_x_lbound/2 + (kept_interval >= 0 && local_x_lbound > kept_interval);
//   fprintf(stdout,"COARSE: %d %d %d\n",myrank,local_coarse_lbound,local_coarse_extent);
   local_coarse_2d.resize(local_coarse_extent,size_z);
   local_coarse_1d.resize(local_coarse_extent);
   local_coarse_2d.reindexSelf(TinyVector<int,2>(local_coarse_lbound,0));
   local_coarse_1d.reindexSelf(TinyVector<int,1>(local_coarse_lbound));

//   for (int i = 0; i < nproc; i++) {
//      if (myrank == i)
//         fprintf(stdout,"Process %d has %d to %d on coarse array\n",myrank,coarse_x_lbound,coarse_x_ubound);
//      MPI_Barrier(my_comm);
//   }

   // Now, it's possible that not all processors will be used on the coarse
   // level -- that's natural as the number of x-lines approaches 3 (one
   // interior line).  So we'll want to -split- the communicator and only
   // make the coarser-level-object if this node has something to do there.

   if (coarse_x_ubound >= coarse_x_lbound) {
      MPI_Comm_split(my_comm,1,0,&coarse_comm);
      int sz2; MPI_Comm_size(coarse_comm,&sz2);
//      for (int i = 0; i < sz2; i++) {
//         if (i == myrank)
//            fprintf(stdout,"Process %d is recursing\n",myrank);
//         MPI_Barrier(coarse_comm);
//      }
      coarse_solver = new MG_Solver(coarse_x,zvals,sym,coarse_comm);
   } else {
      MPI_Comm_split(my_comm,0,0,&coarse_comm);
      coarse_solver = 0;
   }
   
}
  
/* Line solve */
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
#if 1
void line_solve(Array<double,1> & u, Array<double,1> & f,
      Array<double,1> & A, Array<double,1> & B,
      Array<double,1> & C, double a0, double a1, 
      double b0, double b1, Array<double,2> & Dz,
      Array<double,2> & Dzz) {
   /* The objective here is to solve the line operator as a tridiangonal banded
      matrix.  While LAPACK has solvers for this, timing suggests that the basic
      banded solver is much slower than expected.  Problems that arise in SPINS
      should not be difficult ones that require partial pivoting.  The problem
      to solve is:
          A(z)*Dzz*u + B(z)*dz*u + C(z)*u = f(z), with
          a0*u + b0*Dz*u = f0 @ z=0 (bottom) and
          a1*u + b1*Dz*u = f1 @ z=1 (top) */

   timing_push("line_solve");

   // Build the band matrix itself, re-using the LAPACK formulation
   blitz::firstIndex ii; 
   blitz::Range all = Range::all();
   static Array<double,1> an(u.extent(firstDim)), bn(u.extent(firstDim)), cn(u.extent(firstDim));
   int top = u.ubound(firstDim);
   an = 0; bn = 0; cn = 0;
   /* Set the band matrix based on our input parameters */
   // Main diagonal
   bn(all) = A*Dzz(all,1) + B*Dz(all,1) + C;
   // Subdiagonal
   an(all) = A*Dzz(all,0) + B*Dz(all,0);
   // Superdiagonal
   cn(all) = A*Dzz(all,2) + B*Dz(all,2);

   /* BC's */

   /* Bottom -- a0*u+b0*Dz*u */
   bn(0) = a0 + b0*Dz(0,1); // Main diagonal from Dz
   cn(0) = b0*Dz(0,2); // Superdiagonal from Dz
   an(0) = 0;

   /* Top -- a1*u + b1*Dz*u */
   bn(top) = a1 + b1*Dz(top,1);
   an(top) = b1*Dz(top,0); // Subdiagonal from Dz
   cn(top) = 0;

   // Thomas algorithm, via Wikipedia, note base-zero indexing

   cn(0) = cn(0)/bn(0);
   u(0) = f(0)/bn(0); // Use u for d', which will be overwritten by the final sol'n

   for (int i = 1; i < top; i++) {
      cn(i) = cn(i)/(bn(i)-an(i)*cn(i-1));
      u(i) = (f(i) - an(i)*u(i-1))/(bn(i)-an(i)*cn(i-1));
   }

   u(top) = (f(top)-an(top)*u(top-1))/(bn(top)-an(top)*cn(top-1));

   // Back substitution
   for (int i = top-1; i >= 0; i--) {
      u(i) = u(i) - cn(i)*u(i+1);
   }

   timing_pop(); // line_solve
}
#else
Christopher Subich's avatar
Christopher Subich committed
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
void line_solve(Array<double,1> & u, Array<double,1> & f,
      Array<double,1> & A, Array<double,1> & B,
      Array<double,1> & C, double a0, double a1, 
      double b0, double b1, Array<double,2> & Dz,
      Array<double,2> & Dzz) {
   /* The objective here is to build the operator as a (tridiagonal) banded
      matrix for LAPACK, and to solve that using dgbsv (general band solver)
      to get u from the specified f.  We will be rebuilding the operator
      at each call (but not reallocating the band matrix) because we're
      likely to see this called multiple times with different parameters
      (especially C).  For reference, the problem is:
         A(z)*Dzz*u+B(z)*Dz*u + C(z)*u = f(z), with
         a0*u+b0*Dz*u = f0 @ z=0 (bottom), and
         a1*u+b1*Dz*u = f1 @ z=1 (top)
   */
   /* Allocate the operator as static, since it's large-ish */
//   cout << "Solving: A: " << A << "B: " << B << "C: " << C << "f: " << f;
//   fprintf(stdout,"BCs: %f %f, %f %f\n",a0,a0,b0,b1);
611
   timing_push("line_solve");
Christopher Subich's avatar
Christopher Subich committed
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
   blitz::firstIndex ii; blitz::secondIndex jj;
   static Array<double,2> band_matrix(4,u.extent(firstDim),blitz::columnMajorArray);
   band_matrix = 0;
   int IPIV[u.extent(firstDim)]; // Integer pivots for solve
   if (band_matrix.extent(secondDim) != u.extent(firstDim)) {
      // If our size has somehow changed, resize the band matrix to suit
      band_matrix.resize(4,u.extent(firstDim)); 
   }
   /* Set the band matrix based on our input parameters */
   // Main diagonal
   int top = band_matrix.ubound(secondDim);
   band_matrix(2,Range::all()) = A(ii)*Dzz(Range::all(),1) + 
      B(ii)*Dz(Range::all(),1) + C(ii);
//   // Subdiagonal
   band_matrix(3,Range(0,top-1)) = A(Range(1,top))*Dzz(Range(1,top),0) +
      B(ii)*Dz(Range(1,top),0);
//   // Superdiagonal
   band_matrix(1,Range(1,top)) = A(Range(0,top-1))*Dzz(Range(0,top-1),2) +
      B(ii)*Dz(Range(0,top-1),2);

   /* BC's */
//   band_matrix(Range::all(),0) = 0;
//   band_matrix(Range::all(),top) = 0;

   /* Bottom -- a0*u+b0*Dz*u */
   band_matrix(2,0) = a0 + b0*Dz(0,1); // Main diagonal from Dz
   band_matrix(1,1) = b0*Dz(0,2); // superdiagonal from Dz
   band_matrix(2,top) = a1 + b1*Dz(top,1);
   band_matrix(3,top-1) = b1*Dz(top,0); // Subdiagonal
   
   /* Parameters for LAPACK call */
   int N=f.extent(firstDim), KL=1, KU=1, NRHS=1, LDAB=4, LDB=N, INFO=-1;
   // dgbsv overwrites f with u, so copy
   u = f;
   // LAPACK call
//   cout << "Band matrix: " << band_matrix;
648
   timing_push("line_solve_lapack");
Christopher Subich's avatar
Christopher Subich committed
649
650
651
652
653
654
655
656
   dgbsv_(&N, &KL, &KU, &NRHS, band_matrix.data(), &LDAB, IPIV, u.data(), &LDB, &INFO);
//   cout << "Returning u:" << u << "Info " << INFO << endl;
   if (INFO != 0) {
      cerr << "Bad line solve, no cookie! (" << INFO << ")\n";
      cerr << A << B << C;
      cerr << band_matrix;
      cerr << f;
   }
657
   timing_pop();
Christopher Subich's avatar
Christopher Subich committed
658
   assert(INFO == 0);
659
   timing_pop();
Christopher Subich's avatar
Christopher Subich committed
660
}
661
#endif
Christopher Subich's avatar
Christopher Subich committed
662
663
664
665
666

/* Copy coefficients to the local object, and coarsen for the subproblem */
void MG_Solver::problem_setup(Array<double,2> & Uxx, Array<double,2> & Uzz,
      Array<double,2> & Uxz, Array<double,2> & Ux, Array<double,2> & Uz) {
   coarse_numeric_ok = false;
667
668
669
670
   rebalance_array(Uxx,uxx,my_comm,FFine); // uxx/etc are sized as per the local, internal grid;
   rebalance_array(Uzz,uzz,my_comm,FFine); // from an array-size point of view this looks
   rebalance_array(Ux,ux,my_comm,FFine);   // equivalent to the balancing of input-f
   rebalance_array(Uz,uz,my_comm,FFine);
Christopher Subich's avatar
Christopher Subich committed
671
//   SYNC(cerr << Uxz; cerr.flush());
672
   rebalance_array(Uxz,uxz,my_comm,FFine);
Christopher Subich's avatar
Christopher Subich committed
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
//   SYNC(cerr << uxz; cerr.flush());
//   MPI_Finalize(); exit(1);
   /* Do coarsening */
   Array<double,2> CUxx, CUzz, CUxz, CUx, CUz;
//   int csize = coarse_x_ubound-coarse_x_lbound+1;
   int csize = coarse_u.extent(firstDim);
//   fprintf(stdout,"Coarse size %d\n",csize);
//   MPI_Barrier(my_comm);
//   if (csize <= 0) return;
   if (coarsest_level) {
      coarse_numeric_ok = false;
      if (any(Uxz!=0)) {
         if (!any_dxz) coarse_symbolic_ok = false;
         any_dxz = true;
      } else {
         if (any_dxz) coarse_symbolic_ok = false;
         any_dxz = false;
      }
      return;
   }
   TinyVector<int,2> cbase(coarse_x_lbound,0);
   CUxx.resize(csize,size_z); CUxx.reindexSelf(coarse_u.lbound());
   CUzz.resize(csize,size_z); CUzz.reindexSelf(coarse_u.lbound());
   CUxz.resize(csize,size_z); CUxz.reindexSelf(coarse_u.lbound());
   CUx.resize(csize,size_z); CUx.reindexSelf(coarse_u.lbound());
   CUz.resize(csize,size_z); CUz.reindexSelf(coarse_u.lbound());
//   fprintf(stdout,"Coarsening Uxx, size %dx%d\n",Uxx.extent(firstDim),Uxx.extent(secondDim));
   coarsen_grid(uxx,true); 
//   fprintf(stdout,"Rebalancing coarsened Uxx\n");
702
703
704
705
706
   rebalance_array(local_coarse_2d,CUxx,my_comm,FCoarse);
   coarsen_grid(uxz,true); rebalance_array(local_coarse_2d,CUxz,my_comm,FCoarse);
   coarsen_grid(uzz,true); rebalance_array(local_coarse_2d,CUzz,my_comm,FCoarse);
   coarsen_grid(ux,true); rebalance_array(local_coarse_2d,CUx,my_comm,FCoarse);
   coarsen_grid(uz,true); rebalance_array(local_coarse_2d,CUz,my_comm,FCoarse);
Christopher Subich's avatar
Christopher Subich committed
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
   if (coarse_solver)
      coarse_solver->problem_setup(CUxx,CUzz,CUxz,CUx,CUz);
}

void MG_Solver::helmholtz_setup(double h) {
   coarse_numeric_ok = false;
   helm_parameter = h;
   if (coarsest_level)
      check_bc_consistency();
   if (coarse_solver)
      coarse_solver -> helmholtz_setup(h);
}

void MG_Solver::bc_setup(int dim, Array<double,1> u_min, Array<double,1> uz_min,
      Array<double,1> ux_min, Array<double,1> u_max, Array<double,1> uz_max,
      Array<double,1> ux_max) {
   /* Copy BC data to the local object, and punt down the chain for a
      cosarse problem */
   coarse_numeric_ok = false;
//   MPI_Finalize(); exit(1);
   if (dim == 0) {
      // x -- no balancing needed
      u_left = u_min; u_right = u_max;
      ux_left = ux_min; ux_right = ux_max;
      uz_left = uz_min; uz_right = uz_max;
      if (coarsest_level) {
         check_bc_consistency();
      }
      if (coarse_solver)
         coarse_solver->bc_setup(dim,u_min,uz_min,ux_min,u_max,uz_max,ux_max);
   } else if (dim == 1) {
      // z -- balance
      Array<double,2> bc_incoming(Range(u_min.lbound(firstDim),u_min.ubound(firstDim)),Range(0,5));
      Array<double,2> bc_here(Range(local_x_lbound,local_x_ubound),Range(0,5));
      bc_here = 0;
      bc_incoming(Range::all(),0) = u_min;
      bc_incoming(Range::all(),1) = u_max;
      bc_incoming(Range::all(),2) = uz_min;
      bc_incoming(Range::all(),3) = uz_max;
      bc_incoming(Range::all(),4) = ux_min;
      bc_incoming(Range::all(),5) = ux_max;
748
      rebalance_array(bc_incoming,bc_here,my_comm,FFine);
Christopher Subich's avatar
Christopher Subich committed
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
      u_bot = bc_here(Range::all(),0);
      u_top = bc_here(Range::all(),1);
      uz_bot = bc_here(Range::all(),2);
      uz_top = bc_here(Range::all(),3);
      ux_bot = bc_here(Range::all(),4);
      ux_top = bc_here(Range::all(),5);
      // Coarsen

      Array<double,1> cu_min, cuz_min, cux_min, cu_max, cux_max, cuz_max;
      int c_size = coarse_x_ubound-coarse_x_lbound+1;
//      if (c_size <= 0) return; // coarsest grid
      if (coarsest_level) {
//         fprintf(stdout,"Coarsest level, so not recursing\n");
         check_bc_consistency();
         return;
      }
      TinyVector<int,1> c_base(coarse_x_lbound);
//      fprintf(stdout,"Line BC, size %d base %d\n",c_size,c_base(0));
      cu_min.resize(c_size); cu_min.reindexSelf(c_base);
      cuz_min.resize(c_size); cuz_min.reindexSelf(c_base);
      cux_min.resize(c_size); cux_min.reindexSelf(c_base);
      cu_max.resize(c_size); cu_max.reindexSelf(c_base);
      cux_max.resize(c_size); cux_max.reindexSelf(c_base);
      cuz_max.resize(c_size); cuz_max.reindexSelf(c_base);
773
774
775
776
777
778
      coarsen_line(u_bot); rebalance_line(local_coarse_1d,cu_min,my_comm,FCoarse);
      coarsen_line(u_top); rebalance_line(local_coarse_1d,cu_max,my_comm,FCoarse);
      coarsen_line(ux_bot); rebalance_line(local_coarse_1d,cux_min,my_comm,FCoarse);
      coarsen_line(ux_top); rebalance_line(local_coarse_1d,cux_max,my_comm,FCoarse);
      coarsen_line(uz_bot); rebalance_line(local_coarse_1d,cuz_min,my_comm,FCoarse);
      coarsen_line(uz_top); rebalance_line(local_coarse_1d,cuz_max,my_comm,FCoarse);
Christopher Subich's avatar
Christopher Subich committed
779
780
781
782
783
784
785
786
787
788
789
790
      if (coarse_solver) 
         coarse_solver->bc_setup(dim,cu_min,cuz_min,cux_min,cu_max,cuz_max,cux_max);
         
   }
}
   
   

void MG_Solver::apply_operator(Array<double,2> & u, Array<double,2> & f) {
   /* Assuming that the x-array is properly load balanced for our communicator,
      apply the operator A*x = f */
   /* Local arrays for left and right x lines, for parallel communication */
791
   timing_push("apply_operator");
Christopher Subich's avatar
Christopher Subich committed
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
   Array<double,1> uin_left(size_z) ,  uin_right(size_z) ;
   uin_left = -9e9; uin_right = -9e9;

   /* Get rank and size of MPI Communicator */

   if (nproc > 1 || symmetry_type == SYM_PERIODIC) {
      /* We'll have to communicate with neighbours */
      int lefty = myrank-1;
      int righty = myrank+1;
      MPI_Status ignoreme;
      /* If we're the leftmost processor and we're not a periodic problem,
         then we don't need to communicate with anybody for our left point */
      if (myrank == 0 && symmetry_type != SYM_PERIODIC) {
         lefty = MPI_PROC_NULL;
      } else if (myrank == 0) {
         lefty = nproc - 1;
      }
      /* Same for rightmost processor and right point */
      if (myrank == nproc - 1 && symmetry_type != SYM_PERIODIC) {
         righty = MPI_PROC_NULL;
      } else if (myrank == nproc - 1){
         righty = 0;
      }
      /* Now, send left points, and receive right points */
//      fprintf(stderr,"%d sending left (%d, %d)\n",myrank,lefty,righty);
817
      timing_push("apply_op_mpi");
Christopher Subich's avatar
Christopher Subich committed
818
819
820
821
822
823
      MPI_Sendrecv(&u(u.lbound(firstDim),0),size_z,MPI_DOUBLE,lefty,0,
            uin_right.data(),size_z,MPI_DOUBLE,righty,0,my_comm,&ignoreme);
      /* And vice versa -- send right, receive left */
//      fprintf(stderr,"%d sending right (%d, %d)\n",myrank,righty,lefty);
      MPI_Sendrecv(&u(u.ubound(firstDim),0),size_z,MPI_DOUBLE,righty,0,
            uin_left.data(),size_z,MPI_DOUBLE,lefty,0,my_comm,&ignoreme);
824
      timing_pop();
Christopher Subich's avatar
Christopher Subich committed
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
//      fprintf(stderr,"%d received:\n");
//      fprintf(stderr,"%d done\n",myrank);
   }
   /* Fake left/right boundaries for even/odd symmetry */
   if (myrank == 0 && symmetry_type == SYM_EVEN) {
      uin_left = u(local_x_lbound,Range::all());
   } else if (myrank == 0 && symmetry_type == SYM_ODD) {
      uin_left = -u(local_x_lbound,Range::all());
   }
   /* Right */
   if (myrank == nproc - 1 && symmetry_type == SYM_EVEN) {
      uin_right = u(local_x_ubound,Range::all());
   } else if (myrank == nproc - 1 && symmetry_type == SYM_ODD) {
      uin_right = -u(local_x_ubound,Range::all());
   }
   /* By this point, we either have boundary data in uin_left/uin_right or
      we don't need it on account of being the boundary */
   /* We'll deal with the boundary points separately, since they need
      special tender love and care.  However, the interior is relatively
      easy */

   blitz::firstIndex ii; blitz::secondIndex jj;
   for (int i = local_x_lbound+1; i <= local_x_ubound-1; i++) {
      for (int j = 1; j <= size_z-2; j++) {
         f(i,j) = 
            u(i,j)*(helm_parameter + 
                  ux(i,j)*Dx(i,1) + uz(i,j)*Dz(j,1) +
                  uxx(i,j)*Dxx(i,1) + uzz(i,j)*Dzz(j,1) +
                  uxz(i,j)*Dx(i,1)*Dz(j,1)) +
            u(i-1,j)*(
                  ux(i,j)*Dx(i,0) + uxx(i,j)*Dxx(i,0) +
                  uxz(i,j)*Dx(i,0)*Dz(j,1)) +
            u(i+1,j)*(
                  ux(i,j)*Dx(i,2) + uxx(i,j)*Dxx(i,2) +
                  uxz(i,j)*Dx(i,2)*Dz(j,1)) +
            u(i,j-1)*(
                  uz(i,j)*Dz(j,0) + uzz(i,j)*Dzz(j,0) +
                  uxz(i,j)*Dx(i,1)*Dz(j,0)) +
            u(i,j+1)*(
                  uz(i,j)*Dz(j,2) + uzz(i,j)*Dzz(j,2) +
                  uxz(i,j)*Dx(i,1)*Dz(j,2)) +
            uxz(i,j)*(u(i-1,j-1)*Dx(i,0)*Dz(j,0) +
                  u(i+1,j-1)*Dx(i,2)*Dz(j,0) +
                  u(i-1,j+1)*Dx(i,0)*Dz(j,2) +
                  u(i+1,j+1)*Dx(i,2)*Dz(j,2));
      }
   }
   /* Top and bottom BCs */
   for (int i = local_x_lbound+1; i <= local_x_ubound - 1; i++) {
      int j = 0;
      f(i,j) = u(i,j)*(u_bot(i) + ux_bot(i)*Dx(i,1) + uz_bot(i)*Dz(j,1)) +
            u(i-1,j)*ux_bot(i)*Dx(i,0) + u(i+1,j)*ux_bot(i)*Dx(i,2) +
            u(i,j+1)*uz_bot(i)*Dz(j,2);
//      fprintf(stderr,"Bottom %d\n %g = %g*(%g + %g*%g + %g*%g) + %g*%g*%g + %g*%g*%g + %g*%g*%g\n",i,
//            f(i,j),u(i,j),u_bot(i),ux_bot(i),Dx(i,1),uz_bot(i),Dz(j,1),
//            u(i-1,j),ux_bot(i),Dx(i,0),u(i+1,j),ux_bot(i),Dx(i,2),
//            u(i,j+1),uz_bot(i),Dz(j,2));
//      fprintf(stderr,"Bottom %d %gL %gH %gR %gU = %g\n    (%g %g %g)\n",
//            i,u(i-1,j),u(i,j),u(i+1,j),u(i,j+1),f(i,j),
//            u_bot(i),ux_bot(i),uz_bot(i));;
//      fprintf(stderr,"(%d,%d) stencil:\n",i,j);
//      fprintf(stderr,"%g u, %g ux, %g uz\n",u_bot(i),ux_bot(i),uz_bot(i));
//      fprintf(stderr,"%g left %g here %g right %g up\n",u(i-1,j),u(i,j),u(i+1,j),u(i,j+1));
//      fprintf(stderr,"[%g %g %g]x [-- %g %g]z\n",Dx(i,0),Dx(i,1),Dx(i,2),Dz(j,1),Dz(j,2));
//      MPI_Finalize(); exit(1);
      j=size_z-1;
      f(i,j) = u(i,j)*(u_top(i) + ux_top(i)*Dx(i,1) + uz_top(i)*Dz(j,1)) +
            u(i-1,j)*ux_top(i)*Dx(i,0) + u(i+1,j)*ux_top(i)*Dx(i,2) +
            u(i,j-1)*uz_top(i)*Dz(j,0);
   }
   /* Now, handle non-boundary interiors */
   /* Left boundary */
   if (symmetry_type != SYM_NONE || (myrank != 0)) {
      int i = local_x_lbound;
      for (int j = 1; j <= size_z-2; j++) {
         f(i,j) = 
            u(i,j)*(helm_parameter + 
                  ux(i,j)*Dx(i,1) + uz(i,j)*Dz(j,1) +
                  uxx(i,j)*Dxx(i,1) + uzz(i,j)*Dzz(j,1) +
                  uxz(i,j)*Dx(i,1)*Dz(j,1)) +
            uin_left(j)*(
                  ux(i,j)*Dx(i,0) + uxx(i,j)*Dxx(i,0) +
                  uxz(i,j)*Dx(i,0)*Dz(j,1)) +
            u(i+1,j)*(
                  ux(i,j)*Dx(i,2) + uxx(i,j)*Dxx(i,2) +
                  uxz(i,j)*Dx(i,2)*Dz(j,1)) +
            u(i,j-1)*(
                  uz(i,j)*Dz(j,0) + uzz(i,j)*Dzz(j,0) +
                  uxz(i,j)*Dx(i,1)*Dz(j,0)) +
            u(i,j+1)*(
                  uz(i,j)*Dz(j,2) + uzz(i,j)*Dzz(j,2) +
                  uxz(i,j)*Dx(i,1)*Dz(j,2)) +
            uxz(i,j)*(uin_left(j-1)*Dx(i,0)*Dz(j,0) +
                  u(i+1,j-1)*Dx(i,2)*Dz(j,0) +
                  uin_left(j+1)*Dx(i,0)*Dz(j,2) +
                  u(i+1,j+1)*Dx(i,2)*Dz(j,2));
      }
      int j = 0;
      f(i,j) = u(i,j)*(u_bot(i) + ux_bot(i)*Dx(i,1) + uz_bot(i)*Dz(j,1)) +
            uin_left(j)*ux_bot(i)*Dx(i,0) + u(i+1,j)*ux_bot(i)*Dx(i,2) +
            u(i,j+1)*uz_bot(i)*Dz(j,2);
//      fprintf(stderr,"Bottom %d %gL %gH %gR %gU = %g\n",i,uin_left(j),u(i,j),u(i+1,j),u(i,j+1),f(i,j));
      j=size_z-1;
      f(i,j) = u(i,j)*(u_top(i) + ux_top(i)*Dx(i,1) + uz_top(i)*Dz(j,1)) +
            uin_left(j)*ux_top(i)*Dx(i,0) + u(i+1,j)*ux_top(i)*Dx(i,2) +
            u(i,j-1)*uz_top(i)*Dz(j,0);
   }
   /* Right boundary */
   if (symmetry_type != SYM_NONE || (myrank != nproc-1)) {
      int i = local_x_ubound;
      for (int j = 1; j <= size_z-2; j++) {
         f(i,j) = 
            u(i,j)*(helm_parameter + 
                  ux(i,j)*Dx(i,1) + uz(i,j)*Dz(j,1) +
                  uxx(i,j)*Dxx(i,1) + uzz(i,j)*Dzz(j,1) +
                  uxz(i,j)*Dx(i,1)*Dz(j,1)) +
            u(i-1,j)*(
                  ux(i,j)*Dx(i,0) + uxx(i,j)*Dxx(i,0) +
                  uxz(i,j)*Dx(i,0)*Dz(j,1)) +
            uin_right(j)*(
                  ux(i,j)*Dx(i,2) + uxx(i,j)*Dxx(i,2) +
                  uxz(i,j)*Dx(i,2)*Dz(j,1)) +
            u(i,j-1)*(
                  uz(i,j)*Dz(j,0) + uzz(i,j)*Dzz(j,0) +
                  uxz(i,j)*Dx(i,1)*Dz(j,0)) +
            u(i,j+1)*(
                  uz(i,j)*Dz(j,2) + uzz(i,j)*Dzz(j,2) +
                  uxz(i,j)*Dx(i,1)*Dz(j,2)) +
            uxz(i,j)*(u(i-1,j-1)*Dx(i,0)*Dz(j,0) +
                  uin_right(j-1)*Dx(i,2)*Dz(j,0) +
                  u(i-1,j+1)*Dx(i,0)*Dz(j,2) +
                  uin_right(j+1)*Dx(i,2)*Dz(j,2));
//         fprintf(stderr,"%d: %f %f %f\n",j,u(i-1,j),u(i,j),uin_right(j));
//         fprintf(stderr,"   [%f %f %f]=%f\n",uxx(i,j)*Dxx(i,0),uxx(i,j)*Dxx(i,1),uxx(i,j)*Dxx(i,2),f(i,j));
      }
      int j = 0;
      f(i,j) = u(i,j)*(u_bot(i) + ux_bot(i)*Dx(i,1) + uz_bot(i)*Dz(j,1)) +
            u(i-1,j)*ux_bot(i)*Dx(i,0) + uin_right(j)*ux_bot(i)*Dx(i,2) +
            u(i,j+1)*uz_bot(i)*Dz(j,2);
//      fprintf(stderr,"Bottom %d %gL %gH %gR %gU = %g\n",i,u(i-1,j),u(i,j),uin_right(j),u(i,j+1),f(i,j));
      j=size_z-1;
      f(i,j) = u(i,j)*(u_top(i) + ux_top(i)*Dx(i,1) + uz_top(i)*Dz(j,1)) +
            u(i-1,j)*ux_top(i)*Dx(i,0) + uin_right(j)*ux_top(i)*Dx(i,2) +
            u(i,j-1)*uz_top(i)*Dz(j,0);
   }

   /* Left and right BC's */
   if (symmetry_type == SYM_NONE) {
      if (local_x_lbound == 0) {
         int i = 0;
         for (int j = 1; j <= size_z-2; j++) {
            f(i,j) = u(i,j)*(u_left(j) + ux_left(j)*Dx(i,1) + uz_left(j)*Dz(j,1)) +
               u(i,j-1)*uz_left(j)*Dz(j,0) + u(i,j+1)*uz_left(j)*Dz(j,2) +
               u(i+1,j)*ux_left(j)*Dx(i,2);
         }
         int j=0;
         /* The corner points are special.  With a finite difference operator
            and typical "normal" boundary conditions (no tangential derivative),
            the corner points don't actually enter into the calculation anywhere.
            However, if this FD operator arises from a transformed grid, the corner
            points will affect things through the Jacobian, which will result in
            tangential derivatives.

            So, we have a choice -- we can consider the corner points to belong
            to the top/bottom or left/right.  Since we're stereotypically going
            to have solid boundaries at top/bottom, that's where we'll take
            it -- that way a viscous problem has u=0 at the corners */
         f(i,j) = u(i,j)*(u_bot(i) + ux_bot(i)*Dx(i,1) + uz_bot(i)*Dz(j,1)) +
            u(i,j+1)*uz_bot(i)*Dz(j,2) + u(i+1,j)*ux_bot(i)*Dx(i,2);
         j=size_z-1;
         f(i,j) = u(i,j)*(u_top(i) + ux_top(i)*Dx(i,1) + uz_top(i)*Dz(j,1)) +
            u(i,j-1)*uz_top(i)*Dz(j,0) + u(i+1,j)*ux_top(i)*Dx(i,2);

      } 
      if (local_x_ubound == size_x-1) {
         int i = size_x-1;