Commit 3581d1c1 authored by Christopher Subich's avatar Christopher Subich
Browse files

Simplified tridiangonal solve algorithm; timing extensions

First and most trivially, this change adds a few more timing push/pop
invocations relating to multigrid and especially to some of the MPI
synchronization steps that might cause processors to wait on each other.

More substantively, this change also adds a new tridiangonal solver
based on the bog-standard forwards/backwards elimination Thompson
algorithm (see Wikipedia).  This should be acceptable because the 1D
problems being solved themselves come from a 2D problem, so we don't
expect ill-conditioning; calling out to the GMRES banded solver was
surprisingly a computational bottleneck perhaps because of pivoting.

This change seems to decrease the line-solve time by about 80%, which in
turn decreases the overall runtime (tank_rho test case) by 40%.
parent 03e8f292
...@@ -347,8 +347,8 @@ class mapiw : public BaseCase { ...@@ -347,8 +347,8 @@ class mapiw : public BaseCase {
t_startup = clock_time - real_start_time; t_startup = clock_time - real_start_time;
fprintf(stdout,"Start-up time: %g s.\n",t_startup); fprintf(stdout,"Start-up time: %g s.\n",t_startup);
} }
if (master() && itercount == 1) fprintf(stdout,"[Iter], (Clock time), Sim time:, Max U, Max V, Max W, Max KE, Total KE, Max Density\n"); if (master() && itercount == 1) fprintf(stderr,"[Iter], (Clock time), Sim time:, Max U, Max V, Max W, Max KE, Total KE, Max Density\n");
if (master()) fprintf(stdout,"[%d] (%.12g) %.6f: %.6g %.6g %.6g %.6g %.6g %.6g\n", if (master()) fprintf(stderr,"[%d] (%.12g) %.6f: %.6g %.6g %.6g %.6g %.6g %.6g\n",
itercount,t_step,time,max_u,max_v,max_w,max_ke,ke_tot,max_rho); itercount,t_step,time,max_u,max_v,max_w,max_ke,ke_tot,max_rho);
// Write out the chain if savechain is true // Write out the chain if savechain is true
...@@ -581,7 +581,24 @@ int main(int argc, char ** argv) { ...@@ -581,7 +581,24 @@ int main(int argc, char ** argv) {
double now = MPI_Wtime(); double now = MPI_Wtime();
if (master()) fprintf(stderr,"Total runtime complete in %gs\n",now-start_time); if (master()) fprintf(stderr,"Total runtime complete in %gs\n",now-start_time);
if (master()) timing_stack_report(); {
int numprocs, myrank;
MPI_Comm_size(MPI_COMM_WORLD,&numprocs);
MPI_Comm_rank(MPI_COMM_WORLD,&myrank);
for (int i = 0; i < numprocs; i++) {
if (myrank == i) {
MPI_Barrier(MPI_COMM_WORLD);
fprintf(stderr,"Processor %d timing report\n",myrank);
timing_stack_report();
MPI_Barrier(MPI_COMM_WORLD);
} else {
MPI_Barrier(MPI_COMM_WORLD);
MPI_Barrier(MPI_COMM_WORLD);
}
}
}
//if (master()) timing_stack_report();
MPI_Finalize(); // Cleanly exit MPI MPI_Finalize(); // Cleanly exit MPI
return 0; // End the program return 0; // End the program
} }
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
#include "Parformer.hpp" #include "Parformer.hpp"
#include "gmres_2d_solver.hpp" #include "gmres_2d_solver.hpp"
#include "Par_util.hpp" #include "Par_util.hpp"
#include "timing.hpp"
// private interface for gmres-solving class // private interface for gmres-solving class
#include "gmres_2d_solver_impl.hpp" #include "gmres_2d_solver_impl.hpp"
...@@ -823,7 +824,9 @@ void Cheb_2dmg::precondition(fbox * & rhs, ubox * & soln) { ...@@ -823,7 +824,9 @@ void Cheb_2dmg::precondition(fbox * & rhs, ubox * & soln) {
// SYNC(cerr << *r2d;cerr.flush()); // SYNC(cerr << *r2d;cerr.flush());
// fprintf(stderr,"Mean condition: %g\n",rhs->mean); // fprintf(stderr,"Mean condition: %g\n",rhs->mean);
//fprintf(stderr,"Precond: max rhs %g\n",max(abs(*(rhs->gridbox)))); //fprintf(stderr,"Precond: max rhs %g\n",max(abs(*(rhs->gridbox))));
timing_push("mg_cycle");
mg_precond->cycle(CYCLE_F,*r2d,*s2d, rhs->mean,soln->sigma,0,1,2); mg_precond->cycle(CYCLE_F,*r2d,*s2d, rhs->mean,soln->sigma,0,1,2);
timing_pop();
//mg_precond->cycle(CYCLE_V,*r2d,*s2d, rhs->mean,soln->sigma,1,0,0); //mg_precond->cycle(CYCLE_V,*r2d,*s2d, rhs->mean,soln->sigma,1,0,0);
//fprintf(stderr,"Precond: max soln %g\n",max(abs(*s2d))); //fprintf(stderr,"Precond: max soln %g\n",max(abs(*s2d)));
......
...@@ -157,6 +157,7 @@ void MG_Solver::rebalance_line(Array<double,1> & orig, Array<double,1> & balance ...@@ -157,6 +157,7 @@ void MG_Solver::rebalance_line(Array<double,1> & orig, Array<double,1> & balance
if (balance.extent(firstDim) > 0) balance = b_2d(Range::all(),0); if (balance.extent(firstDim) > 0) balance = b_2d(Range::all(),0);
} }
void MG_Solver::rebalance_array(Array<double,2> & orig, Array<double,2> & balance, MPI_Comm c, BalanceGroup btype) { void MG_Solver::rebalance_array(Array<double,2> & orig, Array<double,2> & balance, MPI_Comm c, BalanceGroup btype) {
timing_push("rebalance");
int o_lbound = orig.lbound(firstDim); int o_lbound = orig.lbound(firstDim);
int o_ubound = orig.ubound(firstDim); int o_ubound = orig.ubound(firstDim);
int b_lbound = balance.lbound(firstDim); int b_lbound = balance.lbound(firstDim);
...@@ -250,8 +251,11 @@ void MG_Solver::rebalance_array(Array<double,2> & orig, Array<double,2> & balanc ...@@ -250,8 +251,11 @@ void MG_Solver::rebalance_array(Array<double,2> & orig, Array<double,2> & balanc
assert(orig.size() == s_total && balance.size() == r_total); assert(orig.size() == s_total && balance.size() == r_total);
timing_push("rebalance_mpi");
MPI_Alltoallv(orig_data,s_counts,s_displs,MPI_DOUBLE, MPI_Alltoallv(orig_data,s_counts,s_displs,MPI_DOUBLE,
balance.data(),r_counts,r_displs,MPI_DOUBLE,c); balance.data(),r_counts,r_displs,MPI_DOUBLE,c);
timing_pop(); // rebalance_mpi
timing_pop(); // rebalance
// And done // And done
} }
void MG_Solver::set_x_symmetry(SYM_TYPE s) { void MG_Solver::set_x_symmetry(SYM_TYPE s) {
...@@ -523,6 +527,69 @@ MG_Solver::MG_Solver(Array<double,1> xvals, blitz::Array<double,1> zvals, ...@@ -523,6 +527,69 @@ MG_Solver::MG_Solver(Array<double,1> xvals, blitz::Array<double,1> zvals,
} }
/* Line solve */ /* Line solve */
#if 1
void line_solve(Array<double,1> & u, Array<double,1> & f,
Array<double,1> & A, Array<double,1> & B,
Array<double,1> & C, double a0, double a1,
double b0, double b1, Array<double,2> & Dz,
Array<double,2> & Dzz) {
/* The objective here is to solve the line operator as a tridiangonal banded
matrix. While LAPACK has solvers for this, timing suggests that the basic
banded solver is much slower than expected. Problems that arise in SPINS
should not be difficult ones that require partial pivoting. The problem
to solve is:
A(z)*Dzz*u + B(z)*dz*u + C(z)*u = f(z), with
a0*u + b0*Dz*u = f0 @ z=0 (bottom) and
a1*u + b1*Dz*u = f1 @ z=1 (top) */
timing_push("line_solve");
// Build the band matrix itself, re-using the LAPACK formulation
blitz::firstIndex ii;
blitz::Range all = Range::all();
static Array<double,1> an(u.extent(firstDim)), bn(u.extent(firstDim)), cn(u.extent(firstDim));
int top = u.ubound(firstDim);
an = 0; bn = 0; cn = 0;
/* Set the band matrix based on our input parameters */
// Main diagonal
bn(all) = A*Dzz(all,1) + B*Dz(all,1) + C;
// Subdiagonal
an(all) = A*Dzz(all,0) + B*Dz(all,0);
// Superdiagonal
cn(all) = A*Dzz(all,2) + B*Dz(all,2);
/* BC's */
/* Bottom -- a0*u+b0*Dz*u */
bn(0) = a0 + b0*Dz(0,1); // Main diagonal from Dz
cn(0) = b0*Dz(0,2); // Superdiagonal from Dz
an(0) = 0;
/* Top -- a1*u + b1*Dz*u */
bn(top) = a1 + b1*Dz(top,1);
an(top) = b1*Dz(top,0); // Subdiagonal from Dz
cn(top) = 0;
// Thomas algorithm, via Wikipedia, note base-zero indexing
cn(0) = cn(0)/bn(0);
u(0) = f(0)/bn(0); // Use u for d', which will be overwritten by the final sol'n
for (int i = 1; i < top; i++) {
cn(i) = cn(i)/(bn(i)-an(i)*cn(i-1));
u(i) = (f(i) - an(i)*u(i-1))/(bn(i)-an(i)*cn(i-1));
}
u(top) = (f(top)-an(top)*u(top-1))/(bn(top)-an(top)*cn(top-1));
// Back substitution
for (int i = top-1; i >= 0; i--) {
u(i) = u(i) - cn(i)*u(i+1);
}
timing_pop(); // line_solve
}
#else
void line_solve(Array<double,1> & u, Array<double,1> & f, void line_solve(Array<double,1> & u, Array<double,1> & f,
Array<double,1> & A, Array<double,1> & B, Array<double,1> & A, Array<double,1> & B,
Array<double,1> & C, double a0, double a1, Array<double,1> & C, double a0, double a1,
...@@ -591,6 +658,7 @@ void line_solve(Array<double,1> & u, Array<double,1> & f, ...@@ -591,6 +658,7 @@ void line_solve(Array<double,1> & u, Array<double,1> & f,
assert(INFO == 0); assert(INFO == 0);
timing_pop(); timing_pop();
} }
#endif
/* Copy coefficients to the local object, and coarsen for the subproblem */ /* Copy coefficients to the local object, and coarsen for the subproblem */
void MG_Solver::problem_setup(Array<double,2> & Uxx, Array<double,2> & Uzz, void MG_Solver::problem_setup(Array<double,2> & Uxx, Array<double,2> & Uzz,
...@@ -1804,6 +1872,7 @@ void MG_Solver::_cycle(CYCLE_TYPE cycle, Array<double,2> & f, Array<double,2> & ...@@ -1804,6 +1872,7 @@ void MG_Solver::_cycle(CYCLE_TYPE cycle, Array<double,2> & f, Array<double,2> &
Array<double,2> defect(f.lbound(),f.extent()), correction(u.lbound(),u.extent()); Array<double,2> defect(f.lbound(),f.extent()), correction(u.lbound(),u.extent());
//fprintf(stderr,"_cycle: mean condition %g\n",extra_in); //fprintf(stderr,"_cycle: mean condition %g\n",extra_in);
// if (master()) fprintf(stderr,"Call to _cycle[%d]\n",size_x); // if (master()) fprintf(stderr,"Call to _cycle[%d]\n",size_x);
//timing_push("_cycle");
if (coarsest_level) { if (coarsest_level) {
// Coarse-grid solve // Coarse-grid solve
if (myrank != 0) return; if (myrank != 0) return;
...@@ -1949,13 +2018,17 @@ void MG_Solver::_cycle(CYCLE_TYPE cycle, Array<double,2> & f, Array<double,2> & ...@@ -1949,13 +2018,17 @@ void MG_Solver::_cycle(CYCLE_TYPE cycle, Array<double,2> & f, Array<double,2> &
// SYNC(cerr << coarse_f); // SYNC(cerr << coarse_f);
if (coarse_solver) { if (coarse_solver) {
// if (master()) fprintf(stderr,"Recursive(V) solve\n"); // if (master()) fprintf(stderr,"Recursive(V) solve\n");
//timing_pop(); // pop _cycle to avoid recusive nesting
coarse_solver->_cycle(CYCLE_V,coarse_f,coarse_u, coarse_solver->_cycle(CYCLE_V,coarse_f,coarse_u,
coarse_extra_in,coarse_extra_out,pre,mid,post); coarse_extra_in,coarse_extra_out,pre,mid,post);
//timing_push("_cycle");
// if (master()) fprintf(stderr,"Recursive(V) solve finished\n"); // if (master()) fprintf(stderr,"Recursive(V) solve finished\n");
} else { } else {
//fprintf(stderr,"No coarse solve\n"); //fprintf(stderr,"No coarse solve\n");
} }
timing_push("cycle_sync");
MPI_Bcast(&coarse_extra_out,1,MPI_DOUBLE,0,my_comm); MPI_Bcast(&coarse_extra_out,1,MPI_DOUBLE,0,my_comm);
timing_pop();
// if (master()) fprintf(stderr,"V[%d] local coarse correction\n",size_x); // if (master()) fprintf(stderr,"V[%d] local coarse correction\n",size_x);
// SYNC(cerr << coarse_u); // SYNC(cerr << coarse_u);
rebalance_array(coarse_u,local_coarse_2d,my_comm,UCoarse); rebalance_array(coarse_u,local_coarse_2d,my_comm,UCoarse);
...@@ -1980,13 +2053,17 @@ void MG_Solver::_cycle(CYCLE_TYPE cycle, Array<double,2> & f, Array<double,2> & ...@@ -1980,13 +2053,17 @@ void MG_Solver::_cycle(CYCLE_TYPE cycle, Array<double,2> & f, Array<double,2> &
rebalance_array(local_coarse_2d,coarse_f,my_comm,FCoarse); rebalance_array(local_coarse_2d,coarse_f,my_comm,FCoarse);
if (coarse_solver) { if (coarse_solver) {
// if (master()) fprintf(stderr,"F[%d] Recursive(F) solve begin\n",size_x); // if (master()) fprintf(stderr,"F[%d] Recursive(F) solve begin\n",size_x);
//timing_pop(); // pop _cycle
coarse_solver->_cycle(CYCLE_F,coarse_f,coarse_u, coarse_solver->_cycle(CYCLE_F,coarse_f,coarse_u,
extra_in,coarse_extra_out,pre,mid,post); extra_in,coarse_extra_out,pre,mid,post);
//timing_push("_cycle");
// if (master()) fprintf(stderr,"F[%d] Recursive(F) solve end\n",size_x); // if (master()) fprintf(stderr,"F[%d] Recursive(F) solve end\n",size_x);
} else { } else {
// fprintf(stderr,"No coarse solve\n"); // fprintf(stderr,"No coarse solve\n");
} }
timing_push("cycle_sync");
MPI_Bcast(&coarse_extra_out,1,MPI_DOUBLE,0,my_comm); MPI_Bcast(&coarse_extra_out,1,MPI_DOUBLE,0,my_comm);
timing_pop();
// if (master()) { // if (master()) {
// fprintf(stderr,"coarseu: %d - %d\n",coarse_u.lbound(firstDim),coarse_u.ubound(firstDim)); // fprintf(stderr,"coarseu: %d - %d\n",coarse_u.lbound(firstDim),coarse_u.ubound(firstDim));
// } // }
...@@ -2021,12 +2098,16 @@ void MG_Solver::_cycle(CYCLE_TYPE cycle, Array<double,2> & f, Array<double,2> & ...@@ -2021,12 +2098,16 @@ void MG_Solver::_cycle(CYCLE_TYPE cycle, Array<double,2> & f, Array<double,2> &
rebalance_array(local_coarse_2d,coarse_f,my_comm,FCoarse); rebalance_array(local_coarse_2d,coarse_f,my_comm,FCoarse);
if (coarse_solver) { if (coarse_solver) {
// if (master()) fprintf(stderr,"F[%d] Recursive(V) solve begin\n",size_x); // if (master()) fprintf(stderr,"F[%d] Recursive(V) solve begin\n",size_x);
//timing_pop(); // _cycle
coarse_solver->_cycle(CYCLE_V,coarse_f,coarse_u, coarse_solver->_cycle(CYCLE_V,coarse_f,coarse_u,
0,coarse_extra_out,pre,mid,post); 0,coarse_extra_out,pre,mid,post);
//timing_push("_cycle");
// if (master()) fprintf(stderr,"F[%d] Recursive(V) solve end\n",size_x); // if (master()) fprintf(stderr,"F[%d] Recursive(V) solve end\n",size_x);
} else { } else {
} }
timing_push("cycle_sync");
MPI_Bcast(&coarse_extra_out,1,MPI_DOUBLE,0,my_comm); MPI_Bcast(&coarse_extra_out,1,MPI_DOUBLE,0,my_comm);
timing_pop();
rebalance_array(coarse_u,local_coarse_2d,my_comm,UCoarse); rebalance_array(coarse_u,local_coarse_2d,my_comm,UCoarse);
interpolate_grid(correction); interpolate_grid(correction);
{ {
...@@ -2059,6 +2140,7 @@ void MG_Solver::_cycle(CYCLE_TYPE cycle, Array<double,2> & f, Array<double,2> & ...@@ -2059,6 +2140,7 @@ void MG_Solver::_cycle(CYCLE_TYPE cycle, Array<double,2> & f, Array<double,2> &
// SYNC(cerr << myrank << ": " << u(u.ubound(firstDim),0) << endl;cerr.flush();) // SYNC(cerr << myrank << ": " << u(u.ubound(firstDim),0) << endl;cerr.flush();)
// Et voila, return // Et voila, return
//timing_pop(); // _cycle
return; return;
} }
...@@ -2068,6 +2150,7 @@ void MG_Solver::build_sparse_operator() { ...@@ -2068,6 +2150,7 @@ void MG_Solver::build_sparse_operator() {
// than having to deal with "influences by a grid point" one at a time. // than having to deal with "influences by a grid point" one at a time.
#define cell_index(a,b) (((a)*size_z)+(b)) #define cell_index(a,b) (((a)*size_z)+(b))
timing_push("coarse_factor");
assert(nproc == 1); assert(nproc == 1);
// fprintf(stderr,"Sparse building: (%dx%d) %d %d %d\n",size_x,size_z,int(any_dxz),int(bc_tangent),int(bc_normal)); // fprintf(stderr,"Sparse building: (%dx%d) %d %d %d\n",size_x,size_z,int(any_dxz),int(bc_tangent),int(bc_normal));
// double now = MPI_Wtime(); // double now = MPI_Wtime();
...@@ -2953,6 +3036,7 @@ void MG_Solver::build_sparse_operator() { ...@@ -2953,6 +3036,7 @@ void MG_Solver::build_sparse_operator() {
} }
// double later = MPI_Wtime(); // double later = MPI_Wtime();
// fprintf(stderr,"Coarse operator construction took %g sec\n",later-now); // fprintf(stderr,"Coarse operator construction took %g sec\n",later-now);
timing_pop(); // coarse_factor
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment