Commit 05c9bd53 authored by Christopher Subich's avatar Christopher Subich
Browse files

Merge branch 'instrument'

Conflict in Makefile, fixed with formatting corrections
parents ae19aedf d68f6d2b
...@@ -14,6 +14,9 @@ OPTIM?=true ...@@ -14,6 +14,9 @@ OPTIM?=true
# longer for compilation # longer for compilation
SLOW_OPTIM?=false SLOW_OPTIM?=false
# Compile with instrumentation support for timing
TIMINGS?=false
# If MPICXX isn't separately defined, then set it to be the same # If MPICXX isn't separately defined, then set it to be the same
# as CXX # as CXX
ifeq ($(strip $(MPICXX)),) ifeq ($(strip $(MPICXX)),)
...@@ -36,6 +39,11 @@ ifeq ($(OPTIM),true) ...@@ -36,6 +39,11 @@ ifeq ($(OPTIM),true)
endif endif
endif endif
ifeq ($(TIMINGS),true)
CFLAGS:=$(CFLAGS) -DTIMING_ENABLE
endif
INCLUDE_DIRS := -I../include $(MPI_INCDIR) $(LAPACK_INCDIR) $(BLITZ_INCDIR) $(FFTW_INCDIR) $(UMF_INCDIR) INCLUDE_DIRS := -I../include $(MPI_INCDIR) $(LAPACK_INCDIR) $(BLITZ_INCDIR) $(FFTW_INCDIR) $(UMF_INCDIR)
CFLAGS := $(CFLAGS) $(INCLUDE_DIRS) CFLAGS := $(CFLAGS) $(INCLUDE_DIRS)
...@@ -70,11 +78,11 @@ derek_x: derek.o TArray.o T_util.o Parformer.o ESolver.o Timestep.o Splits.o Par ...@@ -70,11 +78,11 @@ derek_x: derek.o TArray.o T_util.o Parformer.o ESolver.o Timestep.o Splits.o Par
$(LD) $(LDFLAGS) -o $@ $^ $(LDLIBS) $(LD) $(LDFLAGS) -o $@ $^ $(LDLIBS)
cases/%.x: cases/%.o cases/%.src.o TArray.o T_util.o Parformer.o ESolver.o Timestep.o NSIntegrator.o BaseCase.o \ cases/%.x: cases/%.o cases/%.src.o TArray.o T_util.o Parformer.o ESolver.o Timestep.o NSIntegrator.o BaseCase.o \
Science.o Splits.o Par_util.o Split_reader.o gmres.o gmres_1d_solver.o gmres_2d_solver.o grad.o multigrid.o Options.o Sorter.o Science.o Splits.o Par_util.o Split_reader.o gmres.o gmres_1d_solver.o gmres_2d_solver.o grad.o multigrid.o Options.o timing.o
$(LD) $(LDFLAGS) -o $@ $^ $(LDLIBS) $(LD) $(LDFLAGS) -o $@ $^ $(LDLIBS)
cases/%_x: cases/%.o cases/%.src.o TArray.o T_util.o Parformer.o ESolver.o Timestep.o NSIntegrator.o BaseCase.o \ cases/%_x: cases/%.o cases/%.src.o TArray.o T_util.o Parformer.o ESolver.o Timestep.o NSIntegrator.o BaseCase.o \
Science.o Splits.o Par_util.o Split_reader.o gmres.o gmres_1d_solver.o gmres_2d_solver.o grad.o multigrid.o Options.o Sorter.o Science.o Splits.o Par_util.o Split_reader.o gmres.o gmres_1d_solver.o gmres_2d_solver.o grad.o multigrid.o Options.o timing.o
$(LD) $(LDFLAGS) -o $@ $^ $(LDLIBS) $(LD) $(LDFLAGS) -o $@ $^ $(LDLIBS)
tests/test%.src.c: tests/test%.cpp CaseFileSource.c tests/test%.src.c: tests/test%.cpp CaseFileSource.c
...@@ -90,4 +98,6 @@ cases/%.src.c: cases/%.cpp CaseFileSource.c ...@@ -90,4 +98,6 @@ cases/%.src.c: cases/%.cpp CaseFileSource.c
%.o: %.cpp *.hpp %.o: %.cpp *.hpp
$(MPICXX) $(CFLAGS) -o $@ -c $< $(MPICXX) $(CFLAGS) -o $@ -c $<
print-% : ; @echo $* = $($*)
.SECONDARY: .SECONDARY:
...@@ -6,6 +6,7 @@ ...@@ -6,6 +6,7 @@
#include <stdio.h> #include <stdio.h>
#include <iostream> #include <iostream>
#include <complex> #include <complex>
#include "timing.hpp"
using namespace std; using namespace std;
using namespace blitz; using namespace blitz;
...@@ -182,8 +183,10 @@ void Transposer<T>::transpose(Array<T,3> & source, Array<T,3> & dest) { ...@@ -182,8 +183,10 @@ void Transposer<T>::transpose(Array<T,3> & source, Array<T,3> & dest) {
// Pointers to indirect the source and dest arrays, if we need to use // Pointers to indirect the source and dest arrays, if we need to use
// temporaries // temporaries
/* First, if we only have one processor, just copy */ /* First, if we only have one processor, just copy */
timing_push("transpose");
if (num_proc == 1) { if (num_proc == 1) {
dest = source; dest = source;
timing_pop();
return; return;
} }
Array<T,3> * source_pointer, * dest_pointer; Array<T,3> * source_pointer, * dest_pointer;
...@@ -237,6 +240,7 @@ void Transposer<T>::transpose(Array<T,3> & source, Array<T,3> & dest) { ...@@ -237,6 +240,7 @@ void Transposer<T>::transpose(Array<T,3> & source, Array<T,3> & dest) {
if (&dstarray != &dest) { if (&dstarray != &dest) {
dest = dstarray; dest = dstarray;
} }
timing_pop();
} }
template <class T> template <class T>
void Transposer<T>::back_transpose(Array<T,3> & r_source, Array<T,3> & r_dest) { void Transposer<T>::back_transpose(Array<T,3> & r_source, Array<T,3> & r_dest) {
...@@ -248,8 +252,10 @@ void Transposer<T>::back_transpose(Array<T,3> & r_source, Array<T,3> & r_dest) { ...@@ -248,8 +252,10 @@ void Transposer<T>::back_transpose(Array<T,3> & r_source, Array<T,3> & r_dest) {
rever_transpose, with r_source (reverse source) and r_dest (reverse destination) */ rever_transpose, with r_source (reverse source) and r_dest (reverse destination) */
/* First, if we only have one processor, just copy */ /* First, if we only have one processor, just copy */
timing_push("transpose");
if (num_proc == 1) { if (num_proc == 1) {
r_dest = r_source; r_dest = r_source;
timing_pop();
return; return;
} }
// Pointers to indirect the source and dest arrays, if we need to use // Pointers to indirect the source and dest arrays, if we need to use
...@@ -311,6 +317,7 @@ void Transposer<T>::back_transpose(Array<T,3> & r_source, Array<T,3> & r_dest) { ...@@ -311,6 +317,7 @@ void Transposer<T>::back_transpose(Array<T,3> & r_source, Array<T,3> & r_dest) {
if (&dstarray != &r_dest) { if (&dstarray != &r_dest) {
r_dest = dstarray; r_dest = dstarray;
} }
timing_pop();
} }
template<class T> template<class T>
......
...@@ -12,6 +12,9 @@ ...@@ -12,6 +12,9 @@
#include <random/normal.h> #include <random/normal.h>
#include <vector> #include <vector>
#include "../Science.hpp" #include "../Science.hpp"
#include "../multigrid.hpp"
#include "../grad.hpp"
#include "../gmres.hpp"
using namespace std; using namespace std;
using namespace TArrayn; using namespace TArrayn;
...@@ -310,6 +313,14 @@ int main(int argc, char ** argv) { ...@@ -310,6 +313,14 @@ int main(int argc, char ** argv) {
ppois.do_run(0.001); ppois.do_run(0.001);
now = MPI_Wtime(); now = MPI_Wtime();
if (master()) fprintf(stderr,"Total runtime (%d iterations) complete in %gs (%gs per)\n",mycode.itercount,now-start_time,(now-start_time)/mycode.itercount); if (master()) fprintf(stderr,"Total runtime (%d iterations) complete in %gs (%gs per)\n",mycode.itercount,now-start_time,(now-start_time)/mycode.itercount);
if (master()) fprintf(stderr," %d invocations of coarse solve, for %gs (%gs per)\n",
coarse_solve_count,coarse_solve_time,coarse_solve_time/coarse_solve_count);
if (master()) fprintf(stderr," %d red black smoothings, for %gs (%gs per)\n",
redblack_count, redblack_time, redblack_time/redblack_count);
if (master()) fprintf(stderr," %d FD operator applications, for %gs (%gs per)\n",
apply_op_count, apply_op_time, apply_op_time/apply_op_count);
if (master()) fprintf(stderr," %gs spent in spectral derivatives\n",deriv_time);
if (master()) fprintf(stderr," %gs spent in gmres-lapack internals\n",gmres_lapack_time);
MPI_Finalize(); MPI_Finalize();
return 0; return 0;
} }
......
This diff is collapsed.
restart = false
Lx = 1.0
Ly = 1.0
Lz = 1.0
Nx = 1024
Ny = 1
Nz = 1024
rho_0 = 1.0
delta_rho = 0.01
pyc_asym = 0.0
pyc_sep_perc = 0.0
h_perc =0.1
h_mix_perc = 0.1
delta_x = 0.01
Lmix = 0.1
Hmix = 0.1
hill_height = 0.1
hill_centre = 0.5
hill_width = 0.1
visco = 0.0001
diffu_rho = 0.00001
plot_interval = 1.0
final_time = 0.5
perturb = 0.0001
...@@ -4,6 +4,8 @@ ...@@ -4,6 +4,8 @@
#include <blitz/array.h> #include <blitz/array.h>
#include <stdio.h> #include <stdio.h>
#include <iostream> #include <iostream>
#include <mpi.h>
#include "timing.hpp"
//#include <mkl_lapack.h> //#include <mkl_lapack.h>
/* LAPACK function prototype */ /* LAPACK function prototype */
extern "C" { extern "C" {
...@@ -13,6 +15,7 @@ extern "C" { ...@@ -13,6 +15,7 @@ extern "C" {
int * LDB, double * S, double * RCOND, int * RANK, double * WORK, int * LDB, double * S, double * RCOND, int * RANK, double * WORK,
int * LWORK, int * IWORK, int * INFO); int * LWORK, int * IWORK, int * INFO);
} }
/* gmres.hpp -- header class for gmres template class, which abstracts out the matrix-vector /* gmres.hpp -- header class for gmres template class, which abstracts out the matrix-vector
multiply, preconditioning, and dot products of GMRES to a user-specified type conforming multiply, preconditioning, and dot products of GMRES to a user-specified type conforming
to a provided interface. to a provided interface.
...@@ -133,13 +136,18 @@ template <class Controller> class GMRES_Solver { ...@@ -133,13 +136,18 @@ template <class Controller> class GMRES_Solver {
int innerits = 0; // count number of inner iterations int innerits = 0; // count number of inner iterations
for (int i = 0; i < outer; i++) { for (int i = 0; i < outer; i++) {
/* Call the inner iteration */ /* Call the inner iteration */
timing_push("gmres_outer");
timing_push("gmres_inner");
innerits += gmres_inner(rnew, xinner, inner, prec*start_norm); innerits += gmres_inner(rnew, xinner, inner, prec*start_norm);
timing_pop();
if (i != 0) { if (i != 0) {
ops->basis_fma(x,xinner,1); // Add the result to x ops->basis_fma(x,xinner,1); // Add the result to x
} else { } else {
ops->basis_copy(x,xinner); ops->basis_copy(x,xinner);
} }
timing_push("gmres_op");
ops->matrix_multiply(x,rapply); // M*x = Rapply ops->matrix_multiply(x,rapply); // M*x = Rapply
timing_pop();
ops->resid_copy(rnew,b); /* Rnew = b */ ops->resid_copy(rnew,b); /* Rnew = b */
ops->resid_fma(rnew,rapply,-1); /* Rnew = b - A*x */ ops->resid_fma(rnew,rapply,-1); /* Rnew = b - A*x */
error_norm = sqrt(ops->resid_dot(rnew,rnew)); error_norm = sqrt(ops->resid_dot(rnew,rnew));
...@@ -155,8 +163,10 @@ template <class Controller> class GMRES_Solver { ...@@ -155,8 +163,10 @@ template <class Controller> class GMRES_Solver {
if (err_out) { if (err_out) {
*err_out = error_norm/start_norm; *err_out = error_norm/start_norm;
} }
timing_pop();
return(innerits); return(innerits);
} }
timing_pop();
} }
/* Free temporaries */ /* Free temporaries */
ops->free_resid(rnew); ops->free_resid(rnew);
...@@ -274,11 +284,16 @@ template <class Controller> class GMRES_Solver { ...@@ -274,11 +284,16 @@ template <class Controller> class GMRES_Solver {
double remaining_error = 1; double remaining_error = 1;
for (my_it = 1; my_it <= num_its; my_it++) { for (my_it = 1; my_it <= num_its; my_it++) {
/* Find an x vector that approximately solves the prior basis */ /* Find an x vector that approximately solves the prior basis */
timing_push("gmres_precond");
ops->precondition(resid_vec(my_it-1), basis_vec(my_it-1)); ops->precondition(resid_vec(my_it-1), basis_vec(my_it-1));
timing_pop();
timing_push("gmres_matmul");
/* Apply the operator the current basis vector */ /* Apply the operator the current basis vector */
ops->matrix_multiply(basis_vec(my_it-1),resid_vec(my_it)); ops->matrix_multiply(basis_vec(my_it-1),resid_vec(my_it));
timing_pop();
// cout << resid_vec[my_it]; // cout << resid_vec[my_it];
/* Grahm-Schmidt orthogonalization */ /* Grahm-Schmidt orthogonalization */
timing_push("gmres_dot");
for (int k = 1; k <= my_it; k++) { for (int k = 1; k <= my_it; k++) {
double dot = ops->resid_dot(resid_vec(k-1),resid_vec(my_it)); double dot = ops->resid_dot(resid_vec(k-1),resid_vec(my_it));
//cout << "**" << dot << "**\n"; //cout << "**" << dot << "**\n";
...@@ -295,8 +310,10 @@ template <class Controller> class GMRES_Solver { ...@@ -295,8 +310,10 @@ template <class Controller> class GMRES_Solver {
ops->resid_scale(resid_vec(my_it),1/norm); ops->resid_scale(resid_vec(my_it),1/norm);
//cout << resid_vec[my_it]; //cout << resid_vec[my_it];
hess(my_it+1,my_it) = norm; hess(my_it+1,my_it) = norm;
timing_pop();
/* Now, solve the least-squares problem with LAPACK */ /* Now, solve the least-squares problem with LAPACK */
timing_push("gmres_hess");
{ {
/* LAPACK overwrites the matrix, so copy */ /* LAPACK overwrites the matrix, so copy */
// std::cerr.precision(8); // std::cerr.precision(8);
...@@ -322,6 +339,7 @@ template <class Controller> class GMRES_Solver { ...@@ -322,6 +339,7 @@ template <class Controller> class GMRES_Solver {
// std::cerr << rhs_vec; // std::cerr << rhs_vec;
remaining_error = fabs(rhs_vec(my_it+1)); remaining_error = fabs(rhs_vec(my_it+1));
} }
timing_pop();
// if (abs(rhs_vec(my_it)) < 1e-14*max(abs(rhs_vec))) { // if (abs(rhs_vec(my_it)) < 1e-14*max(abs(rhs_vec))) {
/* If the last rhs vector is much smaller than the maximum, /* If the last rhs vector is much smaller than the maximum,
we've either stagnated with convergence or we're running into we've either stagnated with convergence or we're running into
......
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
#include "Parformer.hpp" #include "Parformer.hpp"
#include "gmres_2d_solver.hpp" #include "gmres_2d_solver.hpp"
#include "Par_util.hpp" #include "Par_util.hpp"
#include "timing.hpp"
// private interface for gmres-solving class // private interface for gmres-solving class
#include "gmres_2d_solver_impl.hpp" #include "gmres_2d_solver_impl.hpp"
...@@ -823,7 +824,9 @@ void Cheb_2dmg::precondition(fbox * & rhs, ubox * & soln) { ...@@ -823,7 +824,9 @@ void Cheb_2dmg::precondition(fbox * & rhs, ubox * & soln) {
// SYNC(cerr << *r2d;cerr.flush()); // SYNC(cerr << *r2d;cerr.flush());
// fprintf(stderr,"Mean condition: %g\n",rhs->mean); // fprintf(stderr,"Mean condition: %g\n",rhs->mean);
//fprintf(stderr,"Precond: max rhs %g\n",max(abs(*(rhs->gridbox)))); //fprintf(stderr,"Precond: max rhs %g\n",max(abs(*(rhs->gridbox))));
timing_push("mg_cycle");
mg_precond->cycle(CYCLE_F,*r2d,*s2d, rhs->mean,soln->sigma,0,1,2); mg_precond->cycle(CYCLE_F,*r2d,*s2d, rhs->mean,soln->sigma,0,1,2);
timing_pop();
//mg_precond->cycle(CYCLE_V,*r2d,*s2d, rhs->mean,soln->sigma,1,0,0); //mg_precond->cycle(CYCLE_V,*r2d,*s2d, rhs->mean,soln->sigma,1,0,0);
//fprintf(stderr,"Precond: max soln %g\n",max(abs(*s2d))); //fprintf(stderr,"Precond: max soln %g\n",max(abs(*s2d)));
......
...@@ -4,6 +4,7 @@ ...@@ -4,6 +4,7 @@
#include "Par_util.hpp" #include "Par_util.hpp"
#include "T_util.hpp" #include "T_util.hpp"
#include <stdio.h> #include <stdio.h>
#include "timing.hpp"
using namespace Transformer; using namespace Transformer;
namespace TArrayn { namespace TArrayn {
...@@ -135,6 +136,7 @@ namespace TArrayn { ...@@ -135,6 +136,7 @@ namespace TArrayn {
/* Private function to save boilerplate of calculating d/d[abc] with the /* Private function to save boilerplate of calculating d/d[abc] with the
right kind of transform */ right kind of transform */
void Grad::calculate_d(S_EXP type, Trans1D * tform, blitz::Array<double,3> * restrict dest) { void Grad::calculate_d(S_EXP type, Trans1D * tform, blitz::Array<double,3> * restrict dest) {
timing_push("grad_deriv");
switch(type) { switch(type) {
case COSINE: case COSINE:
deriv_dct(*my_array,*tform,*dest); deriv_dct(*my_array,*tform,*dest);
...@@ -152,6 +154,7 @@ namespace TArrayn { ...@@ -152,6 +154,7 @@ namespace TArrayn {
fprintf(stderr,"Invalid expansion type (%d) in grad!\n",type); fprintf(stderr,"Invalid expansion type (%d) in grad!\n",type);
abort(); abort();
} }
timing_pop();
} }
void Grad::do_diff(DTArray * restrict dx_out, Dimension inDim) { void Grad::do_diff(DTArray * restrict dx_out, Dimension inDim) {
......
...@@ -14,6 +14,8 @@ ...@@ -14,6 +14,8 @@
#include "Parformer.hpp" #include "Parformer.hpp"
#include <blitz/array.h> #include <blitz/array.h>
extern double deriv_time;
namespace TArrayn { namespace TArrayn {
/* Include in the TArrayn namespace, since this is used much like /* Include in the TArrayn namespace, since this is used much like
deriv_cheb / deriv_fft / etc */ deriv_cheb / deriv_fft / etc */
......
...@@ -6,10 +6,12 @@ ...@@ -6,10 +6,12 @@
#include <blitz/tinyvec-et.h> #include <blitz/tinyvec-et.h>
#include "umfpack.h" #include "umfpack.h"
#include "timing.hpp"
using namespace blitz; using namespace blitz;
using namespace std; using namespace std;
//#define COARSE_GRID_SIZE 512 //#define COARSE_GRID_SIZE 512
#define COARSE_GRID_SIZE 8 #define COARSE_GRID_SIZE 8
#define SYNC(__x__) { for (int _i = 0; _i < nproc; _i++) { \ #define SYNC(__x__) { for (int _i = 0; _i < nproc; _i++) { \
...@@ -155,6 +157,7 @@ void MG_Solver::rebalance_line(Array<double,1> & orig, Array<double,1> & balance ...@@ -155,6 +157,7 @@ void MG_Solver::rebalance_line(Array<double,1> & orig, Array<double,1> & balance
if (balance.extent(firstDim) > 0) balance = b_2d(Range::all(),0); if (balance.extent(firstDim) > 0) balance = b_2d(Range::all(),0);
} }
void MG_Solver::rebalance_array(Array<double,2> & orig, Array<double,2> & balance, MPI_Comm c, BalanceGroup btype) { void MG_Solver::rebalance_array(Array<double,2> & orig, Array<double,2> & balance, MPI_Comm c, BalanceGroup btype) {
timing_push("rebalance");
int o_lbound = orig.lbound(firstDim); int o_lbound = orig.lbound(firstDim);
int o_ubound = orig.ubound(firstDim); int o_ubound = orig.ubound(firstDim);
int b_lbound = balance.lbound(firstDim); int b_lbound = balance.lbound(firstDim);
...@@ -248,8 +251,11 @@ void MG_Solver::rebalance_array(Array<double,2> & orig, Array<double,2> & balanc ...@@ -248,8 +251,11 @@ void MG_Solver::rebalance_array(Array<double,2> & orig, Array<double,2> & balanc
assert(orig.size() == s_total && balance.size() == r_total); assert(orig.size() == s_total && balance.size() == r_total);
timing_push("rebalance_mpi");
MPI_Alltoallv(orig_data,s_counts,s_displs,MPI_DOUBLE, MPI_Alltoallv(orig_data,s_counts,s_displs,MPI_DOUBLE,
balance.data(),r_counts,r_displs,MPI_DOUBLE,c); balance.data(),r_counts,r_displs,MPI_DOUBLE,c);
timing_pop(); // rebalance_mpi
timing_pop(); // rebalance
// And done // And done
} }
void MG_Solver::set_x_symmetry(SYM_TYPE s) { void MG_Solver::set_x_symmetry(SYM_TYPE s) {
...@@ -521,6 +527,69 @@ MG_Solver::MG_Solver(Array<double,1> xvals, blitz::Array<double,1> zvals, ...@@ -521,6 +527,69 @@ MG_Solver::MG_Solver(Array<double,1> xvals, blitz::Array<double,1> zvals,
} }
/* Line solve */ /* Line solve */
#if 1
void line_solve(Array<double,1> & u, Array<double,1> & f,
Array<double,1> & A, Array<double,1> & B,
Array<double,1> & C, double a0, double a1,
double b0, double b1, Array<double,2> & Dz,
Array<double,2> & Dzz) {
/* The objective here is to solve the line operator as a tridiangonal banded
matrix. While LAPACK has solvers for this, timing suggests that the basic
banded solver is much slower than expected. Problems that arise in SPINS
should not be difficult ones that require partial pivoting. The problem
to solve is:
A(z)*Dzz*u + B(z)*dz*u + C(z)*u = f(z), with
a0*u + b0*Dz*u = f0 @ z=0 (bottom) and
a1*u + b1*Dz*u = f1 @ z=1 (top) */
timing_push("line_solve");
// Build the band matrix itself, re-using the LAPACK formulation
blitz::firstIndex ii;
blitz::Range all = Range::all();
static Array<double,1> an(u.extent(firstDim)), bn(u.extent(firstDim)), cn(u.extent(firstDim));
int top = u.ubound(firstDim);
an = 0; bn = 0; cn = 0;
/* Set the band matrix based on our input parameters */
// Main diagonal
bn(all) = A*Dzz(all,1) + B*Dz(all,1) + C;
// Subdiagonal
an(all) = A*Dzz(all,0) + B*Dz(all,0);
// Superdiagonal
cn(all) = A*Dzz(all,2) + B*Dz(all,2);
/* BC's */
/* Bottom -- a0*u+b0*Dz*u */
bn(0) = a0 + b0*Dz(0,1); // Main diagonal from Dz
cn(0) = b0*Dz(0,2); // Superdiagonal from Dz
an(0) = 0;
/* Top -- a1*u + b1*Dz*u */
bn(top) = a1 + b1*Dz(top,1);
an(top) = b1*Dz(top,0); // Subdiagonal from Dz
cn(top) = 0;
// Thomas algorithm, via Wikipedia, note base-zero indexing
cn(0) = cn(0)/bn(0);
u(0) = f(0)/bn(0); // Use u for d', which will be overwritten by the final sol'n
for (int i = 1; i < top; i++) {
cn(i) = cn(i)/(bn(i)-an(i)*cn(i-1));
u(i) = (f(i) - an(i)*u(i-1))/(bn(i)-an(i)*cn(i-1));
}
u(top) = (f(top)-an(top)*u(top-1))/(bn(top)-an(top)*cn(top-1));
// Back substitution
for (int i = top-1; i >= 0; i--) {
u(i) = u(i) - cn(i)*u(i+1);
}
timing_pop(); // line_solve
}
#else
void line_solve(Array<double,1> & u, Array<double,1> & f, void line_solve(Array<double,1> & u, Array<double,1> & f,
Array<double,1> & A, Array<double,1> & B, Array<double,1> & A, Array<double,1> & B,
Array<double,1> & C, double a0, double a1, Array<double,1> & C, double a0, double a1,
...@@ -539,6 +608,7 @@ void line_solve(Array<double,1> & u, Array<double,1> & f, ...@@ -539,6 +608,7 @@ void line_solve(Array<double,1> & u, Array<double,1> & f,
/* Allocate the operator as static, since it's large-ish */ /* Allocate the operator as static, since it's large-ish */
// cout << "Solving: A: " << A << "B: " << B << "C: " << C << "f: " << f; // cout << "Solving: A: " << A << "B: " << B << "C: " << C << "f: " << f;
// fprintf(stdout,"BCs: %f %f, %f %f\n",a0,a0,b0,b1); // fprintf(stdout,"BCs: %f %f, %f %f\n",a0,a0,b0,b1);
timing_push("line_solve");
blitz::firstIndex ii; blitz::secondIndex jj; blitz::firstIndex ii; blitz::secondIndex jj;
static Array<double,2> band_matrix(4,u.extent(firstDim),blitz::columnMajorArray); static Array<double,2> band_matrix(4,u.extent(firstDim),blitz::columnMajorArray);
band_matrix = 0; band_matrix = 0;
...@@ -575,6 +645,7 @@ void line_solve(Array<double,1> & u, Array<double,1> & f, ...@@ -575,6 +645,7 @@ void line_solve(Array<double,1> & u, Array<double,1> & f,
u = f; u = f;
// LAPACK call // LAPACK call
// cout << "Band matrix: " << band_matrix; // cout << "Band matrix: " << band_matrix;
timing_push("line_solve_lapack");
dgbsv_(&N, &KL, &KU, &NRHS, band_matrix.data(), &LDAB, IPIV, u.data(), &LDB, &INFO); dgbsv_(&N, &KL, &KU, &NRHS, band_matrix.data(), &LDAB, IPIV, u.data(), &LDB, &INFO);
// cout << "Returning u:" << u << "Info " << INFO << endl; // cout << "Returning u:" << u << "Info " << INFO << endl;
if (INFO != 0) { if (INFO != 0) {
...@@ -583,8 +654,11 @@ void line_solve(Array<double,1> & u, Array<double,1> & f, ...@@ -583,8 +654,11 @@ void line_solve(Array<double,1> & u, Array<double,1> & f,
cerr << band_matrix; cerr << band_matrix;
cerr << f; cerr << f;
} }
timing_pop();
assert(INFO == 0); assert(INFO == 0);
timing_pop();
} }
#endif
/* Copy coefficients to the local object, and coarsen for the subproblem */ /* Copy coefficients to the local object, and coarsen for the subproblem */
void MG_Solver::problem_setup(Array<double,2> & Uxx, Array<double,2> & Uzz, void MG_Solver::problem_setup(Array<double,2> & Uxx, Array<double,2> & Uzz,
...@@ -714,6 +788,7 @@ void MG_Solver::apply_operator(Array<double,2> & u, Array<double,2> & f) { ...@@ -714,6 +788,7 @@ void MG_Solver::apply_operator(Array<double,2> & u, Array<double,2> & f) {
/* Assuming that the x-array is properly load balanced for our communicator, /* Assuming that the x-array is properly load balanced for our communicator,
apply the operator A*x = f */ apply the operator A*x = f */
/* Local arrays for left and right x lines, for parallel communication */ /* Local arrays for left and right x lines, for parallel communication */
timing_push("apply_operator");
Array<double,1> uin_left(size_z) , uin_right(size_z) ; Array<double,1> uin_left(size_z) , uin_right(size_z) ;
uin_left = -9e9; uin_right = -9e9; uin_left = -9e9; uin_right = -9e9;
...@@ -739,12 +814,14 @@ void MG_Solver::apply_operator(Array<double,2> & u, Array<double,2> & f) { ...@@ -739,12 +814,14 @@ void MG_Solver::apply_operator(Array<double,2> & u, Array<double,2> & f) {
} }
/* Now, send left points, and receive right points */ /* Now, send left points, and receive right points */
// fprintf(stderr,"%d sending left (%d, %d)\n",myrank,lefty,righty); // fprintf(stderr,"%d sending left (%d, %d)\n",myrank,lefty,righty);
timing_push("apply_op_mpi");
MPI_Sendrecv(&u(u.lbound(firstDim),0),size_z,MPI_DOUBLE,lefty,0, MPI_Sendrecv(&u(u.lbound(firstDim),0),size_z,MPI_DOUBLE,lefty,0,
uin_right.data(),size_z,MPI_DOUBLE,righty,0,my_comm,&ignoreme); uin_right.data(),size_z,MPI_DOUBLE,righty,0,my_comm,&ignoreme);
/* And vice versa -- send right, receive left */ /* And vice versa -- send right, receive left */
// fprintf(stderr,"%d sending right (%d, %d)\n",myrank,righty,lefty); // fprintf(stderr,"%d sending right (%d, %d)\n",myrank,righty,lefty);
MPI_Sendrecv(&u(u.ubound(firstDim),0),size_z,MPI_DOUBLE,righty,0, MPI_Sendrecv(&u(u.ubound(firstDim),0),size_z,MPI_DOUBLE,righty,0,
uin_left.data(),size_z,MPI_DOUBLE,lefty,0,my_comm,&ignoreme); uin_left.data(),size_z,MPI_DOUBLE,lefty,0,my_comm,&ignoreme);
timing_pop();
// fprintf(stderr,"%d received:\n"); // fprintf(stderr,"%d received:\n");
// fprintf(stderr,"%d done\n",myrank); // fprintf(stderr,"%d done\n",myrank);
} }
...@@ -935,6 +1012,7 @@ void MG_Solver::apply_operator(Array<double,2> & u, Array<double,2> & f) { ...@@ -935,6 +1012,7 @@ void MG_Solver::apply_operator(Array<double,2> & u, Array<double,2> & f) {
u(i,j-1)*uz_top(i)*Dz(j,0) + u(i-1,j)*ux_top(i)*Dx(i,0); u(i,j-1)*uz_top(i)</