Commit 0a4781c9 authored by Christopher Subich's avatar Christopher Subich
Browse files

Stack-structured timing code

This commit adds stack-structured timing code, such that bits of SPINS
can be instrumented with

>  timing_push("short name")
>  [... work goes here]
>  timing_pop()

This is recursive, such that already-timed portions of the code can call
the timing code to further instrument subroutines or discrete code
segments, and the resulting structure follows the -dynamic- call stack.
This does have the odd result that portions of the code that are re-used
(namely spectral derivatives) can show up as several 'leaf' nodes in the
graph.

To print out the report to standard error, call

>  timing_stack_report()

The exact format of the report should be considered tentative.
parent 386838b6
......@@ -57,7 +57,7 @@ NSIntegrator.o: NSIntegrator.cpp NSIntegrator_impl.cc
tests/test%.o: tests/test%.cpp
$(MPICXX) $(CFLAGS) -o $@ -c $<
tests/test%_x: tests/test%.o tests/test%.src.o TArray.o Parformer.o T_util.o ESolver.o Timestep.o NSIntegrator.o BaseCase.o Science.o Splits.o Par_util.o Split_reader.o gmres.o gmres_1d_solver.o gmres_2d_solver.o grad.o multigrid.o Options.o
tests/test%_x: tests/test%.o tests/test%.src.o TArray.o Parformer.o T_util.o ESolver.o Timestep.o NSIntegrator.o BaseCase.o Science.o Splits.o Par_util.o Split_reader.o gmres.o gmres_1d_solver.o gmres_2d_solver.o grad.o multigrid.o Options.o
$(LD) $(LDFLAGS) -o $@ $^ $(LDLIBS)
cases/%.o: cases/%.cpp NSIntegrator_impl.cc NSIntegrator.hpp
......@@ -69,10 +69,10 @@ nonhydro_x: nonhydro_sw.o TArray.o T_util.o Parformer.o Splits.o Par_util.o Spli
derek_x: derek.o TArray.o T_util.o Parformer.o ESolver.o Timestep.o Splits.o Par_util.o Split_reader.o gmres.o gmres_1d_solver.o gmres_2d_solver.o grad.o multigrid.o
$(LD) $(LDFLAGS) -o $@ $^ $(LDLIBS)
cases/%.x: cases/%.o cases/%.src.o TArray.o T_util.o Parformer.o ESolver.o Timestep.o NSIntegrator.o BaseCase.o Science.o Splits.o Par_util.o Split_reader.o gmres.o gmres_1d_solver.o gmres_2d_solver.o grad.o multigrid.o Options.o
cases/%.x: cases/%.o cases/%.src.o TArray.o T_util.o Parformer.o ESolver.o Timestep.o NSIntegrator.o BaseCase.o Science.o Splits.o Par_util.o Split_reader.o gmres.o gmres_1d_solver.o gmres_2d_solver.o grad.o multigrid.o Options.o timing.o
$(LD) $(LDFLAGS) -o $@ $^ $(LDLIBS)
cases/%_x: cases/%.o cases/%.src.o TArray.o T_util.o Parformer.o ESolver.o Timestep.o NSIntegrator.o BaseCase.o Science.o Splits.o Par_util.o Split_reader.o gmres.o gmres_1d_solver.o gmres_2d_solver.o grad.o multigrid.o Options.o
cases/%_x: cases/%.o cases/%.src.o TArray.o T_util.o Parformer.o ESolver.o Timestep.o NSIntegrator.o BaseCase.o Science.o Splits.o Par_util.o Split_reader.o gmres.o gmres_1d_solver.o gmres_2d_solver.o grad.o multigrid.o Options.o timing.o
$(LD) $(LDFLAGS) -o $@ $^ $(LDLIBS)
tests/test%.src.c: tests/test%.cpp CaseFileSource.c
......
......@@ -19,9 +19,7 @@
#include <stdio.h>
#include <iostream>
#include <fstream>
#include "../gmres.hpp"
#include "../multigrid.hpp"
#include "../grad.hpp"
#include "../timing.hpp"
//#include <stdlib.h>
using namespace std;
......@@ -409,6 +407,7 @@ int main(int argc, char ** argv) {
/* Initialize MPI. This is required even for single-processor runs,
since the inner routines assume some degree of parallelization,
even if it is trivial. */
f_order = 4; f_cutoff = 0.8; f_strength = -0.33;
MPI_Init(&argc, &argv);
......@@ -579,16 +578,10 @@ int main(int argc, char ** argv) {
// Run until the end of time
do_mapiw.do_run(final_time);
double now = MPI_Wtime();
if (master()) fprintf(stderr,"Total runtime complete in %gs\n",now-start_time);
if (master()) fprintf(stderr," %d invocations of coarse solve, for %gs (%gs per)\n",
coarse_solve_count,coarse_solve_time,coarse_solve_time/coarse_solve_count);
if (master()) fprintf(stderr," %d red black smoothings, for %gs (%gs per)\n",
redblack_count, redblack_time, redblack_time/redblack_count);
if (master()) fprintf(stderr," %d FD operator applications, for %gs (%gs per)\n",
apply_op_count, apply_op_time, apply_op_time/apply_op_count);
if (master()) fprintf(stderr," %gs spent in spectral derivatives\n",deriv_time);
if (master()) fprintf(stderr," %gs spent in gmres-lapack internals\n",gmres_lapack_time);
if (master()) timing_stack_report();
MPI_Finalize(); // Cleanly exit MPI
return 0; // End the program
}
#include "gmres.hpp"
double gmres_lapack_time = 0;
// Defines whether GMRES should be chatty about output, for debugging purposes
......@@ -5,6 +5,7 @@
#include <stdio.h>
#include <iostream>
#include <mpi.h>
#include "timing.hpp"
//#include <mkl_lapack.h>
/* LAPACK function prototype */
extern "C" {
......@@ -15,7 +16,6 @@ extern "C" {
int * LWORK, int * IWORK, int * INFO);
}
extern double gmres_lapack_time;
/* gmres.hpp -- header class for gmres template class, which abstracts out the matrix-vector
multiply, preconditioning, and dot products of GMRES to a user-specified type conforming
to a provided interface.
......@@ -136,13 +136,18 @@ template <class Controller> class GMRES_Solver {
int innerits = 0; // count number of inner iterations
for (int i = 0; i < outer; i++) {
/* Call the inner iteration */
timing_push("gmres_outer");
timing_push("gmres_inner");
innerits += gmres_inner(rnew, xinner, inner, prec*start_norm);
timing_pop();
if (i != 0) {
ops->basis_fma(x,xinner,1); // Add the result to x
} else {
ops->basis_copy(x,xinner);
}
timing_push("gmres_op");
ops->matrix_multiply(x,rapply); // M*x = Rapply
timing_pop();
ops->resid_copy(rnew,b); /* Rnew = b */
ops->resid_fma(rnew,rapply,-1); /* Rnew = b - A*x */
error_norm = sqrt(ops->resid_dot(rnew,rnew));
......@@ -158,8 +163,10 @@ template <class Controller> class GMRES_Solver {
if (err_out) {
*err_out = error_norm/start_norm;
}
timing_pop();
return(innerits);
}
timing_pop();
}
/* Free temporaries */
ops->free_resid(rnew);
......@@ -277,11 +284,16 @@ template <class Controller> class GMRES_Solver {
double remaining_error = 1;
for (my_it = 1; my_it <= num_its; my_it++) {
/* Find an x vector that approximately solves the prior basis */
timing_push("gmres_precond");
ops->precondition(resid_vec(my_it-1), basis_vec(my_it-1));
timing_pop();
timing_push("gmres_matmul");
/* Apply the operator the current basis vector */
ops->matrix_multiply(basis_vec(my_it-1),resid_vec(my_it));
timing_pop();
// cout << resid_vec[my_it];
/* Gram-Schmidt orthogonalization */
timing_push("gmres_dot");
for (int k = 1; k <= my_it; k++) {
double dot = ops->resid_dot(resid_vec(k-1),resid_vec(my_it));
//cout << "**" << dot << "**\n";
......@@ -298,8 +310,10 @@ template <class Controller> class GMRES_Solver {
ops->resid_scale(resid_vec(my_it),1/norm);
//cout << resid_vec[my_it];
hess(my_it+1,my_it) = norm;
timing_pop();
/* Now, solve the least-squares problem with LAPACK */
timing_push("gmres_hess");
{
/* LAPACK overwrites the matrix, so copy */
// std::cerr.precision(8);
......@@ -316,17 +330,16 @@ template <class Controller> class GMRES_Solver {
// dgels_("N",&M,&N,&NRHS,hess_copy.data(),&LDA,
// rhs_vec.data(), &LDB, lapack_workspace, &lwork_size,
// &INFO);
double now = MPI_Wtime();
dgelsd_(&M,&N,&NRHS,hess_copy.data(),&LDA,rhs_vec.data(),
&LDB, svd_vec.data(), &RCOND, &RANK, lapack_workspace,
&lwork_size,lapack_iworkspace,&INFO);
gmres_lapack_time += (MPI_Wtime() - now);
// fprintf(stderr,"%s:%d RANK %d\n",__FILE__,__LINE__,RANK); fflush(stderr);
/* rhs_vec.data() contains the answer and remainder */
// std::cerr << svd_vec;
// std::cerr << rhs_vec;
remaining_error = fabs(rhs_vec(my_it+1));
}
timing_pop();
// if (abs(rhs_vec(my_it)) < 1e-14*max(abs(rhs_vec))) {
/* If the last rhs vector is much smaller than the maximum,
we've either stagnated with convergence or we're running into
......
......@@ -4,8 +4,7 @@
#include "Par_util.hpp"
#include "T_util.hpp"
#include <stdio.h>
double deriv_time = 0;
#include "timing.hpp"
using namespace Transformer;
namespace TArrayn {
......@@ -137,7 +136,7 @@ namespace TArrayn {
/* Private function to save boilerplate of calculating d/d[abc] with the
right kind of transform */
void Grad::calculate_d(S_EXP type, Trans1D * tform, blitz::Array<double,3> * restrict dest) {
double now = MPI_Wtime();
timing_push("grad_deriv");
switch(type) {
case COSINE:
deriv_dct(*my_array,*tform,*dest);
......@@ -155,7 +154,7 @@ namespace TArrayn {
fprintf(stderr,"Invalid expansion type (%d) in grad!\n",type);
abort();
}
deriv_time += (MPI_Wtime() - now);
timing_pop();
}
void Grad::do_diff(DTArray * restrict dx_out, Dimension inDim) {
......
......@@ -6,19 +6,11 @@
#include <blitz/tinyvec-et.h>
#include "umfpack.h"
#include "timing.hpp"
using namespace blitz;
using namespace std;
double coarse_solve_time = 0;
int coarse_solve_count = 0;
double redblack_time = 0;
int redblack_count = 0;
double apply_op_time = 0;
int apply_op_count = 0;
//#define COARSE_GRID_SIZE 512
#define COARSE_GRID_SIZE 8
......@@ -724,6 +716,7 @@ void MG_Solver::apply_operator(Array<double,2> & u, Array<double,2> & f) {
/* Assuming that the x-array is properly load balanced for our communicator,
apply the operator A*x = f */
/* Local arrays for left and right x lines, for parallel communication */
timing_push("apply_operator");
Array<double,1> uin_left(size_z) , uin_right(size_z) ;
uin_left = -9e9; uin_right = -9e9;
......@@ -945,6 +938,7 @@ void MG_Solver::apply_operator(Array<double,2> & u, Array<double,2> & f) {
u(i,j-1)*uz_top(i)*Dz(j,0) + u(i-1,j)*ux_top(i)*Dx(i,0);
}
}
timing_pop();
}
......@@ -969,6 +963,7 @@ void MG_Solver::do_redblack(blitz::Array<double,2> & f, blitz::Array<double,2> &
an even number of points).
This is low-hanging fruit for simultaneous communication and computation. */
timing_push("redblack");
Array<double,1> ul_right(size_z), // Array for receiving right neighbour
u_coef(size_z), // coefficient for the u term
uz_coef(size_z), // ux_term
......@@ -1296,6 +1291,7 @@ void MG_Solver::do_redblack(blitz::Array<double,2> & f, blitz::Array<double,2> &
// Make sure the send completed
MPI_Wait(&send_req,&ignoreme);
timing_pop();
// fprintf(stdout,"Exiting Redblack %d/%d\n",myrank,nproc);
}
......@@ -1840,13 +1836,12 @@ void MG_Solver::_cycle(CYCLE_TYPE cycle, Array<double,2> & f, Array<double,2> &
u_ptr = u.data();
f_ptr = f.data();
}
double now = MPI_Wtime();
timing_push("coarse solve");
int retval = umfpack_di_solve(sys,A_cols,A_rows,A_double,
u_ptr, f_ptr,
numeric_factor,
Control,Info);
coarse_solve_time += (MPI_Wtime() - now);
coarse_solve_count += 1;
timing_pop();
assert(!retval);
if (indefinite_problem) {
// fprintf(stderr,"CG: True u\n");
......@@ -1881,16 +1876,10 @@ void MG_Solver::_cycle(CYCLE_TYPE cycle, Array<double,2> & f, Array<double,2> &
/* Perform one red-black relaxation pass */
// if (master()) fprintf(stderr,"Pre-cyle one\n");
if (pre >= 1) {
double now = MPI_Wtime();
do_redblack(f,correction);
double later = MPI_Wtime();
redblack_time += (later - now);
redblack_count++;
//cout << "New correction: " << correction;
/* And apply the operator to find the defect */
apply_operator(correction,defect);
apply_op_count++;
apply_op_time += (MPI_Wtime() - later);
defect = -(defect - f);
u = correction;
/* Perform a second relaxation pass */
......@@ -1906,17 +1895,11 @@ void MG_Solver::_cycle(CYCLE_TYPE cycle, Array<double,2> & f, Array<double,2> &
}
for (int i = 1; i < pre; i++) {
// if (master()) fprintf(stderr,"Doing RB precycle %d\n",i);
double now = MPI_Wtime();
do_redblack(defect,correction);
double later = MPI_Wtime();
redblack_time += (later - now);
redblack_count++;
// if (master()) fprintf(stderr,"RB precycle %d complete\n",i);
// if (master()) fprintf(stderr,"correction:\n");
// SYNC(cerr << correction; cerr.flush());
apply_operator(correction,f);
apply_op_count++;
apply_op_time += (MPI_Wtime() - later);
// if (master()) fprintf(stderr,"f:\n");
// SYNC(cerr << f; cerr.flush());
u = u+correction;
......@@ -1964,8 +1947,6 @@ void MG_Solver::_cycle(CYCLE_TYPE cycle, Array<double,2> & f, Array<double,2> &
{
double now = MPI_Wtime();
apply_operator(correction,f);
apply_op_count++;
apply_op_time += (MPI_Wtime() - now);
}
defect = defect-f-coarse_extra_out;
u = u+correction;
......@@ -1998,8 +1979,6 @@ void MG_Solver::_cycle(CYCLE_TYPE cycle, Array<double,2> & f, Array<double,2> &
{
double now = MPI_Wtime();
apply_operator(correction,f);
apply_op_count++;
apply_op_time += (MPI_Wtime() - now);
}
// SYNC(cerr << f; cerr.flush()); MPI_Finalize(); exit(1);
defect = defect-f-coarse_extra_out;
......@@ -2009,15 +1988,9 @@ void MG_Solver::_cycle(CYCLE_TYPE cycle, Array<double,2> & f, Array<double,2> &
/* Now, two more relaxation steps */
// if (master()) fprintf(stderr,"F[%d] mid-cycle corrections\n",size_x);
for (int i = 0; i < mid; i++) {
double now = MPI_Wtime();
do_redblack(defect,correction);
// cout << "New correction: " << correction;
double later = MPI_Wtime();
redblack_count++;
redblack_time += (later - now);
apply_operator(correction,f);
apply_op_count++;
apply_op_time += (MPI_Wtime() - later);
defect = defect-f;
u = u+correction;
}
......@@ -2042,8 +2015,6 @@ void MG_Solver::_cycle(CYCLE_TYPE cycle, Array<double,2> & f, Array<double,2> &
{
double now = MPI_Wtime();
apply_operator(correction,f);
apply_op_count++;
apply_op_time += (MPI_Wtime() - now);
}
defect = defect-f-coarse_extra_out;
u = u+correction;
......@@ -2062,15 +2033,9 @@ void MG_Solver::_cycle(CYCLE_TYPE cycle, Array<double,2> & f, Array<double,2> &
/* Now, two more relaxation steps */
for (int i = 0; i < post; i++) {
double now = MPI_Wtime();
do_redblack(defect,correction);
double later = MPI_Wtime();
redblack_count++;
redblack_time += (later - now);
// cout << "New correction: " << correction;
apply_operator(correction,f);
apply_op_count++;
apply_op_time += (MPI_Wtime() - later);
defect = defect-f;
u = u+correction;
}
......
#include "timing.hpp"
#include <string>
#include <vector>
#include <mpi.h>
#include <stdio.h>
// Structure for information related to a single timing invocation.
// One node per (parent, name) pair: repeated pushes of the same name under
// the same parent re-use the node and accumulate into count/total_time.
// (The separate forward declaration was redundant: the definition itself
// declares the name, and the self-referential members are pointers.)
struct timing_element {
    std::string name; // Short name of the current element (as passed to timing_push)
    int count; // Number of times this element has been pushed
    double tic_time; // MPI_Wtime when this was most recently pushed to the stack
    double total_time; // Total accumulated runtime so far, over all push/pop pairs
    std::vector<timing_element *> children; // Sub-elements pushed while this one was current
    timing_element *parent; // Enclosing element, or 0 for a root-level element
};
std::vector<timing_element *> root(0); // Elements created at root level (no enclosing timing_push)
timing_element *current = 0; // Pointer to current (innermost active) element; 0 when the stack is empty
/* Add a new element to the timing stack. Re-use an existing element
of this name if one already exists, otherwise create a new structure
and append it to the current-children list */
void timing_push(const char * name){
//fprintf(stderr,"Pushing new element...\n");
std::vector<timing_element *> *child_list; // list to check for duplicates
if (current == 0) {
child_list = &root;
} else {
child_list = &current->children;
}
for (int i = 0; i < child_list->size(); i++) {
// fprintf(stderr,"Considering element %d, name %s\n",i,(*child_list)[i]->name.c_str());
if (!strcmp((*child_list)[i]->name.c_str(),name)) {
// We have a match, so move into this element
timing_element *child = (*child_list)[i];
child->tic_time = MPI_Wtime();
child->count += 1;
current = child;
return; // done
}
}
// no match found, so construct a new element
/*fprintf(stderr,"Constructing new element %s ",name);
if (current == 0) {
fprintf(stderr,"as a child of root\n");
} else {
fprintf(stderr,"as a child of element %s\n",current->name.c_str());
}*/
timing_element *new_el = new timing_element;
new_el->name = name;
new_el->tic_time = MPI_Wtime();
new_el->total_time = 0;
new_el->count = 1;
new_el->parent = current;
child_list->push_back(new_el);
current = new_el;
return;
}
/* Leave the most recently entered timed section: fold the elapsed
   wall-clock time since its timing_push into its running total, then
   make its parent the current element again. Popping with an empty
   stack is reported to stderr and otherwise ignored. */
void timing_pop(){
    if (current == 0) {
        fprintf(stderr,"ERROR: Popping at the top of the timing stack!\n");
        return;
    }
    current->total_time += MPI_Wtime() - current->tic_time;
    current = current->parent;
}
/* Write one element to stderr, indented by `indent` spaces: name,
   invocation count, and accumulated time. Times under one second are
   shown in milliseconds, otherwise in seconds. */
void print_element(timing_element * el, int indent) {
    double elapsed = el->total_time;
    if (elapsed < 1) {
        // Sub-second total: report in milliseconds
        fprintf(stderr,"%*s%s -- %d -- %.2fms\n",
                indent,"",el->name.c_str(),el->count,elapsed*1000);
    } else {
        // Report in seconds
        fprintf(stderr,"%*s%s -- %d -- %.3fs\n",
                indent,"",el->name.c_str(),el->count,elapsed);
    }
}
/* Depth-first traversal of the timing tree rooted at `el`: print the
   element itself, then each child one indentation level deeper. */
void timing_stack_recurse(timing_element * el, int indent) {
    print_element(el, indent);
    std::vector<timing_element *> &kids = el->children;
    for (size_t k = 0; k < kids.size(); k++) {
        timing_stack_recurse(kids[k], indent + 1);
    }
}
void timing_stack_report(){
for (int i = 0; i < root.size(); i++) {
timing_stack_recurse(root[i],0);
}
}
/* timing.hpp -- header file for self-timing code */
#ifndef TIMING_HPP
#define TIMING_HPP 1 // Include this header file at most once
/* The timing infrastructure works with a stack-based model.
   At portions of the code to be instrumented, call timing_push("name")
   with some (short but) memorable name for the code segment; after the
   end of the section call timing_pop(). This underlying code will build
   the corresponding call-time-tree and measure (via MPI_Wtime) the time
   between the timing_push and the corresponding timing_pop.
   This model does require that timing be called in a predictable way, such
   that timing_pop is called before any possible exit from the code path.*/
/* Enter a timed section; `name` labels the node in the timing tree */
void timing_push(const char * name);
/* Leave the most recently entered timed section */
void timing_pop();
/* Print the accumulated timing tree to standard error */
void timing_stack_report();
#endif
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment