/* * ======================================================================================= * * Author: Jan Eitzinger (je), jan.eitzinger@fau.de * Copyright (c) 2019 RRZE, University Erlangen-Nuremberg * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * * ======================================================================================= */ #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #define ARRAY_ALIGNMENT 64 #define HLINE "----------------------------------------------------------------------------\n" #ifdef TIMING #define T(TYPE) timer[TYPE].val += #define TIMER_START S=getTimeStamp() #define TIMER_STOP E=getTimeStamp() #else #define T(TYPE) #define TIMER_START #define TIMER_STOP #endif #define KNRM "\x1B[0m" #define KRED "\x1B[31m" #define KGRN "\x1B[32m" #define CHECK_LESS_THAN(a, b, OP)\ if(a >= b) {\ printf("%sTest %s failed\n", KRED, OP);\ printf("%s",KNRM);\ } else {\ printf("%sTest %s success\n",KGRN, OP);\ printf("%s",KNRM);\ } typedef enum component { AXPBY = 0, DOT_PRODUCT, APPLY_STENCIL, CG_SOLVER, NUMTIMERS } component; typedef struct { char* label; double val; } timerType; typedef struct { double* ptr; int size_x; //number of grid points in x dimension int size_y; //number of grid points in y dimension } gridType; typedef struct { double h_x; double h_y; int niter; double tolerance; } solverType; /********************/ /* Helper functions */ /********************/ double getTimeStamp() { struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); return (double)ts.tv_sec + (double)ts.tv_nsec * 1.e-9; } void allocGrid( int size_x, int size_y, size_t bytesPerWord, gridType* grid) { posix_memalign((void**) &(grid->ptr), ARRAY_ALIGNMENT, size_x * size_y * bytesPerWord ); grid->size_x = size_x; grid->size_y = size_y; } void copyGrid( gridType* src, gridType* dst) { int size = (src->size_x * src->size_y); double* S = src->ptr; double* D = dst->ptr; for ( int i=0; iptr); } void init(gridType* x, gridType* u_sine, gridType* rhs_sine, solverType* solver) { int size = (x->size_x * x->size_y); int size_x = x->size_x; int size_y = x->size_y; double* P = x->ptr; unsigned int seed = 0; // initialize x #pragma omp parallel firstprivate(seed) { seed += omp_get_thread_num(); #pragma omp for for ( int i=0; iptr; // initialize u_sine #pragma omp parallel for for(int j=0; jh_x)* sin(M_PI * j * solver->h_y); } } P = rhs_sine->ptr; // initialize rhs_sine #pragma omp parallel for for(int j=0; jh_x)* sin(M_PI * j * solver->h_y); } } } /********************/ /* Basic operations */ /********************/ /* acpby - Calculates res[:] = a*x[:] + b*y[:] */ double axpby( gridType* res, double a, gridType* x, double b, gridType* y) { double S=0.0, E=0.0; double* R = res->ptr; double* X = x->ptr; double* Y = y->ptr; int size = x->size_x * x->size_y; TIMER_START; #pragma omp parallel { LIKWID_MARKER_START("axpby"); #pragma omp for for(int i=0; isize_x * x->size_y; double* X = x->ptr; double* Y = y->ptr; TIMER_START; #pragma omp parallel for reduction(+:l2_sq) for(int i=0; iptr; double* U = u->ptr; int size_x = res->size_x; int size_y = res->size_y; double w_x = 1.0 / (solver->h_x * solver->h_x); double w_y = 1.0 / (solver->h_y * solver->h_y); double w_c = 2.0 * (w_x + w_y); TIMER_START; #pragma omp parallel for for ( int j=1; jtolerance * solver->tolerance; allocGrid(x->size_x, x->size_y, bytesPerWord, &p); allocGrid(x->size_x, x->size_y, bytesPerWord, &v); allocGrid(x->size_x, x->size_y, bytesPerWord, &r); computeResidual(&p, x, b, solver, &alpha_0); allocGrid(x->size_x, x->size_y, bytesPerWord, &r); copyGrid(&p, &r); S = getTimeStamp(); while( (iter < solver->niter) && (alpha_0 > tolSquared) ) { LIKWID_MARKER_START("cg"); T(APPLY_STENCIL) applyStencil(&v, &p, solver); T(DOT_PRODUCT) dotProduct(&v, &p, &lambda); lambda = alpha_0/lambda; //Update x T(AXPBY) axpby(x, 1.0, x, lambda, &p); //Update r T(AXPBY) axpby(&r, 1.0, &r, -lambda, &v); T(DOT_PRODUCT) dotProduct(&r, &r, &alpha_1); //Update p T(AXPBY) axpby(&p, 1.0, &r, alpha_1/alpha_0, &p); alpha_0 = alpha_1; printf("iter = %d, res = %.15e\n", iter, alpha_0); ++iter; LIKWID_MARKER_STOP("cg"); } E = getTimeStamp(); freeGrid(&p); freeGrid(&v); freeGrid(&r); (*iter_end) = iter; return E-S; } int main (int argc, char** argv) { size_t bytesPerWord = sizeof(double); int size_x = 0; int size_y = 0; double res_start, err_start; double res_sine_cg, err_sine_cg; int itermax = 50; int iter_sine_cg; gridType u_sine, rhs_sine, x, residual; timerType timer[NUMTIMERS] = { {"axpby ", 0.0}, {"dot_product ", 0.0}, {"apply_stencil ", 0.0}, {"cg ", 0.0} }; if ( argc > 2 ) { size_x = atoi(argv[1]); size_y = atoi(argv[2]); if ( argc == 4 ) { itermax = atoi(argv[3]); } } else { printf("Usage: %s \n", argv[0]); exit(EXIT_SUCCESS); } solverType solver = {(double)1.0/(size_x-1.0), (double)1.0/(size_y-1.0), itermax, 1e-8}; allocGrid(size_x, size_y, bytesPerWord, &u_sine); allocGrid(size_x, size_y, bytesPerWord, &rhs_sine); allocGrid(size_x, size_y, bytesPerWord, &x); allocGrid(size_x, size_y, bytesPerWord, &residual); LIKWID_MARKER_INIT; #pragma omp parallel { LIKWID_MARKER_REGISTER("cg"); } init(&x, &u_sine, &rhs_sine, &solver); /* compute start error and residual */ axpby(&residual, 1.0, &u_sine, -1.0, &x); dotProduct(&residual, &residual, &err_start); computeResidual(&residual, &x, &rhs_sine, &solver, &res_start); timer[CG_SOLVER].val = CG( &x, &rhs_sine, &solver, &iter_sine_cg, timer); printf("CG iterations = %d\n", iter_sine_cg); printf("Performance CG = %f [MLUP/s]\n", (double)(iter_sine_cg)*size_x*size_y* 1e-6/timer[CG_SOLVER].val); /* compute end error and residual */ axpby(&residual, 1.0, &u_sine, -1.0, &x); dotProduct(&residual, &residual, &err_sine_cg); computeResidual(&residual, &x, &rhs_sine, &solver, &res_sine_cg); CHECK_LESS_THAN(res_sine_cg,res_start,"Solver::CG - residual check") CHECK_LESS_THAN(err_sine_cg,err_start,"Solver::CG - error check") #ifdef TIMING printf(HLINE); for (int i=0; i