#include <mpi.h>
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdlib>
#include <iostream>
#ifdef _OPENMP
#include <omp.h>
#endif
extern "C" {
#include "timing.h"
#include "dummy.h"
}

#define ME(r) cerr << (r) << ":   " 

using namespace std;

// Maps a step index n to a problem size: 1.2 raised to the n-th power,
// rounded down and truncated to int.
inline int vector_size(int n){
  const double grown = std::pow(1.2,n);
  const double rounded_down = std::floor(grown);
  return static_cast<int>(rounded_down);
}

// Number of elements assigned to `rank` when `size` elements are split over
// `nump` ranks: every rank gets size/nump, and the first size%nump ranks get
// one extra element each.
int size_of_rank(int rank, int nump, int size) {
  const int base = size/nump;
  const int leftover = size%nump;
  if(rank < leftover) {
    return base + 1;
  }
  return base;
}

// Timed kernel: performs `niter` repetitions of the distributed dense
// matrix-vector product y += A*x.
//
// Every rank holds `my_size` rows of A (row-major, `size` columns each) and
// one slice of x.  The slices of x travel around a ring of ranks: after each
// local partial product the slice is sent to rank-1 and the next one is
// received from rank+1, so after `rot` shifts this rank holds the slice that
// belongs to rank (rank+rot)%ranks.
//
// Parameters:
//   a        local rows of the matrix, my_size*size elements
//   x        ring-shift buffer; must hold size/ranks + 1 elements because the
//            shift always moves the largest slice length
//   y        local result, accumulated into (never cleared here unless CHECK
//            is defined); y[my_size/2] must be readable
//   size     global matrix dimension
//   my_size  number of rows / x elements owned by this rank
//   niter    number of repetitions of the full product
//
// Returns the difference of the wall-clock values reported by timing()
// (presumably seconds -- confirm against timing.h).
//
// Without CHECK the last shift of each repetition is skipped, so x is left
// rotated by ranks-1 positions when a repetition ends; only the amount of
// work, not the numerical result, is meaningful in that configuration.
//
// MPI is called from whichever thread executes the `omp single` construct,
// not necessarily the thread that initialised MPI.
double do_mat_vec(const double *a, double *x, double *y, int size, int my_size, int niter){
  double wct_start,wct_end,cput_start,cput_end;
  int rank,ranks;
  MPI_Status status;
  MPI_Comm_size(MPI_COMM_WORLD, &ranks);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  const int num = size / ranks;
  const int rest = size % ranks;
  // Slices are received from l_neighbor (rank+1) and sent to r_neighbor (rank-1).
  const int l_neighbor = (rank + 1) % ranks;
  const int r_neighbor = (rank + ranks - 1) % ranks;
  // First global column of the slice owned by this rank: every preceding rank
  // contributes `num` elements, and the first `rest` ranks one more each.
  const int own_start = rank*num + std::min(rest,rank);
  // Always transfer the largest slice length, even if the last element is
  // padding for the slice currently held.
  const int shift_len = num + (rest ? 1 : 0);

  // ME(rank) << "Doing " << niter << "MVMs" << endl;

  timing(&wct_start, &cput_start);

#pragma omp parallel
  {
    for(int j=0; j<niter; j++){
      // Column window of A that matches the slice of x currently held.
      int n_start=own_start, cur_size=my_size;
      // loop over RHS ring shifts
      for(int rot=0; rot<ranks; rot++) {
        // ME(rank) << "MVM: n_start=" << n_start << ", len=" << cur_size << endl;
#pragma omp for
        for(int m=0; m<my_size; m++){
          // Row offset computed in size_t so large matrices do not overflow int.
          const double *row = a + static_cast<std::size_t>(m)*size;
          for(int n=n_start; n<n_start+cur_size; n++){
            y[m]+=row[n]*x[n-n_start];
          }
        }
        // Data-dependent output keeps the result observable so the compiler
        // cannot discard the kernel; evaluated by every thread.
        if(y[my_size>>1]<0){
          std::cout << y[my_size>>1];
        }
        // ME(rank) << "local MVM done" << endl;
        n_start += cur_size;
        if(n_start>=size) {
          n_start=0; // wrap around
          // ME(rank) << "Wrapped" << endl;
        }
        // The slice held after the upcoming shift belongs to rank+rot+1.
        cur_size = size_of_rank((rank+rot+1)%ranks,ranks,size);
#ifdef CHECK
        // Complete the ring so x is back at its owner for the next repetition.
        const bool do_shift = true;
#else
        const bool do_shift = (rot!=ranks-1);
#endif
        if(do_shift) {
#pragma omp single
          {
            // ME(rank) << "Doing Sendrecv from " << l_neighbor << " to " << r_neighbor << endl;
            MPI_Sendrecv_replace(x, shift_len, MPI_DOUBLE, r_neighbor, 0,
                                 l_neighbor, 0, MPI_COMM_WORLD, &status);
            // ME(rank) << "Sendrecv done" << endl;
          }
        }
      }

#ifdef CHECK
#pragma omp single
      {
        // Sum the local result, clear y for the next repetition and print the
        // global sum on rank 0.
        double lsum=0.0;
        for(int i=0; i<my_size; ++i) {
          lsum += y[i];
          y[i]=0.0;
        }
        if(rank==0) {
          MPI_Reduce(MPI_IN_PLACE, &lsum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
          std::cerr << "Sum: " << lsum << std::endl;
        }
        else
          MPI_Reduce(&lsum, NULL, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
      }
#endif

    }
  }
  timing(&wct_end, &cput_end);
  // ME(rank) << "do_mat_vec done" << endl;
  return wct_end-wct_start;
}

// Allocates (or reuses) the local matrix and vectors for a problem of global
// dimension `size`, fills them with deterministic values and runs the timed
// kernel do_mat_vec() for `niter` repetitions.
//
// The buffers are function-local statics that are reallocated only when
// `size` differs from the previous call; they are never released.
//
// Parameters:
//   size     global matrix dimension
//   my_size  number of rows / x elements owned by this rank
//   niter    number of repetitions passed on to do_mat_vec()
//
// Returns the value returned by do_mat_vec().
double double_mat_vec(const int size, const int my_size, int niter){
  static int old=0;
  static double *a=0,*x=0,*y=0;
  int rank,ranks;

  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &ranks);
  const int num = size / ranks;
  const int rest = size % ranks;
  if(old!=size) {
    // ME(rank) << "Reallocating..." << endl;
    delete [] a;
    delete [] x;
    delete [] y;
    // Element count computed in size_t so large matrices do not overflow int.
    a= new double[static_cast<std::size_t>(size)*my_size];
    // One spare element: the ring shift in do_mat_vec() always moves
    // size/ranks + 1 elements when size is not divisible by ranks.
    x= new double[my_size+1];
    y= new double[my_size+1];
    old=size;
  }
  // ME(rank) << "Initializing..." << endl;
  // Global index of this rank's first element: every preceding rank
  // contributes `num` elements, and the first `rest` ranks one more each.
  const int n_start = rank*num+std::min(rest,rank);
#pragma omp parallel for
  for(int i=0; i<my_size; i++){
    x[i]=n_start+i; //4.0;
    y[i]=0;
    double *row = a + static_cast<std::size_t>(i)*size;
    for(int j=0; j<size; j++){
      row[j]=j+n_start+i;
    }
  }
  // Give the spare slots a defined value: x[my_size] may be transmitted by
  // the ring shift and y[0] is read by do_mat_vec() even when my_size is 0.
  x[my_size]=0;
  y[my_size]=0;

  // ME(rank) << "Calling do_mat_vec" << endl;
  double walltime = do_mat_vec(a,x,y,size,my_size,niter);

  return walltime;
}


int main(int argc, char** argv) {
  int rank,nump;
  MPI_Init(&argc, &argv);
  MPI_Comm_size(MPI_COMM_WORLD, &nump);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  if(rank==0) {
    if(argc != 3) {
      cerr << "Usage: " << argv[0] << " <low> <high>" << endl;
      return 1;
    }
  }
  int low = atoi(argv[1]);
  int high = atoi(argv[2]);
  for(int n=low; n<=high; n*=1.1){  // dimension
    int my_size = size_of_rank(rank, nump, n);

    const int size = n; //vector_size(n);
    int niter = 1;
    double_mat_vec(size,my_size,niter);
    // determine NITER for each N individually
    double wct_used = double_mat_vec(size,my_size,niter);
    while(wct_used < 0.1){
      niter = niter * 2;
      wct_used = double_mat_vec(size,my_size,niter);
    }

    double walltime = double_mat_vec(size,my_size,niter);
    double operations = 2.0 * size * size * niter;
    double performance = operations / 1000000 / walltime;
//    cout << "Size: " << size << endl;
//    cout << "Iterations: " << niter << endl;
//    cout << "Walltime [s]: " << walltime << endl;
//    cout << "Performance [MFlop/s]: " << performance << endl;
    if(rank==0) {
      MPI_Reduce(MPI_IN_PLACE, &performance, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
      cout << size << " s;p " << performance << endl;
    }
    else
      MPI_Reduce(&performance, NULL, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);

  }
  MPI_Finalize();
  return 0;
}
