#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <omp.h>

/* Floating-point element type used by the kernel and the benchmark driver. */
typedef double FLT_T;

/*
 * axpy kernel: y[i] = a * x[i] + y[i] for every i in [0, n_el).
 *
 * n_el  number of elements to process; a value <= 0 makes this a no-op
 * a     scalar multiplier
 * x     input vector, only read
 * y     input/output vector, updated in place
 *
 * The iterations are independent of each other and are distributed across
 * the OpenMP threads with a static schedule. x and y should not partially
 * overlap, since one thread could then write an element another one reads.
 */
static void
axpy(long n_el, FLT_T a, const FLT_T * x, FLT_T * y)
{
    #pragma omp parallel for schedule(static)
    for (long i = 0; i < n_el; ++i) {
        y[i] = a * x[i] + y[i];
    }
}


/*
 * Parse `text` as a base-10 integer that must be >= min_value.
 * `what` names the argument in the error message. On malformed input,
 * trailing characters, overflow, or a value below min_value, a message is
 * printed to stderr and the program exits with EXIT_FAILURE.
 */
static long
parse_long_arg(const char * text, const char * what, long min_value)
{
    char * end = NULL;

    errno = 0;
    long value = strtol(text, &end, 10);
    if (end == text || *end != '\0' || errno == ERANGE || value < min_value) {
        fprintf(stderr, "invalid %s '%s': expected an integer >= %ld\n",
                what, text, min_value);
        exit(EXIT_FAILURE);
    }
    return value;
}

/*
 * Parse `text` as a floating-point number. `what` names the argument in the
 * error message. On malformed input, trailing characters, or a range error,
 * a message is printed to stderr and the program exits with EXIT_FAILURE.
 */
static double
parse_double_arg(const char * text, const char * what)
{
    char * end = NULL;

    errno = 0;
    double value = strtod(text, &end);
    if (end == text || *end != '\0' || errno == ERANGE) {
        fprintf(stderr, "invalid %s '%s': expected a floating-point number\n",
                what, text);
        exit(EXIT_FAILURE);
    }
    return value;
}

/*
 * axpy benchmark driver.
 *
 * Usage: prog [vector_size_bytes [iterations [scalar]]]
 *
 * Allocates two vectors of the requested size, runs the axpy kernel once as
 * a warmup and then `iterations` timed times, printing the duration and
 * bandwidth of every iteration followed by a one-line summary.
 *
 * Returns EXIT_SUCCESS; exits with EXIT_FAILURE on invalid arguments or
 * when an allocation fails.
 */
int main(int argc, char * argv[])
{
    const long el_size = (long)sizeof(FLT_T);

    /* default: 100 MiB per vector */
    long n_el = 100L * 1024 * 1024 / el_size;

    /* first argument (if any) specifies vector size in bytes; it must hold
     * at least one element so that the allocations below are non-empty */
    if (argc > 1) {
        n_el = parse_long_arg(argv[1], "vector size in bytes", el_size) / el_size;
    }

    long n_its = 30;
    /* second argument (if any) specifies the no. of iterations */
    if (argc > 2) {
        n_its = parse_long_arg(argv[2], "iteration count", 1);
    }

    FLT_T a = 0.0001;
    /* third argument (if any) specifies the value of the scalar for axpy */
    if (argc > 3) {
        a = parse_double_arg(argv[3], "scalar");
    }

    /* n_el came from a byte count that fits in a long, so this cannot wrap */
    const size_t n_bytes = (size_t)n_el * sizeof(FLT_T);

    FLT_T * x = malloc(n_bytes);
    if (x == NULL) {
        perror("malloc"); exit(EXIT_FAILURE);
    }

    FLT_T * y = malloc(n_bytes);
    if (y == NULL) {
        perror("malloc");
        free(x);
        exit(EXIT_FAILURE);
    }

    /* NOTE(review): the vectors are initialized by a single thread while the
     * kernel runs in parallel; on NUMA systems with a first-touch page
     * placement policy this may skew the measured bandwidth -- confirm
     * whether that is intended before parallelizing this loop. */
    for (long i = 0; i < n_el; ++i) {
        x[i] = i;
        y[i] = n_el - i;
    }

    // warmup
    axpy(n_el, a, x, y);

    double t_start = omp_get_wtime();

    for (long i = 0; i < n_its; ++i) {
        double t_inner_start = omp_get_wtime();

        axpy(n_el, a, x, y);
        /* empty asm taking y as input with a "memory" clobber: acts as a
         * compiler barrier so the updates of y are not optimized away */
        __asm__ volatile ("" : : "r"(y) : "memory");

        double t_inner_end = omp_get_wtime();
        double duration_inner = t_inner_end - t_inner_start;
        /* 3 memory accesses per element: read x, read y, write y */
        double bw_GB_s = 3.0 * sizeof(FLT_T) * (double)n_el / (duration_inner) * 1e-9;

        printf("%5ld %f s   %e GB/s\n", i, duration_inner, bw_GB_s);
    }

    /* total wall time of the loop above, including the per-iteration printf */
    double duration = omp_get_wtime() - t_start;

    /* combined size of both vectors */
    const double bytes_MB = 2.0 * (double)n_el * sizeof(FLT_T) / 1e6;
    /* 2 floating-point operations (multiply, add) per element and iteration */
    const double perf_GFLOPS = 2.0 * (double)n_el * (double)n_its / duration / 1e9;
    printf("%-10s %e MB  %e GB/s  %e GFLOP/s  duration: %e s  nits: %ld\n",
           "axpy", bytes_MB, 3.0 * sizeof(FLT_T) / 2.0 * perf_GFLOPS,
           perf_GFLOPS, duration, n_its);

    free(x);
    free(y);

    return EXIT_SUCCESS;
}
