! gfortran -fopenmp-simd -Ofast -Wall mod_poly_eval.F90 main.F90 -o main.F90.exe
!> Benchmark driver for polynomial evaluation.
!!
!! Usage: <exe> [N [degree]]
!!   N       number of evaluation points (default 1000, must be >= 1)
!!   degree  polynomial degree           (default 10,   must be >= 0)
!!
!! The program fills `coefficients` and `x` with reproducible pseudo-random
!! values, computes a reference sum with the internal `poly_eval_ref`, then
!! times repeated sweeps of `poly_eval` over all points inside an
!! `!$omp simd` loop and reports both sums, their relative difference and
!! an MFLOP/s figure.
!!
!! `dp` and `poly_eval` are not declared in this file; they are expected to
!! be provided by `mod_poly_eval`.
program simd
  use mod_poly_eval

  implicit none

  ! A timed pass must last at least this long before it is accepted.
  real(kind=dp), parameter :: min_duration_s = 0.2_dp

  integer :: N, degree
  real(kind=dp), dimension(:), allocatable :: x, f, coefficients
  real(kind=dp) :: sum_ref, sum_simd, duration_s, t_start
  integer :: n_repetitions, i, r, n_seed, i_check
  integer, allocatable :: seed(:)

  integer             :: argc, iostat
  character(len=1024) :: arg

  ! defaults, used when the corresponding argument is absent
  N = 1000
  degree = 10

  ! argument processing
  argc = command_argument_count()

  if (argc > 0) then
    call get_command_argument(1, arg)
    read(arg, *, iostat=iostat) N
    if (iostat /= 0) error stop 'ERROR: first argument (N) must be an integer'
  end if

  if (argc > 1) then
    call get_command_argument(2, arg)
    read(arg, *, iostat=iostat) degree
    if (iostat /= 0) error stop 'ERROR: second argument (degree) must be an integer'
  end if

  ! N < 1 would leave nothing to evaluate (and f(N/2) below would be out of
  ! bounds); degree < 0 would make the coefficient array empty.
  if (N < 1) error stop 'ERROR: N must be at least 1'
  if (degree < 0) error stop 'ERROR: degree must be non-negative'

  allocate(coefficients(degree + 1))
  allocate(x(N))
  allocate(f(N))

  ! The seed array must have exactly the length the processor reports,
  ! independent of the problem size N.
  call random_seed(size=n_seed)
  allocate(seed(n_seed))

  ! coefficients: uniform in [0, 10), generated from the all-2 seed
  seed(:) = 2
  call random_seed(put=seed)

  call random_number(coefficients(:))
  coefficients(:) = coefficients(:) * 10.0_dp

  ! evaluation points: uniform in [1, 1.01), generated from the all-1 seed
  seed(:) = 1
  call random_seed(put=seed)
  deallocate(seed)

  call random_number(x(:))
  x(:) = x(:) / 100.0_dp + 1.0_dp
  f(:) = 0.0_dp

  ! reference result, accumulated point by point
  sum_ref = 0.0_dp
  do i = 1, N
    sum_ref = sum_ref + poly_eval_ref(x(i), degree, coefficients(:))
  end do

  n_repetitions = 2
  duration_s = 0.0_dp

  ! Element inspected after every sweep; N/2 is 0 for N == 1, so clamp it.
  i_check = max(1, N / 2)

  ! Double the repetition count until one timed pass is long enough.
  ! n_repetitions is 4 only right after the first pass, so the second
  ! clause forces at least two passes (presumably so that the first one
  ! acts as a warm-up -- confirm with the author).
  do while (duration_s < min_duration_s .or. n_repetitions == 4)
    t_start = get_time()

    do r = 1, n_repetitions
      ! main loop
      !$omp simd
      do i = 1, N
        f(i) = poly_eval(x(i), degree, coefficients(:))
      end do

      ! Reading a result inside the timed region; with the inputs generated
      ! above (non-negative coefficients, positive x) the reference
      ! polynomial is never negative, so this branch signals a failure.
      if (f(i_check) < 0.0_dp) then
        error stop 'ERROR: should never happen'
      end if
    end do

    duration_s = get_time() - t_start
    n_repetitions = n_repetitions * 2
  end do

  ! undo the last doubling: this is the count used by the final timed pass
  n_repetitions = n_repetitions / 2

  ! Accumulated with the same sequential loop as sum_ref so that both sums
  ! are formed the same way.
  sum_simd = 0.0_dp
  do i = 1, N
    sum_simd = sum_simd + f(i)
  end do

  write(*,'(a,1p e19.12)') 'reference sum:  ', sum_ref
  write(*,'(a,1p e19.12)') 'simd sum:       ', sum_simd
  write(*,'(a,1p e19.12)') 'relative error: ', (sum_ref - sum_simd) / sum_ref

  ! FLOP count: 2 * degree per point (one multiply and one add per step of
  ! the recurrence used in poly_eval_ref), times N points and repetitions.
  write(*,'(a,i0,a,i0,a,i0,a,f7.2,a,f7.2,a)') &
             'N: ', N, '  degree: ', degree, '  repetitions: ', n_repetitions, &
             '  performance: ', real(n_repetitions, kind=dp) *                 &
             real(N, kind=dp) * 2.0_dp * real(degree, kind=dp) /               &
             duration_s / 1.0e6_dp,                                            &
             ' MFLOP/s  duration: ', duration_s, ' s'


  deallocate(coefficients)
  deallocate(x)
  deallocate(f)

contains

  !> Reference polynomial evaluation by the nested-multiplication recurrence
  !!   p <- coefficients(0);  p <- x * p + coefficients(k),  k = 1..degree
  !!
  !! x            point at which the polynomial is evaluated
  !! degree       polynomial degree; also the upper bound of `coefficients`
  !! coefficients degree + 1 values indexed 0..degree; coefficients(0)
  !!              multiplies the highest power of x and coefficients(degree)
  !!              is the constant term
  !!
  !! Returns the value of the polynomial at x.
  function poly_eval_ref(x, degree, coefficients) result(acc)
    integer, intent(in) :: degree
    real(kind=dp), intent(in) :: x
    real(kind=dp), intent(in) :: coefficients(0:degree)
    real(kind=dp) :: acc

    integer :: k

    ! start from the leading coefficient, then fold in one term per step
    acc = coefficients(0)
    do k = 1, degree
      acc = x * acc + coefficients(k)
    end do
  end function poly_eval_ref

  !> Return the current system-clock reading in seconds.
  !!
  !! Only differences between two calls are meaningful; the origin of the
  !! clock is processor dependent.
  !!
  !! The clock is queried with 64-bit integers: according to the gfortran
  !! documentation the tick length and wrap-around period of system_clock
  !! depend on the integer kind of its arguments, and default integers give
  !! a coarser clock that wraps sooner.
  !!
  !! Stops the program if the processor reports no usable clock
  !! (count rate of zero), since every timing would then be meaningless.
  real(kind=dp) function get_time()
    implicit none
    integer, parameter :: i8 = selected_int_kind(18)
    integer(kind=i8) :: ticks, ticks_per_second

    call system_clock(count=ticks, count_rate=ticks_per_second)
    if (ticks_per_second <= 0_i8) error stop 'ERROR: no system clock available'

    get_time = real(ticks, kind=dp) / real(ticks_per_second, kind=dp)
  end function

end program
