! gfortran -fopenmp -Wall ray-tracer.offload.F90 -o ray-tracer.offload.F90.exe
module mod_raytracef
  implicit none

  private

  public :: is, intersect, shade, calc_tile, timing, dp
  public :: compute_difference

  integer, parameter :: is = selected_int_kind(2)
  integer, parameter :: dp = kind(1.d0)

  real(kind=dp), parameter :: objs(4, 11) = reshape( (/ &
         0.d0 ,0.d0, -100.5d0, 10000.d0 , &
         0.d0, 0.d0, 0.d0, 0.25d0 , &
         0.272166d0, 0.272166d0, 0.544331d0, .027777d0  , &
         0.643951d0, 0.172546d0, 0.d0, .027777d0 , &
         0.172546d0, 0.643951d0, 0.d0, .027777d0 , &
        -0.371785d0, 0.099620d0, 0.544331d0, .027777d0 , &
        -0.471405d0, 0.471405d0, 0.d0, .027777d0 , &
        -0.643951d0,-0.172546d0, 0.d0, .027777d0 , &
         0.099620d0,-0.371785d0, 0.544331d0, .027777d0 , &
        -0.172546d0,-0.643951d0, 0.d0, .027777d0 , &
         0.471405d0,-0.471405d0, 0.d0, .027777d0  /), (/ 4, 11 /) )

  real(kind=dp), parameter :: objs_shade(3, 3) = reshape( (/ &
         4.d0, 3.d0, 2.d0 , &
         1.d0,-4.d0, 4.d0 , &
        -3.d0, 1.d0, 5.d0  /), (/3, 3/) )

  real(kind=dp) :: small = 1.0d-6
  !$omp declare target(objs, objs_shade, small)


contains

  real(kind=dp) function timing()
    integer :: c, rate, m
    call system_clock(c, rate, m)
    timing = real(c, kind=dp) / real(rate, kind=dp)
  end function


  integer function intersect(x, y, z, dx, dy, dz, pmax)
    !$omp declare target
    real(kind=dp), intent(in) :: x, y, z, dx, dy, dz
    real(kind=dp), intent(inout) :: pmax
!
    integer :: i
    real(kind=dp) :: xx, yy, zz, b, t, pmaxloc
!
    intersect = 0
    pmaxloc = pmax
    do i=1, 11
       xx = objs(1,i) - x ; yy = objs(2,i) - y; zz = objs(3,i) - z
       b = xx * dx + yy * dy + zz * dz
       t = b * b - xx * xx - yy * yy - zz * zz + objs(4,i)
       if (t < 0) then
          cycle
       else
          t = b - sqrt(t)
          if (t < small .or. t > pmaxloc) then
             cycle
          end if
       end if
       intersect = i
       pmaxloc = t
    end do
    pmax = pmaxloc
  end function intersect


  function shade(x, y, z, dx, dy, dz, id) result(res)
    !$omp declare target
    real(kind=dp) :: res
    real(kind=dp), value :: x, y, z
    real(kind=dp), value :: dx, dy, dz
    integer, intent(in) :: id
!
    real(kind=dp) :: pmax, r, k
    real(kind=dp) :: nx, ny, nz, ldx, ldy, ldz, rdx, rdy, rdz
    real(kind=dp) :: c(0:9), final_c
    integer :: i, de
    de = id
!
    do while (de < 10)
       c(de) = 0.0d0
       pmax = 1.0d6

       i = intersect(x, y, z, dx, dy, dz, pmax)

       if (i == 0) then
          de = de + 1
          exit
       end if

       x = x + pmax * dx; y = y + pmax * dy; z = z + pmax * dz
       nx = x - objs(1,i); ny = y - objs(2,i); nz = z - objs(3,i)
       r = sqrt(nx * nx + ny * ny + nz * nz)
       nx = nx/r; ny = ny/r; nz = nz/r
       k = nx * dx + ny * dy + nz * dz
       rdx = dx - 2 * k * nx; rdy = dy - 2 * k * ny; rdz = dz - 2 * k * nz
       do i = 1, 3
          ldx = objs_shade(1,i) - x
          ldy = objs_shade(2,i) - y
          ldz = objs_shade(3,i) - z
          r = sqrt(ldx * ldx + ldy * ldy + ldz * ldz);
          ldx = ldx/r; ldy = ldy/r; ldz = ldz/r;
          if (intersect(x, y, z, ldx, ldy, ldz, r) /= 0) cycle
          r = ldx * nx + ldy * ny + ldz * nz
          if (r < 0.0d0) cycle
          c(de) = c(de) + r
          r = rdx * ldx + rdy * ldy + rdz * ldz
          if (r > 0.0d0) then
             c(de) = c(de) + 2.0d0 * r**15
          end if
       end do

       dx = rdx; dy = rdy; dz = rdz;
       de = de + 1
    end do

    final_c = 0.0d0

    do i = de - 1, 1, -1
       final_c = 0.5 * (final_c + c(i))
    end do

    if (de >= 0) then
       final_c = final_c + c(0)
    end if

    res = final_c
  end function shade

  subroutine calc_tile(size, ixstart, iystart, tilesize, tile)
    integer, intent(in) :: size, ixstart, iystart, tilesize
    integer(kind=is), intent(out) :: tile(size,*)
!
    real(kind=dp) :: dx, dy, dz, c, r, xx, yy
    integer :: ix, iy

    do iy=1, tilesize
       do ix=1, tilesize
          xx = (ixstart + ix - 1) / dble(size - 1)
          yy = 1.0d0 - (iystart + iy - 1) / dble(size - 1)
          dx = -0.847569d0 - xx * 1.30741d0 - yy * 1.19745d0
          dy = -1.98535d0  + xx * 2.11197d0 - yy * 0.741279d0
          dz = -2.72303d0                   + yy * 2.04606d0;
          r = sqrt(dx * dx + dy * dy + dz * dz)
          c = 100.d0 * shade(2.1d0, 1.3d0, 1.7d0, dx/r, dy/r, dz/r, 0);

          c = max(c, 0.0d0)
          c = min(c, 255.1d0)

          tile(ix, iy) = int(c, 1_is)
       end do
    end do
  end subroutine calc_tile

  ! do not change, used for computing reference solution
  subroutine calc_tile_ref(size, ixstart, iystart, tilesize, tile)
    integer, intent(in) :: size, ixstart, iystart, tilesize
    integer(kind=is), intent(out) :: tile(size,*)
  !
    real(kind=dp) :: dx, dy, dz, c, r, xx, yy
    integer :: ix, iy
    integer(kind=is) :: consec_tile(tilesize, tilesize)

    consec_tile(:,:) = tile(1:tilesize,1:tilesize)

    !$omp target teams distribute parallel do simd collapse(2) &
    !$omp&       firstprivate(tilesize, size, ixstart, iystart) &
    !$omp&       private(r, c, xx, yy, dx, dy, dz) &
    !$omp&       map(tofrom:consec_tile) num_teams(1) thread_limit(1)
    do iy=1, tilesize
       do ix=1, tilesize
          xx = (ixstart + ix - 1) / dble(size - 1)
          yy = 1.0d0 - (iystart + iy - 1) / dble(size - 1)
          dx = -0.847569d0 - xx * 1.30741d0 - yy * 1.19745d0
          dy = -1.98535d0  + xx * 2.11197d0 - yy * 0.741279d0
          dz = -2.72303d0                   + yy * 2.04606d0;
          r = sqrt(dx * dx + dy * dy + dz * dz)
          c = 100.d0 * shade(2.1d0, 1.3d0, 1.7d0, dx/r, dy/r, dz/r, 0);

          c = max(c, 0.0d0)
          c = min(c, 255.1d0)

          consec_tile(ix, iy) = c ! int(c, 1_is)
       end do
    end do

    tile(1:tilesize,1:tilesize) = consec_tile(:,:)
  end subroutine

  ! do not change, used for computing reference solution
  function compute_difference(picture, size) result(res)
    real(kind=dp) :: res
    integer, intent(in) :: size
    integer(kind=is), intent(in) :: picture(size, size)
!
    integer(kind=is), allocatable :: ref(:, :)
    real(kind=dp) :: sos, d
    integer :: i, j
    ! integer :: iostat, fd

    allocate(ref(size, size))
    ref(:,:) = 0_is

    call calc_tile_ref(size, 0, 0, size, ref)

    sos = 0.0_dp
    do i = 1, size
       do j = i, size
          d = (picture(i, j) - ref(i, j)) / 256.0_dp
          sos = sos + d * d
       end do
    end do

    ! fd = 30
    ! open(unit=fd,            &
    !      file='ref.pnm',     &
    !      access='STREAM',    &
    !      form='FORMATTED',   &
    !      status='replace',   &
    !      action='write',     &
    !      iostat=iostat)

    ! if (iostat /= 0) then
    !   write(*,'(a)') 'ERROR: could not open file result.pnm.'
    !   stop 1
    ! end if

    ! write(fd, fmt='(''P5'',/,i0,1x,i0,/,i0)') size, size, 255

    ! do i = 1, size
    !    do j = 1, size
    !      write(fd, fmt='(a1)', advance="no") ref(j, i)
    !    end do
    ! end do

    ! close(fd)

    deallocate(ref)

    res = sqrt(sos)
  end function

end module mod_raytracef


program raytracef
  use mod_raytracef
  implicit none

  integer, parameter :: size = 6000
  integer, parameter :: tilesize = 1000

  integer :: xtiles, ytiles, xc, yc, tile_count
  integer(kind=is), allocatable :: picture(:,:)
  real(kind=dp) :: time_start, duration, difference
  integer :: fd, iostat

  allocate(picture(size, size))

  xtiles = size / tilesize
  ytiles = size / tilesize
  tile_count = 0

  time_start = timing()

  do yc = 0, ytiles - 1
     do xc = 0, xtiles - 1
        call calc_tile(size, &
                       xc * tilesize, yc * tilesize, &
                       tilesize, &
                       picture(xc * tilesize + 1, yc * tilesize + 1))
        tile_count = tile_count + 1
     end do
  end do

  duration = timing() - time_start

  difference = compute_difference(picture, size)

  write(*,'(a,i0,a,i0,a,f9.5,a,f9.5,a,i0,a,e12.7)') &
    'Size: ', size, ', Tile Size: ', tilesize, ', Time: ', duration,  &
    ' s, Performance: ', size * size / duration / 1.d6,               &
    ' MPixels/s, Tiles: ', tile_count, ', Difference: ', difference

  fd = 20
  open(unit=fd,            &
       file='result.pnm',  &
       access='STREAM',    &
       form='FORMATTED',   &
       status='replace',   &
       action='write',     &
       iostat=iostat)

  if (iostat /= 0) then
    write(*,'(a)') 'ERROR: could not open file result.pnm.'
    stop 1
  end if

  write(fd, fmt='(''P5'',/,i0,1x,i0,/,i0)') size, size, 255

  do yc = 1, size
     do xc = 1, size
       write(fd, fmt='(a1)', advance="no") picture(xc, yc)
     end do
  end do

  close(fd)

  deallocate(picture)

end program raytracef
