! Copyright (C) HPC WORLD (Prometech Software, Inc. and GDEP Solutions, Inc.) All rights reserved.
!
! Permission is hereby granted, free of charge, to any person obtaining a
! copy of this software and associated documentation files (the "Software"),
! to deal in the Software without restriction, including without limitation
! the rights to use, copy, modify, merge, publish, distribute, sublicense,
! and/or sell copies of the Software, and to permit persons to whom the
! Software is furnished to do so, subject to the following conditions:
!
! The above copyright notice and this permission notice shall be included in
! all copies or substantial portions of the Software.
!
! THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
! IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
! FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
! THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
! LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
! FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
! DEALINGS IN THE SOFTWARE.
!
module precision_m
!   integer, parameter, public :: fp_kind = kind(0.0)                  ! Single precision
    integer, parameter, public :: fp_kind = selected_real_kind(15,305) ! Double precision
end module precision_m
module measure_time
    integer :: clock_start,clock_end,clock_rate
    character*40 time_tag
    contains
    integer function init_clock
      call system_clock(count_rate=clock_rate) ! Find the rate
      clock_start = 0; clock_end = 0
    end function
    integer function start_clock
      call system_clock(count=clock_start) ! Start timing
    end function
    integer function end_clock
      call system_clock(count=clock_end)   ! End timing
    end function
    integer function print_time(time_tag)
      character*40 time_tag
      print '(/1x,a,2x,f10.3,a/)', time_tag, &
        REAL(clock_end-clock_start,kind(0d0))/REAL(clock_rate,kind(0d0)), " (sec)"
    end function
end module measure_time

module CG_routines
    use precision_m
    implicit none
    integer                  :: nprint
    integer, parameter       :: idbg = 2
    integer, parameter       :: max_iter = 10000
    real(fp_kind), parameter :: tol = 1.0d-12
    integer, parameter       :: index_base = 1
    ! CSR format 
    integer  :: nz                           ! nonzero element of A (m x n ; n = m)
    integer, allocatable, dimension(:)       :: csrRowPtrA  ! m + 1
    integer, allocatable, dimension(:)       :: csrColIndA  !
    real(fp_kind), allocatable, dimension(:) :: csrValA
!
    real(fp_kind), allocatable, dimension(:) :: X
    real(fp_kind), allocatable, dimension(:) :: rhs

  contains
    subroutine tridaiag_matgen(n)
!-------------------------------------------------------------
!   genTridiag: generate a random tridiagonal symmetric matrix
!   A matrix : sparse CSR storage format
!   N x N  symmetric positive definite matrix
!-------------------------------------------------------------
    integer       :: n ! size of matrix A
    integer       :: i, js
    real(fp_kind) :: rnd, rand
    external rand


    nz = (n-2)*3 + 4    ! non-zero elements
    allocate( csrRowPtrA(n+1) )
    allocate( csrColIndA(nz)  )
    allocate( csrValA(nz)     )

!   matrix A
    call random_number(rnd)
    csrRowPtrA(1) = 1  !  i = 1
    csrColIndA(1) = 1 ; csrValA(1) = rnd + 10.0
    csrColIndA(2) = 2 ; csrValA(2) = rnd
    do i = 2, n
      if ( i > 2 ) then
        csrRowPtrA(i) = csrRowPtrA(i-1) + 3
      else
        csrRowPtrA(2) = 3
      end if

      js = (i-2)*3 + 2 + index_base
      csrColIndA(js)   = i - 1
      csrColIndA(js+1) = i
      csrColIndA(js+2) = i + 1

      call random_number(rnd)
      csrValA(js) =  csrValA(js-1)
      csrValA(js+1) = rnd + 10.0

      if ( i < n )  then
        call random_number(rnd)
        csrValA(js+2) = rnd
      end if
    end do
    csrRowPtrA(n+1) = nz + csrRowPtrA(1)

! allocate X, rhs
    allocate( X(n)   )
    allocate( rhs(n) )

! Initilize rhs
    rhs = 1.0

!   check the number of non-zero element
    print '(1x, a, 1x, i10, 1x , a, i10)', " Matrix size =", n, " x ", n
    print *, " Check a number of non-zero element"
    print *, " nz =", nz

    end subroutine tridaiag_matgen

    subroutine laplace_2nd_matgen(n)
!-------------------------------------------------------------
!   genTridiag: generate a tridiagonal symmetric matrix
!   A matrix : sparse CSR storage format
!   a second order, regular, Laplacian operator
!-------------------------------------------------------------
    integer       :: n ! size of matrix A
    integer       :: n_route
    integer       :: i, idx
    integer       :: ix, iy

    if ( n /= sqrt(real(n))*sqrt(real(n)) )  stop "N number"

    nz = 5*N-4*int(sqrt(real(n)))   ! non-zero elements
    allocate( csrRowPtrA(1:n+1) )
    allocate( csrColIndA(1:nz)  )
    allocate( csrValA(1:nz)     )

! allocate X, rhs
    allocate( rhs(1:n) ) ; rhs = 0.0 ! Initialize RHS
    n_route = sqrt(real(n))          ! laplace dimension

    idx = 1
!   matrix A
    do i = 1, n
      ix = mod(i-1,n_route)
      iy = (i-1)/n_route
      !print *,"ix,iy=",ix,iy

      csrRowPtrA(i) = idx

      ! up
      if ( iy > 0 ) then
        csrValA(idx) = 1.0
        csrColIndA(idx) = i - n_route
        idx = idx + 1
      else
        rhs(i) = rhs(i) - 1.0
      end if

      ! left
      if ( ix > 0 ) then
        csrValA(idx) = 1.0
        csrColIndA(idx) = i - 1
        idx = idx + 1
      else
        rhs(i) = rhs(i) - 0.0
      end if

      ! center
        csrValA(idx) = -4.0
        csrColIndA(idx) = i
        idx = idx + 1

      ! right
      if ( ix < n_route-1 ) then
        csrValA(idx) = 1.0
        csrColIndA(idx) = i + 1
        idx = idx + 1
      else
        rhs(i) = rhs(i) - 0.0
      end if

      ! down
      if ( iy < n_route-1 ) then
        csrValA(idx) = 1.0
        csrColIndA(idx) = i + n_route
        idx = idx + 1
      else
        rhs(i) = rhs(i) - 0.0
      end if

    end do
    csrRowPtrA(n+1) =  idx

!   check the number of non-zero element
    print '(1x, a, 1x, i10, 1x , a, i10)', " Matrix size =", n, " x ", n
    print *, " Check a number of non-zero element"
    print *, " idx =", idx, "nz =", nz

! allocate X, rhs
    allocate( X(n) )

    end subroutine laplace_2nd_matgen
!==============================================

    subroutine CG(n)
!----------------------------------------------
!   Conjugate gradient without preconditioning
!----------------------------------------------
    use precision_m
    use measure_time

    use cublas_v2
    use cusparse
    implicit none

    integer                  :: n ! size of matrix

    type(cusparseHandle)     :: cusparse_H
    type(cublasHandle)       :: cublas_H
!    type(cusparseMatDescr)   :: descrA
! Use Newer cuSPARSE API
    type(cusparseSpMatDescr)   :: descrA
    type(cusparseDnVecDescr)   :: vecP, vecQ, vecR, vecX
    integer, allocatable, dimension(:) :: bufferMV
    integer*8 :: bufferSizeMV

    integer                  :: cusparse_status
    integer                  :: cublas_status
    integer                  :: status !, ierr_code
    real(fp_kind)            :: r0, r1, alpha, alpham1, beta
    real(fp_kind)            :: dot, nalpha
    real(fp_kind)            :: error, rsum, diff
    real(fp_kind), allocatable, dimension(:) :: r
    real(fp_kind), allocatable, dimension(:) :: p
    real(fp_kind), allocatable, dimension(:) :: q
    real(fp_kind),parameter::  one = 1.0, zero = 0.0, onem1 = -1.0

    integer                :: i, j, k

    allocate( r(n), p(n) )
    allocate( q(n) )

! Conjugate gradient without preconditioning

    print *,"-------------------------------------------------------------"
    print *,"Conjugate gradient without preconditioning"
    print *,"-------------------------------------------------------------"

! Initilize X, rhs
    X = 0.0  ! or Initial guess X is set

! Copy rhs to r (Initial value r for rhs which is "b" vector)
    do  i = 1, n
       r(i) = rhs(i)  ! set r = rhs, where Ax = rhs (Ax = b)
    end do

! create handle for cuBLAS
    cublas_status = cublasCreate(cublas_H)

! create handle for CUSPARSE and matrix A descriptor
    cusparse_status = cusparseCreate(cusparse_H)
    if (cusparse_status /= CUSPARSE_STATUS_SUCCESS) write(*,*) 'cusparseCreate error: ', status
!! OLD API
!!    status = cusparseCreateMatDescr(descrA)

! matrix A properties
!!    status = cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL)
!!    status = cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ONE)  ! base = 1

!$acc data copy(r, X) copyin(csrValA, csrRowPtrA, csrColIndA) &
!$acc create(p, q)

   !$acc host_data use_device(csrValA, csrRowPtrA, csrColIndA, p, q, X, r)

! Use Newer API
    cusparse_status = cusparseCreateDnVec(vecP, n, p, CUDA_R_64F)
    cusparse_status = cusparseCreateDnVec(vecQ, n, q, CUDA_R_64F)
    cusparse_status = cusparseCreateDnVec(vecR, n, r, CUDA_R_64F)
    cusparse_status = cusparseCreateDnVec(vecX, n, X, CUDA_R_64F)

    cusparse_status = cusparseCreateCsr(descrA, n, n, nz, csrRowPtrA, csrColIndA, csrValA, &
        CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ONE, CUDA_R_64F) ! base 1

    alpha   = 1.0
    beta    = 0.0

    k = 0
    r0 = 0.0

       status = init_clock
       status = start_clock

    cusparse_status =  cusparseSpMV_bufferSize( &
            cusparse_H, CUSPARSE_OPERATION_NON_TRANSPOSE, &
            one, descrA, vecX, zero, vecQ, &
            CUDA_R_64F, CUSPARSE_MV_ALG_DEFAULT, bufferSizeMV)

    allocate(bufferMV(bufferSizeMV/4))

    !$acc data create(bufferMV(1:bufferSizeMV/4))
    !$acc host_data use_device(bufferMV)

   ! === Initial guess x0 and residual r1 = r - Ax0
!   !$acc host_data use_device(csrValA, csrRowPtrA, csrColIndA, q, X, r)
!! OLD API
!!    cusparse_status = cusparseDcsrmv(cusparse_H,CUSPARSE_OPERATION_NON_TRANSPOSE, &
!!                      n, n, nz, one, descrA, csrValA, csrRowPtrA, csrColIndA, X, zero, q)  ! q = A x0
! Use Newer API
    cusparse_status = cusparseSpMV(cusparse_H,CUSPARSE_OPERATION_NON_TRANSPOSE, &
                      one, descrA, vecX, zero, vecQ, &
                      CUDA_R_64F, CUSPARSE_MV_ALG_DEFAULT, bufferMV)  ! q = A x0

    cublas_status = cublasDaxpy(cublas_H, n, onem1, q, 1, r, 1)                            ! r = r - q
    cublas_status = cublasDaxpy(cublas_H, n, onem1, q, 1, r, 1)                            ! r = r - q
    cublas_status = cublasDdot (cublas_H, n, r, 1, r, 1, r1)                               ! r1= (r, r)

    do while ( r1 > tol*tol .and. k < max_iter )
      k = k + 1

      if ( k == 1 ) then

      !$acc host_data use_device(r,p)
        cublas_status = cublasDcopy(cublas_H, n, r, 1, p, 1)              ! p = r
      !$acc end host_data

      else

        beta = r1/r0;                                      ! beta = r(i) / r(i-1)
                                                           ! p = r + beta * p
      !$acc host_data use_device(r,p)
        cublas_status = cublasDscal(cublas_H, n, beta, p, 1)       ! p = beta*p
        cublas_status = cublasDaxpy(cublas_H, n, one, r, 1, p, 1)  ! p = r + p
      !$acc end host_data

      end if

      ! Compute q = A p
!! Old API
!!        cusparse_status = cusparseDcsrmv(cusparse_H,CUSPARSE_OPERATION_NON_TRANSPOSE, &
!!                          n, n, nz, one, descrA, csrValA, csrRowPtrA, csrColIndA, p, zero, q)
! Use Newer API
      cusparse_status = cusparseSpMV(cusparse_H,CUSPARSE_OPERATION_NON_TRANSPOSE, &
                      one, descrA, vecP, zero, vecQ, &
                      CUDA_R_64F, CUSPARSE_MV_ALG_DEFAULT, bufferMV)  ! q = A x0
        cublas_status = cublasDdot(cublas_H, n, p, 1, q, 1, dot)   ! dot = p{T} * q

        alpha = r1/dot

      ! x = x + alpha*p
      !$acc host_data use_device(p, X)
        cublas_status = cublasDaxpy(cublas_H, n, alpha, p, 1, X, 1)
      !$acc end host_data

        nalpha = -alpha
      ! r = r - alpha*q
      !$acc host_data use_device(q, r)
        cublas_status = cublasDaxpy(cublas_H, n, nalpha, q, 1, r, 1)
      !$acc end host_data

        r0 = r1                                                   ! r0 = r1
      !$acc host_data use_device(r)
        cublas_status = cublasDdot(cublas_H, n, r, 1, r, 1, r1)   ! r1 = (r, r)
      !$acc end host_data


        if (idbg == 1 ) print *, "iteration=", k,  " residual =", sqrt(r1)
    end do
!$acc end host_data
!$acc end data

!$acc end host_data
!$acc end data

    status = end_clock

    print *, "iteration=", k,  " residual =", sqrt(r1)
    if ( sqrt(r1) < tol ) then
      print *, "CG without preconditioning test is passed"
    else
      print *, "CG without preconditioning test is failed"
    end if

        time_tag = "Elapased time CG without preconditioning : "
        status = print_time(time_tag)

! print *, "X=",X
! check result
    error = 0.0
    do i = 1, n
      rsum = 0.0
      do  j =  csrRowPtrA(i), csrRowPtrA(i+1)-1
         rsum = rsum + csrValA(j) * X(csrColIndA(j))
      end do
      diff = abs(rsum - rhs(i))
      if (diff > error) error = diff
    end do
    print *,"error =",error

! output result
    if (idbg == 2) then
      nprint = n
      if ( n > 10 ) nprint = 10
      do i = 1, nprint
         print '(1x, a,i7,a,e18.12)', "x(", i, ")=", X(i)
      end do
      print *,""
      do i = n-nprint, n
         print '(1x, a,i7,a,e18.12)', "x(", i, ")=", X(i)
      end do
    end if

    status = cublasDestroy(cublas_H)
    status = cusparseDestroy(cusparse_H)
    deallocate( q, r,  p )

    end subroutine CG
!======================================================

End module CG_routines

program main
  use precision_m
  use CG_routines
  implicit none

  integer                :: num_mat
  integer                :: i, j

!-------------------------------------------------------------
!   A matrix : sparse CSR type matrix
!   N x N  symmetric positive definite matrix (num_mat)
!-------------------------------------------------------------

  num_mat = 16
! num_mat = 16384
  num_mat = 1048576
! num_mat = 4000000

  print '(/1x,a,d15.10,2x,a,i8/)'," tol = ", tol, "max_iteration =", max_iter

! Select matrix data

! call tridaiag_matgen    (num_mat)
  call laplace_2nd_matgen (num_mat)

! Print CSR Matrix data
!    print *, "csrValA", csrValA
!    print *, "csrRowPtrA",csrRowPtrA
!    print *, "csrColIndA", csrColIndA

! Conjugate gradient without preconditioning

  call CG (num_mat)

! Preconditioned Conjugate Gradient using Incomplete-LU
! factorization with 0 fill-in and no pivoting

!!  call ICLU_CG (num_mat)

! Preconditioned Conjugate Gradient using Incomplete-Cholesky
! factorization

!!  call ICCL_CG (num_mat)

end program main