! Copyright (C) HPC WORLD (Prometech Software, Inc. and GDEP Solutions, Inc.) All rights reserved. ! ! Permission is hereby granted, free of charge, to any person obtaining a ! copy of this software and associated documentation files (the "Software"), ! to deal in the Software without restriction, including without limitation ! the rights to use, copy, modify, merge, publish, distribute, sublicense, ! and/or sell copies of the Software, and to permit persons to whom the ! Software is furnished to do so, subject to the following conditions: ! ! The above copyright notice and this permission notice shall be included in ! all copies or substantial portions of the Software. ! ! THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ! IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ! FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ! THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ! LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ! FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER ! DEALINGS IN THE SOFTWARE. ! module precision_m ! integer, parameter, public :: fp_kind = kind(0.0) ! Single precision integer, parameter, public :: fp_kind = selected_real_kind(15,305) ! Double precision end module precision_m module measure_time integer :: clock_start,clock_end,clock_rate character*40 time_tag contains integer function init_clock call system_clock(count_rate=clock_rate) ! Find the rate clock_start = 0; clock_end = 0 end function integer function start_clock call system_clock(count=clock_start) ! Start timing end function integer function end_clock call system_clock(count=clock_end) ! End timing end function integer function print_time(time_tag) character*40 time_tag print '(/1x,a,2x,f10.3,a/)', time_tag, & REAL(clock_end-clock_start,kind(0d0))/REAL(clock_rate,kind(0d0)), " (sec)" end function end module measure_time module CG_routines use precision_m implicit none integer :: nprint integer, parameter :: idbg = 2 integer, parameter :: max_iter = 10000 real(fp_kind), parameter :: tol = 1.0d-12 integer, parameter :: index_base = 1 ! CSR format integer :: nz ! nonzero element of A (m x n ; n = m) integer, allocatable, dimension(:) :: csrRowPtrA ! m + 1 integer, allocatable, dimension(:) :: csrColIndA ! real(fp_kind), allocatable, dimension(:) :: csrValA ! real(fp_kind), allocatable, dimension(:) :: X real(fp_kind), allocatable, dimension(:) :: rhs contains subroutine tridaiag_matgen(n) !------------------------------------------------------------- ! genTridiag: generate a random tridiagonal symmetric matrix ! A matrix : sparse CSR storage format ! N x N symmetric positive definite matrix !------------------------------------------------------------- integer :: n ! size of matrix A integer :: i, js real(fp_kind) :: rnd, rand external rand nz = (n-2)*3 + 4 ! non-zero elements allocate( csrRowPtrA(n+1) ) allocate( csrColIndA(nz) ) allocate( csrValA(nz) ) ! matrix A call random_number(rnd) csrRowPtrA(1) = 1 ! i = 1 csrColIndA(1) = 1 ; csrValA(1) = rnd + 10.0 csrColIndA(2) = 2 ; csrValA(2) = rnd do i = 2, n if ( i > 2 ) then csrRowPtrA(i) = csrRowPtrA(i-1) + 3 else csrRowPtrA(2) = 3 end if js = (i-2)*3 + 2 + index_base csrColIndA(js) = i - 1 csrColIndA(js+1) = i csrColIndA(js+2) = i + 1 call random_number(rnd) csrValA(js) = csrValA(js-1) csrValA(js+1) = rnd + 10.0 if ( i < n ) then call random_number(rnd) csrValA(js+2) = rnd end if end do csrRowPtrA(n+1) = nz + csrRowPtrA(1) ! allocate X, rhs allocate( X(n) ) allocate( rhs(n) ) ! Initilize rhs rhs = 1.0 ! check the number of non-zero element print '(1x, a, 1x, i10, 1x , a, i10)', " Matrix size =", n, " x ", n print *, " Check a number of non-zero element" print *, " nz =", nz end subroutine tridaiag_matgen subroutine laplace_2nd_matgen(n) !------------------------------------------------------------- ! genTridiag: generate a tridiagonal symmetric matrix ! A matrix : sparse CSR storage format ! a second order, regular, Laplacian operator !------------------------------------------------------------- integer :: n ! size of matrix A integer :: n_route integer :: i, idx integer :: ix, iy if ( n /= sqrt(real(n))*sqrt(real(n)) ) stop "N number" nz = 5*N-4*int(sqrt(real(n))) ! non-zero elements allocate( csrRowPtrA(1:n+1) ) allocate( csrColIndA(1:nz) ) allocate( csrValA(1:nz) ) ! allocate X, rhs allocate( rhs(1:n) ) ; rhs = 0.0 ! Initialize RHS n_route = sqrt(real(n)) ! laplace dimension idx = 1 ! matrix A do i = 1, n ix = mod(i-1,n_route) iy = (i-1)/n_route !print *,"ix,iy=",ix,iy csrRowPtrA(i) = idx ! up if ( iy > 0 ) then csrValA(idx) = 1.0 csrColIndA(idx) = i - n_route idx = idx + 1 else rhs(i) = rhs(i) - 1.0 end if ! left if ( ix > 0 ) then csrValA(idx) = 1.0 csrColIndA(idx) = i - 1 idx = idx + 1 else rhs(i) = rhs(i) - 0.0 end if ! center csrValA(idx) = -4.0 csrColIndA(idx) = i idx = idx + 1 ! right if ( ix < n_route-1 ) then csrValA(idx) = 1.0 csrColIndA(idx) = i + 1 idx = idx + 1 else rhs(i) = rhs(i) - 0.0 end if ! down if ( iy < n_route-1 ) then csrValA(idx) = 1.0 csrColIndA(idx) = i + n_route idx = idx + 1 else rhs(i) = rhs(i) - 0.0 end if end do csrRowPtrA(n+1) = idx ! check the number of non-zero element print '(1x, a, 1x, i10, 1x , a, i10)', " Matrix size =", n, " x ", n print *, " Check a number of non-zero element" print *, " idx =", idx, "nz =", nz ! allocate X, rhs allocate( X(n) ) end subroutine laplace_2nd_matgen !============================================== subroutine CG(n) !---------------------------------------------- ! Conjugate gradient without preconditioning !---------------------------------------------- use precision_m use measure_time use cublas_v2 use cusparse implicit none integer :: n ! size of matrix type(cusparseHandle) :: cusparse_H type(cublasHandle) :: cublas_H ! type(cusparseMatDescr) :: descrA ! Use Newer cuSPARSE API type(cusparseSpMatDescr) :: descrA type(cusparseDnVecDescr) :: vecP, vecQ, vecR, vecX integer, allocatable, dimension(:) :: bufferMV integer*8 :: bufferSizeMV integer :: cusparse_status integer :: cublas_status integer :: status !, ierr_code real(fp_kind) :: r0, r1, alpha, alpham1, beta real(fp_kind) :: dot, nalpha real(fp_kind) :: error, rsum, diff real(fp_kind), allocatable, dimension(:) :: r real(fp_kind), allocatable, dimension(:) :: p real(fp_kind), allocatable, dimension(:) :: q real(fp_kind),parameter:: one = 1.0, zero = 0.0, onem1 = -1.0 integer :: i, j, k allocate( r(n), p(n) ) allocate( q(n) ) ! Conjugate gradient without preconditioning print *,"-------------------------------------------------------------" print *,"Conjugate gradient without preconditioning" print *,"-------------------------------------------------------------" ! Initilize X, rhs X = 0.0 ! or Initial guess X is set ! Copy rhs to r (Initial value r for rhs which is "b" vector) do i = 1, n r(i) = rhs(i) ! set r = rhs, where Ax = rhs (Ax = b) end do ! create handle for cuBLAS cublas_status = cublasCreate(cublas_H) ! create handle for CUSPARSE and matrix A descriptor cusparse_status = cusparseCreate(cusparse_H) if (cusparse_status /= CUSPARSE_STATUS_SUCCESS) write(*,*) 'cusparseCreate error: ', status !! OLD API !! status = cusparseCreateMatDescr(descrA) ! matrix A properties !! status = cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL) !! status = cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ONE) ! base = 1 !$acc data copy(r, X) copyin(csrValA, csrRowPtrA, csrColIndA) & !$acc create(p, q) !$acc host_data use_device(csrValA, csrRowPtrA, csrColIndA, p, q, X, r) ! Use Newer API cusparse_status = cusparseCreateDnVec(vecP, n, p, CUDA_R_64F) cusparse_status = cusparseCreateDnVec(vecQ, n, q, CUDA_R_64F) cusparse_status = cusparseCreateDnVec(vecR, n, r, CUDA_R_64F) cusparse_status = cusparseCreateDnVec(vecX, n, X, CUDA_R_64F) cusparse_status = cusparseCreateCsr(descrA, n, n, nz, csrRowPtrA, csrColIndA, csrValA, & CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ONE, CUDA_R_64F) ! base 1 alpha = 1.0 beta = 0.0 k = 0 r0 = 0.0 status = init_clock status = start_clock cusparse_status = cusparseSpMV_bufferSize( & cusparse_H, CUSPARSE_OPERATION_NON_TRANSPOSE, & one, descrA, vecX, zero, vecQ, & CUDA_R_64F, CUSPARSE_MV_ALG_DEFAULT, bufferSizeMV) allocate(bufferMV(bufferSizeMV/4)) !$acc data create(bufferMV(1:bufferSizeMV/4)) !$acc host_data use_device(bufferMV) ! === Initial guess x0 and residual r1 = r - Ax0 ! !$acc host_data use_device(csrValA, csrRowPtrA, csrColIndA, q, X, r) !! OLD API !! cusparse_status = cusparseDcsrmv(cusparse_H,CUSPARSE_OPERATION_NON_TRANSPOSE, & !! n, n, nz, one, descrA, csrValA, csrRowPtrA, csrColIndA, X, zero, q) ! q = A x0 ! Use Newer API cusparse_status = cusparseSpMV(cusparse_H,CUSPARSE_OPERATION_NON_TRANSPOSE, & one, descrA, vecX, zero, vecQ, & CUDA_R_64F, CUSPARSE_MV_ALG_DEFAULT, bufferMV) ! q = A x0 cublas_status = cublasDaxpy(cublas_H, n, onem1, q, 1, r, 1) ! r = r - q cublas_status = cublasDaxpy(cublas_H, n, onem1, q, 1, r, 1) ! r = r - q cublas_status = cublasDdot (cublas_H, n, r, 1, r, 1, r1) ! r1= (r, r) do while ( r1 > tol*tol .and. k < max_iter ) k = k + 1 if ( k == 1 ) then !$acc host_data use_device(r,p) cublas_status = cublasDcopy(cublas_H, n, r, 1, p, 1) ! p = r !$acc end host_data else beta = r1/r0; ! beta = r(i) / r(i-1) ! p = r + beta * p !$acc host_data use_device(r,p) cublas_status = cublasDscal(cublas_H, n, beta, p, 1) ! p = beta*p cublas_status = cublasDaxpy(cublas_H, n, one, r, 1, p, 1) ! p = r + p !$acc end host_data end if ! Compute q = A p !! Old API !! cusparse_status = cusparseDcsrmv(cusparse_H,CUSPARSE_OPERATION_NON_TRANSPOSE, & !! n, n, nz, one, descrA, csrValA, csrRowPtrA, csrColIndA, p, zero, q) ! Use Newer API cusparse_status = cusparseSpMV(cusparse_H,CUSPARSE_OPERATION_NON_TRANSPOSE, & one, descrA, vecP, zero, vecQ, & CUDA_R_64F, CUSPARSE_MV_ALG_DEFAULT, bufferMV) ! q = A x0 cublas_status = cublasDdot(cublas_H, n, p, 1, q, 1, dot) ! dot = p{T} * q alpha = r1/dot ! x = x + alpha*p !$acc host_data use_device(p, X) cublas_status = cublasDaxpy(cublas_H, n, alpha, p, 1, X, 1) !$acc end host_data nalpha = -alpha ! r = r - alpha*q !$acc host_data use_device(q, r) cublas_status = cublasDaxpy(cublas_H, n, nalpha, q, 1, r, 1) !$acc end host_data r0 = r1 ! r0 = r1 !$acc host_data use_device(r) cublas_status = cublasDdot(cublas_H, n, r, 1, r, 1, r1) ! r1 = (r, r) !$acc end host_data if (idbg == 1 ) print *, "iteration=", k, " residual =", sqrt(r1) end do !$acc end host_data !$acc end data !$acc end host_data !$acc end data status = end_clock print *, "iteration=", k, " residual =", sqrt(r1) if ( sqrt(r1) < tol ) then print *, "CG without preconditioning test is passed" else print *, "CG without preconditioning test is failed" end if time_tag = "Elapased time CG without preconditioning : " status = print_time(time_tag) ! print *, "X=",X ! check result error = 0.0 do i = 1, n rsum = 0.0 do j = csrRowPtrA(i), csrRowPtrA(i+1)-1 rsum = rsum + csrValA(j) * X(csrColIndA(j)) end do diff = abs(rsum - rhs(i)) if (diff > error) error = diff end do print *,"error =",error ! output result if (idbg == 2) then nprint = n if ( n > 10 ) nprint = 10 do i = 1, nprint print '(1x, a,i7,a,e18.12)', "x(", i, ")=", X(i) end do print *,"" do i = n-nprint, n print '(1x, a,i7,a,e18.12)', "x(", i, ")=", X(i) end do end if status = cublasDestroy(cublas_H) status = cusparseDestroy(cusparse_H) deallocate( q, r, p ) end subroutine CG !====================================================== End module CG_routines program main use precision_m use CG_routines implicit none integer :: num_mat integer :: i, j !------------------------------------------------------------- ! A matrix : sparse CSR type matrix ! N x N symmetric positive definite matrix (num_mat) !------------------------------------------------------------- num_mat = 16 ! num_mat = 16384 num_mat = 1048576 ! num_mat = 4000000 print '(/1x,a,d15.10,2x,a,i8/)'," tol = ", tol, "max_iteration =", max_iter ! Select matrix data ! call tridaiag_matgen (num_mat) call laplace_2nd_matgen (num_mat) ! Print CSR Matrix data ! print *, "csrValA", csrValA ! print *, "csrRowPtrA",csrRowPtrA ! print *, "csrColIndA", csrColIndA ! Conjugate gradient without preconditioning call CG (num_mat) ! Preconditioned Conjugate Gradient using Incomplete-LU ! factorization with 0 fill-in and no pivoting !! call ICLU_CG (num_mat) ! Preconditioned Conjugate Gradient using Incomplete-Cholesky ! factorization !! call ICCL_CG (num_mat) end program main