LCOV - code coverage report
Current view: top level - cicecore/cicedyn/infrastructure/comm/mpi - ice_reprosum.F90 (source / functions) Hit Total Coverage
Test: 231018-211459:8916b9ff2c:1:quick Lines: 0 427 0.00 %
Date: 2023-10-18 15:30:36 Functions: 0 7 0.00 %

          Line data    Source code
       1             : 
       2             : !> Reproducible sum method from P. Worley
       3             : 
       4             : MODULE ice_reprosum
       5             : 
       6             : !-----------------------------------------------------------------------
       7             : !
       8             : ! Purpose:
       9             : !> Compute reproducible global sums of a set of arrays across an MPI
      10             : !> subcommunicator
      11             : !
      12             : ! Methods:
      13             : !> Compute using either or both a scalable, reproducible algorithm and a
      14             : !> scalable, nonreproducible algorithm:
      15             : !> * Reproducible (scalable):
      16             : !>    Convert to fixed point (integer vector representation) to enable
      17             : !>    reproducibility when using MPI_Allreduce
      18             : !> * Alternative usually reproducible (scalable):
      19             : !>    Use parallel double-double algorithm due to Helen He and
      20             : !>    Chris Ding, based on David Bailey's/Don Knuth's DDPDD algorithm
      21             : !> * Nonreproducible (scalable):
      22             : !>    Floating point and MPI_Allreduce based.
      23             : !> If computing both reproducible and nonreproducible sums, compare
      24             : !> these and report relative difference (if absolute difference
      25             : !> less than sum) or absolute difference back to calling routine.
      26             : !
      27             : ! Author: P. Worley (based on suggestions from J. White for fixed
      28             : !                    point algorithm and on He/Ding paper for ddpdd
      29             : !                    algorithm)
      30             : !
      31             : ! Modified by T.Craig for CICE, March 2019 based on the public version in
      32             : ! Oasis3-MCT_4.0.
      33             : !
      34             : !-----------------------------------------------------------------------
      35             : 
      36             : !-----------------------------------------------------------------------
      37             : !- use statements ------------------------------------------------------
      38             : !-----------------------------------------------------------------------
      39             : #ifndef SERIAL_REMOVE_MPI
      40             : use mpi   ! MPI Fortran module
      41             : #endif
      42             : #if defined (NO_I8)
      43             :    ! Workaround for when shr_kind_i8 is not supported.
      44             :    use ice_kinds_mod, only: r8 => dbl_kind, i8 => int_kind
      45             : #else
      46             :    use ice_kinds_mod, only: r8 => dbl_kind, i8 => int8_kind
      47             : #endif
      48             :    use ice_kinds_mod, only: char_len_long
      49             :    use ice_fileunits, only: nu_diag
      50             :    use ice_exit,  only: abort_ice
      51             : 
      52             : !  internal timers not yet implemented, need to revisit if needed
      53             : !   use ice_mpi,   only: xicex_mpi_barrier
      54             : !   use ice_timer, only: xicex_timer_start, xicex_timer_stop
      55             : 
      56             : !-----------------------------------------------------------------------
      57             : !- module boilerplate --------------------------------------------------
      58             : !-----------------------------------------------------------------------
      59             :    implicit none
      60             :    private
      61             : 
      62             : !-----------------------------------------------------------------------
      63             : ! Public interfaces ----------------------------------------------------
      64             : !-----------------------------------------------------------------------
      65             :    public :: &
      66             :       ice_reprosum_setopts,        &! set runtime options   ! LCOV_EXCL_LINE
      67             :       ice_reprosum_calc,           &! calculate distributed sum   ! LCOV_EXCL_LINE
      68             :       ice_reprosum_tolExceeded      ! utility function to check relative
      69             :                                       !  differences against the tolerance
      70             : 
      71             : !-----------------------------------------------------------------------
      72             : ! Public data ----------------------------------------------------------
      73             : !-----------------------------------------------------------------------
      74             :    logical, public     :: ice_reprosum_recompute = .false.
      75             : 
      76             :    real(r8), public    :: ice_reprosum_reldiffmax = -1.0_r8
      77             : 
      78             : !-----------------------------------------------------------------------
      79             : ! Private interfaces ---------------------------------------------------
      80             : !-----------------------------------------------------------------------
      81             :    private :: &
      82             :       ddpdd,           &! double-double sum routine   ! LCOV_EXCL_LINE
      83             :       split_indices     ! split indices among OMP threads
      84             : 
      85             : !-----------------------------------------------------------------------
      86             : ! Private data ---------------------------------------------------------
      87             : !-----------------------------------------------------------------------
      88             : 
      89             :    logical :: repro_sum_use_ddpdd = .false.
      90             :    logical :: detailed_timing = .false.
      91             :    character(len=char_len_long) :: tmpstr
      92             : 
      93             :    CONTAINS
      94             : 
      95             : !========================================================================
      96             : !-----------------------------------------------------------------------
      97             : ! Purpose:
      98             : !> Set runtime options
      99             : ! Author: P. Worley
     100             : !-----------------------------------------------------------------------
     101             : 
     102           0 :    subroutine ice_reprosum_setopts(repro_sum_use_ddpdd_in,    &
     103             :                                    repro_sum_rel_diff_max_in, &   ! LCOV_EXCL_LINE
     104             :                                    repro_sum_recompute_in,    &   ! LCOV_EXCL_LINE
     105             :                                    repro_sum_master,          &   ! LCOV_EXCL_LINE
     106             :                                    repro_sum_logunit          )
     107             : 
     108             : !------------------------------Arguments--------------------------------
     109             :       logical, intent(in), optional :: repro_sum_use_ddpdd_in
     110             :       !< Use DDPDD algorithm instead of fixed precision algorithm
     111             :       real(r8), intent(in), optional :: repro_sum_rel_diff_max_in
     112             :       !< maximum permissible difference between reproducible and
     113             :       !< nonreproducible sums
     114             :       logical, intent(in), optional  :: repro_sum_recompute_in
     115             :       !< recompute using different algorithm when difference between
     116             :       !< reproducible and nonreproducible sums is too great
     117             :       logical, intent(in), optional  :: repro_sum_master
     118             :       !< flag indicating whether this process should output
     119             :       !< log messages
     120             :       integer, intent(in), optional  :: repro_sum_logunit
     121             :       !< unit number for log messages
     122             : !---------------------------Local Workspace-----------------------------
     123             :       integer llogunit           ! unit number for log messages
     124             :       logical master             ! local master?
     125             :       logical,save :: firstcall = .true.  ! first call
     126             :       character(len=*),parameter :: subname = '(ice_reprosum_setopts)'
     127             : !-----------------------------------------------------------------------
     128             : 
     129           0 :       if ( present(repro_sum_master) ) then
     130           0 :          master = repro_sum_master
     131             :       else
     132           0 :          master = .false.
     133             :       endif
     134             : 
     135           0 :       if ( present(repro_sum_logunit) ) then
     136           0 :          llogunit = repro_sum_logunit
     137             :       else
     138           0 :          llogunit = nu_diag
     139             :       endif
     140             : 
     141           0 :       if (.not. firstcall) then
     142           0 :          write(tmpstr,*) subname//' ERROR: can only be called once'
     143           0 :          call abort_ice(tmpstr,file=__FILE__,line=__LINE__)
     144             :       endif
     145           0 :       firstcall = .false.
     146             : 
     147           0 :       if ( present(repro_sum_use_ddpdd_in) ) then
     148           0 :          repro_sum_use_ddpdd = repro_sum_use_ddpdd_in
     149             :       endif
     150           0 :       if ( present(repro_sum_rel_diff_max_in) ) then
     151           0 :          ice_reprosum_reldiffmax = repro_sum_rel_diff_max_in
     152             :       endif
     153           0 :       if ( present(repro_sum_recompute_in) ) then
     154           0 :          ice_reprosum_recompute = repro_sum_recompute_in
     155             :       endif
     156           0 :       if (master) then
     157           0 :          if ( repro_sum_use_ddpdd ) then
     158           0 :             write(llogunit,*) subname, &
     159             :               'Using double-double-based (scalable) usually reproducible ', &   ! LCOV_EXCL_LINE
     160           0 :               'distributed sum algorithm'
     161             :          else
     162           0 :             write(llogunit,*) subname, &
     163             :               'Using fixed-point-based (scalable) reproducible ', &   ! LCOV_EXCL_LINE
     164           0 :               'distributed sum algorithm'
     165             :          endif
     166             : 
     167           0 :          if (ice_reprosum_reldiffmax >= 0._r8) then
     168           0 :             write(llogunit,*) subname, &
     169             :               ' with a maximum relative error tolerance of ', &   ! LCOV_EXCL_LINE
     170           0 :               ice_reprosum_reldiffmax
     171           0 :             if (ice_reprosum_recompute) then
     172           0 :                write(llogunit,*) subname, &
     173             :                  'If tolerance exceeded, sum is recomputed using ', &   ! LCOV_EXCL_LINE
     174           0 :                  'a serial algorithm.'
     175             :             else
     176           0 :                write(llogunit,*) subname, &
     177             :                  'If tolerance exceeded, fixed-precision is sum used ', &   ! LCOV_EXCL_LINE
     178           0 :                  'but a warning is output.'
     179             :             endif
     180             :          else
     181           0 :             write(llogunit,*) subname, &
     182           0 :               'and not comparing with floating point algorithms.'
     183             :          endif
     184             : 
     185             :       endif
     186           0 :    end subroutine ice_reprosum_setopts
     187             : 
     188             : !========================================================================
     189             : 
     190             : !
     191             : ! Purpose:
     192             : !> Compute the global sum of each field in "arr" using the indicated
     193             : !> communicator with a reproducible yet scalable implementation based
     194             : !> on a fixed point algorithm. An alternative is to use an "almost
     195             : !> always reproducible" floating point algorithm.
     196             : !
     197             : ! The accuracy of the fixed point algorithm is controlled by the
     198             : ! number of "levels" of integer expansion. The algorithm will calculate
     199             : ! the number of levels that is required for the sum to be essentially
     200             : ! exact. The optional parameter arr_max_levels can be used to override
     201             : ! the calculated value. The optional parameter arr_max_levels_out can be
     202             : ! used to return the values used.
     203             : !
     204             : ! The algorithm also requires an upper bound on
     205             : ! the maximum summand (in absolute value) for each field, and will
     206             : ! calculate this internally. However, if the optional parameters
     207             : ! arr_max_levels and arr_gbl_max are both set, then the algorithm will
     208             : ! use the values in arr_gbl_max for the upper bounds instead. If these
     209             : ! are not upper bounds, or if the upper bounds are not tight enough
     210             : ! to achieve the requisite accuracy, and if the optional parameter
     211             : ! repro_sum_validate is NOT set to .false., the algorithm will repeat the
     212             : ! computation with appropriate upper bounds. If only arr_gbl_max is present,
     213             : ! then the maxima are computed internally (and the specified values are
     214             : ! ignored). The optional parameter arr_gbl_max_out can be
     215             : ! used to return the values used.
     216             : !
     217             : ! Finally, the algorithm requires an upper bound on the number of
     218             : ! local summands across all processes. This will be calculated internally,
     219             : ! using an MPI collective, but the value in the optional argument
     220             : ! gbl_max_nsummands will be used instead if (1) it is present, (2)
     221             : ! it is > 0, and (3) the maximum value and required number of levels
     222             : ! are also specified. (If the maximum value is calculated, the same
     223             : ! MPI collective is used to determine the maximum number of local
     224             : ! summands.) The accuracy of the user-specified value is not checked.
     225             : ! However, if set to < 1, the value will instead be calculated. If the
     226             : ! optional parameter gbl_max_nsummands_out is present, then the value
     227             : ! used (gbl_max_nsummands if >= 1; calculated otherwise) will be
     228             : ! returned.
     229             : !
     230             : ! If requested (by setting ice_reprosum_reldiffmax >= 0.0 and passing in
     231             : ! the optional rel_diff parameter), results are compared with a
     232             : ! nonreproducible floating point algorithm.
     233             : !
     234             : ! Note that the cost of the algorithm is not strongly correlated with
     235             : ! the number of levels, which primarily shows up as a (modest) increase
     236             : ! in cost of the MPI_Allreduce as a function of vector length. Rather the
     237             : ! cost is more a function of (a) the number of integers required to
     238             : ! represent an individual summand and (b) the number of MPI_Allreduce
     239             : ! calls. The number of integers required to represent an individual
     240             : ! summand is 1 or 2 when using 8-byte integers for 8-byte real summands
     241             : ! when the number of local summands is not too large. As the number of
     242             : ! local summands increases, the number of integers required increases.
     243             : ! The number of MPI_Allreduce calls is either 2 (specifying nothing) or
     244             : ! 1 (specifying gbl_max_nsummands, arr_max_levels, and arr_gbl_max
     245             : ! correctly). When specifying arr_max_levels and arr_gbl_max
     246             : ! incorrectly, 3 or 4 MPI_Allreduce calls will be required.
     247             : !
     248             : ! The alternative algorithm is a minor modification of a parallel
     249             : ! implementation of David Bailey's routine DDPDD by Helen He
     250             : ! and Chris Ding. Bailey uses the Knuth trick to implement quadruple
     251             : ! precision summation of double precision values with 10 double
     252             : ! precision operations. The advantage of this algorithm is that
     253             : ! it requires a single MPI_Allreduce and is less expensive per summand
     254             : ! than is the fixed precision algorithm. The disadvantage is that it
     255             : ! is not guaranteed to be reproducible (though it is reproducible
     256             : ! much more often than is the standard algorithm). This alternative
     257             : ! is used when the optional parameter ddpdd_sum is set to .true. It is
     258             : ! also used if the fixed precision algorithm radix assumption does not
     259             : ! hold.
     260             : 
     261             : !----------------------------------------------------------------------
     262           0 :    subroutine ice_reprosum_calc (arr, arr_gsum, nsummands, dsummands,     &
     263             :                                  nflds, ddpdd_sum,                        &   ! LCOV_EXCL_LINE
     264             :                                  arr_gbl_max, arr_gbl_max_out,            &   ! LCOV_EXCL_LINE
     265             :                                  arr_max_levels, arr_max_levels_out,      &   ! LCOV_EXCL_LINE
     266             :                                  gbl_max_nsummands, gbl_max_nsummands_out,&   ! LCOV_EXCL_LINE
     267             :                                  gbl_count, repro_sum_validate,           &   ! LCOV_EXCL_LINE
     268           0 :                                  repro_sum_stats, rel_diff, commid        )
     269             : !----------------------------------------------------------------------
     270             : 
     271             : ! Arguments
     272             : 
     273             :       integer,  intent(in) :: nsummands  !< number of local summands
     274             :       integer,  intent(in) :: dsummands  !< declared first dimension
     275             :       integer,  intent(in) :: nflds      !< number of fields
     276             :       real(r8), intent(in) :: arr(dsummands,nflds)
     277             :                                          !< input array
     278             : 
     279             :       real(r8), intent(out):: arr_gsum(nflds)
     280             :                                          !< global means
     281             : 
     282             :       logical,  intent(in),    optional :: ddpdd_sum
     283             :                                          !< use ddpdd algorithm instead
     284             :                                          !< of fixed precision algorithm
     285             : 
     286             :       real(r8), intent(in),    optional :: arr_gbl_max(nflds)
     287             :                                          !< upper bound on max(abs(arr))
     288             : 
     289             :       real(r8), intent(out),   optional :: arr_gbl_max_out(nflds)
     290             :                                          !< calculated upper bound on
     291             :                                          !< max(abs(arr))
     292             : 
     293             :       integer,  intent(in),    optional :: arr_max_levels(nflds)
     294             :                                          !< maximum number of levels of
     295             :                                          !< integer expansion to use
     296             : 
     297             :       integer,  intent(out),   optional :: arr_max_levels_out(nflds)
     298             :                                          !< output of number of levels of
     299             :                                          !< integer expansion to used
     300             : 
     301             :       integer,  intent(in),    optional :: gbl_max_nsummands
     302             :                                          !< maximum of nsummand over all
     303             :                                          !< processes
     304             : 
     305             :       integer,  intent(out),   optional :: gbl_max_nsummands_out
     306             :                                          !< calculated maximum nsummands
     307             :                                          !< over all processes
     308             : 
     309             :       integer,  intent(in),    optional :: gbl_count
     310             :                                          !< was total number of summands;
     311             :                                          !< now is ignored; use
     312             :                                          !< gbl_max_nsummands instead
     313             : 
     314             :       logical,  intent(in),    optional :: repro_sum_validate
     315             :          !< flag enabling/disabling testing that gmax and  max_levels are
     316             :          !< accurate/sufficient. Default is enabled.
     317             : 
     318             :       integer,  intent(inout), optional :: repro_sum_stats(5)
     319             :                                    !< increment running totals for
     320             :                                    !<  (1) one-reduction repro_sum
     321             :                                    !<  (2) two-reduction repro_sum
     322             :                                    !<  (3) both types in one call
     323             :                                    !<  (4) nonrepro_sum
     324             :                                    !<  (5) global max nsummands reduction
     325             : 
     326             :       real(r8), intent(out),   optional :: rel_diff(2,nflds)
     327             :                                          !< relative and absolute
     328             :                                          !<  differences between fixed
     329             :                                          !<  and floating point sums
     330             : 
     331             :       integer,  intent(in),    optional :: commid
     332             :                                          !< MPI communicator
     333             : 
     334             : ! Local workspace
     335             : 
     336             :       logical :: use_ddpdd_sum           ! flag indicating whether to
     337             :                                          !  use ice_reprosum_ddpdd or not
     338             :       logical :: recompute               ! flag indicating need to
     339             :                                          !  determine gmax/gmin before
     340             :                                          !  computing sum
     341             :       logical :: validate                ! flag indicating need to
     342             :                                          !  verify gmax and max_levels
     343             :                                          !  are accurate/sufficient
     344             :       integer :: omp_nthreads            ! number of OpenMP threads
     345             :       integer :: mpi_comm                ! MPI subcommunicator
     346             :       integer :: tasks                   ! number of MPI processes
     347             :       integer :: mype                    ! MPI task rank
     348             :       integer :: ierr                    ! MPI error return
     349             :       integer :: ifld, isum, ithread     ! loop variables
     350             :       integer :: max_nsummands           ! max nsummands over all processes
     351             :                                          !  or threads (used in both ways)
     352             : 
     353           0 :       integer, allocatable :: isum_beg(:), isum_end(:)
     354             :                                          ! range of summand indices for each
     355             :                                          !  OpenMP thread
     356           0 :       integer, allocatable :: arr_tlmin_exp(:,:)
     357             :                                          ! per thread local exponent minima
     358           0 :       integer, allocatable :: arr_tlmax_exp(:,:)
     359             :                                          ! per thread local exponent maxima
     360             :       integer :: arr_exp, arr_exp_tlmin, arr_exp_tlmax
     361             :                                          ! summand exponent and working min/max
     362           0 :       integer :: arr_lmin_exp(nflds)     ! local exponent minima
     363           0 :       integer :: arr_lmax_exp(nflds)     ! local exponent maxima
     364           0 :       integer :: arr_lextremes(0:nflds,2)! local exponent extrema
     365           0 :       integer :: arr_gextremes(0:nflds,2)! global exponent extrema
     366             : 
     367           0 :       integer :: arr_gmax_exp(nflds)     ! global exponents maxima
     368           0 :       integer :: arr_gmin_exp(nflds)     ! global exponents minima
     369             :       integer :: arr_max_shift           ! maximum safe exponent for
     370             :                                          !  value < 1 (so that sum does
     371             :                                          !  not overflow)
     372           0 :       integer :: max_levels(nflds)       ! maximum number of levels of
     373             :                                          !  integer expansion to use
     374             :       integer :: max_level               ! maximum value in max_levels
     375             :       integer :: gbl_max_red             ! global max local sum reduction? (0/1)
     376             :       integer :: repro_sum_fast          ! 1 reduction repro_sum? (0/1)
     377             :       integer :: repro_sum_slow          ! 2 reduction repro_sum? (0/1)
     378             :       integer :: repro_sum_both          ! both fast and slow? (0/1)
     379             :       integer :: nonrepro_sum            ! nonrepro_sum? (0/1)
     380             : 
     381           0 :       real(r8) :: xmax_nsummands         ! dble of max_nsummands
     382           0 :       real(r8) :: arr_lsum(nflds)        ! local sums
     383           0 :       real(r8) :: arr_gsum_fast(nflds)   ! global sum calculated using
     384             :                                          !  fast, nonreproducible,
     385             :                                          !  floating point alg.
     386           0 :       real(r8) :: abs_diff               ! absolute difference between
     387             :                                          !  fixed and floating point
     388             :                                          !  sums
     389             : #ifdef _OPENMP
     390             :       integer omp_get_max_threads
     391             :       external omp_get_max_threads
     392             : #endif
     393             :       character(len=*),parameter :: subname = '(ice_reprosum_calc)'
     394             : 
     395             : !-----------------------------------------------------------------------
     396             : 
     397             : ! check whether should use ice_reprosum_ddpdd algorithm
     398           0 :       use_ddpdd_sum = repro_sum_use_ddpdd
     399           0 :       if ( present(ddpdd_sum) ) then
     400           0 :          use_ddpdd_sum = ddpdd_sum
     401             :       endif
     402             : 
     403             : ! check whether intrinsic-based algorithm will work on this system
     404             : ! (requires floating point and integer bases to be the same)
     405             : ! If not, always use ddpdd.
     406           0 :       use_ddpdd_sum = use_ddpdd_sum .or. (radix(0._r8) /= radix(0_i8))
     407             : 
     408             : ! initialize local statistics variables
     409           0 :       gbl_max_red = 0
     410           0 :       repro_sum_fast = 0
     411           0 :       repro_sum_slow = 0
     412           0 :       repro_sum_both = 0
     413           0 :       nonrepro_sum = 0
     414             : 
     415             : ! set MPI communicator
     416           0 :       if ( present(commid) ) then
     417           0 :          mpi_comm = commid
     418             :       else
     419             : #ifdef SERIAL_REMOVE_MPI
     420             :          mpi_comm = 0
     421             : #else
     422           0 :          mpi_comm = MPI_COMM_WORLD
     423             : #endif
     424             :       endif
     425             : 
     426             : !      if (detailed_timing) then
     427             : !         call xicex_timer_start('xicex_reprosum_prebarrier')
     428             : !         call xicex_mpi_barrier(mpi_comm,subname)
     429             : !         call xicex_timer_stop ('xicex_reprosum_prebarrier')
     430             : !      endif
     431             : 
     432           0 :       if ( use_ddpdd_sum ) then
     433             : 
     434             : !         if (detailed_timing) call xicex_timer_start('ice_reprosum_ddpdd')
     435             : 
     436             :          call ice_reprosum_ddpdd(arr, arr_gsum, nsummands, dsummands, &
     437           0 :                                  nflds, mpi_comm)
     438           0 :          repro_sum_fast = 1
     439             : 
     440             : !         if (detailed_timing) call xicex_timer_stop('ice_reprosum_ddpdd')
     441             : 
     442             :       else
     443             : 
     444             : !         if (detailed_timing) call xicex_timer_start('ice_reprosum_int')
     445             : 
     446             : ! get number of MPI tasks
     447             : #ifdef SERIAL_REMOVE_MPI
     448             :          tasks = 1
     449             :          mype = 0
     450             : #else
     451           0 :          call mpi_comm_size(mpi_comm, tasks, ierr)
     452           0 :          call mpi_comm_rank(mpi_comm, mype, ierr)
     453             : #endif
     454             : 
     455             : ! get number of OpenMP threads
     456             : #ifdef _OPENMP
     457           0 :          omp_nthreads = omp_get_max_threads()
     458             : #else
     459           0 :          omp_nthreads = 1
     460             : #endif
     461             : 
     462             : ! see if have sufficient information to not require max/min allreduce
     463           0 :          recompute = .true.
     464           0 :          validate = .false.
     465           0 :          if ( present(arr_gbl_max) .and. present(arr_max_levels) ) then
     466           0 :             recompute = .false.
     467             : 
     468             : ! setting lower bound on max_level*nflds to be 64 to improve OpenMP
     469             : ! performance for loopb in ice_reprosum_int
     470           0 :             max_level = (64/nflds) + 1
     471           0 :             do ifld=1,nflds
     472           0 :                if ((arr_gbl_max(ifld) .ge. 0.0_r8) .and. &
     473           0 :                    (arr_max_levels(ifld) > 0)) then
     474             : 
     475           0 :                   arr_gmax_exp(ifld)  = exponent(arr_gbl_max(ifld))
     476           0 :                   if (max_level < arr_max_levels(ifld)) &
     477           0 :                      max_level = arr_max_levels(ifld)
     478             : 
     479             :                else
     480           0 :                   recompute = .true.
     481             :                endif
     482             :             enddo
     483             : 
     484           0 :             if (.not. recompute) then
     485             : 
     486             : ! determine maximum number of summands in local phases of the
     487             : ! algorithm
     488             : !               if (detailed_timing) call xicex_timer_start("repro_sum_allr_max")
     489           0 :                if ( present(gbl_max_nsummands) ) then
     490           0 :                   if (gbl_max_nsummands < 1) then
     491             : #ifdef SERIAL_REMOVE_MPI
     492             :                      max_nsummands = nsummands
     493             : #else
     494             :                      call mpi_allreduce (nsummands, max_nsummands, 1, &
     495           0 :                                          MPI_INTEGER, MPI_MAX, mpi_comm, ierr)
     496             : #endif
     497           0 :                      gbl_max_red = 1
     498             :                   else
     499           0 :                      max_nsummands = gbl_max_nsummands
     500             :                   endif
     501             :                else
     502             : #ifdef SERIAL_REMOVE_MPI
     503             :                   max_nsummands = nsummands
     504             : #else
     505             :                   call mpi_allreduce (nsummands, max_nsummands, 1, &
     506           0 :                                       MPI_INTEGER, MPI_MAX, mpi_comm, ierr)
     507             : #endif
     508           0 :                   gbl_max_red = 1
     509             :                endif
     510             : !               if (detailed_timing) call xicex_timer_stop("repro_sum_allr_max")
     511             : 
     512             : ! determine maximum shift. Shift needs to be small enough that summation
     513             : !  does not exceed maximum number of digits in i8.
     514             : 
     515             : ! if requested, return max_nsummands before it is redefined
     516           0 :                if ( present( gbl_max_nsummands_out) ) then
     517           0 :                   gbl_max_nsummands_out = max_nsummands
     518             :                endif
     519             : 
     520             : ! summing within each thread first
     521           0 :                max_nsummands = (max_nsummands/omp_nthreads) + 1
     522             : ! then over threads and tasks
     523           0 :                max_nsummands = max(max_nsummands, tasks*omp_nthreads)
     524             : 
     525           0 :                xmax_nsummands = dble(max_nsummands)
     526           0 :                arr_max_shift = digits(0_i8) - (exponent(xmax_nsummands) + 1)
     527           0 :                if (arr_max_shift < 2) then
     528           0 :                   write(tmpstr,*) subname//' ERROR: number of summands too large for fixed precision algorithm'
     529           0 :                   call abort_ice(tmpstr,file=__FILE__,line=__LINE__)
     530             :                endif
     531             : 
     532             : ! calculate sum
     533           0 :                if (present(repro_sum_validate)) then
     534           0 :                   validate = repro_sum_validate
     535             :                else
     536           0 :                   validate = .true.
     537             :                endif
     538             :                call ice_reprosum_int(arr, arr_gsum, nsummands, dsummands, &
     539             :                                      nflds, arr_max_shift, arr_gmax_exp, &   ! LCOV_EXCL_LINE
     540             :                                      arr_max_levels, max_level, validate, &   ! LCOV_EXCL_LINE
     541           0 :                                      recompute, omp_nthreads, mpi_comm)
     542             : 
     543             : ! record statistics, etc.
     544           0 :                repro_sum_fast = 1
     545           0 :                if (recompute) then
     546           0 :                   repro_sum_both = 1
     547             :                else
     548             : ! if requested, return specified levels and upper bounds on maxima
     549           0 :                   if ( present(arr_max_levels_out) ) then
     550           0 :                      do ifld=1,nflds
     551           0 :                         arr_max_levels_out(ifld) = arr_max_levels(ifld)
     552             :                      enddo
     553             :                   endif
     554           0 :                   if ( present(arr_gbl_max_out) ) then
     555           0 :                      do ifld=1,nflds
     556           0 :                        arr_gbl_max_out(ifld) = arr_gbl_max(ifld)
     557             :                      enddo
     558             :                   endif
     559             :                endif
     560             :             endif
     561             :          endif
     562             : 
     563             : ! do not have sufficient information; calculate global max/min and
     564             : ! use to compute required number of levels
     565           0 :          if (recompute) then
     566             : 
     567             : ! record statistic
     568           0 :             repro_sum_slow = 1
     569             : 
     570             : ! determine maximum and minimum (non-zero) summand values and
     571             : ! maximum number of local summands
     572             : 
     573             : ! allocate thread-specific work space
     574           0 :             allocate(arr_tlmax_exp(nflds,omp_nthreads))
     575           0 :             allocate(arr_tlmin_exp(nflds,omp_nthreads))
     576           0 :             allocate(isum_beg(omp_nthreads))
     577           0 :             allocate(isum_end(omp_nthreads))
     578             : 
     579             : ! split summand index range over OpenMP threads
     580           0 :             call split_indices(nsummands, omp_nthreads, isum_beg, isum_end)
     581             : 
     582             : !$omp parallel do      &
     583             : !$omp default(shared)  &   ! LCOV_EXCL_LINE
     584           0 : !$omp private(ithread, ifld, isum, arr_exp, arr_exp_tlmin, arr_exp_tlmax)
     585           0 :             do ithread=1,omp_nthreads
     586             : !               if (detailed_timing) call xicex_timer_start('repro_sum_loopa')
     587           0 :                do ifld=1,nflds
     588           0 :                   arr_exp_tlmin = MAXEXPONENT(1._r8)
     589           0 :                   arr_exp_tlmax = MINEXPONENT(1._r8)
     590           0 :                   do isum=isum_beg(ithread),isum_end(ithread)
     591           0 :                      if (arr(isum,ifld) .ne. 0.0_r8) then
     592           0 :                         arr_exp = exponent(arr(isum,ifld))
     593           0 :                         arr_exp_tlmin = min(arr_exp,arr_exp_tlmin)
     594           0 :                         arr_exp_tlmax = max(arr_exp,arr_exp_tlmax)
     595             :                      endif
     596             :                   end do
     597           0 :                   arr_tlmin_exp(ifld,ithread) = arr_exp_tlmin
     598           0 :                   arr_tlmax_exp(ifld,ithread) = arr_exp_tlmax
     599             :                end do
     600             : !               if (detailed_timing) call xicex_timer_stop('repro_sum_loopa')
     601             :             end do
     602             : 
     603           0 :             do ifld=1,nflds
     604           0 :                arr_lmax_exp(ifld) = maxval(arr_tlmax_exp(ifld,:))
     605           0 :                arr_lmin_exp(ifld) = minval(arr_tlmin_exp(ifld,:))
     606             :             end do
     607           0 :             deallocate(arr_tlmin_exp,arr_tlmax_exp,isum_beg,isum_end)
     608             : 
     609           0 :             arr_lextremes(0,:) = -nsummands
     610           0 :             arr_lextremes(1:nflds,1) = -arr_lmax_exp(:)
     611           0 :             arr_lextremes(1:nflds,2) = arr_lmin_exp(:)
     612             : !            if (detailed_timing) call xicex_timer_start("repro_sum_allr_minmax")
     613             : #ifdef SERIAL_REMOVE_MPI
     614             :             arr_gextremes = arr_lextremes
     615             : #else
     616             :             call mpi_allreduce (arr_lextremes, arr_gextremes, 2*(nflds+1), &
     617           0 :                                 MPI_INTEGER, MPI_MIN, mpi_comm, ierr)
     618             : #endif
     619             : !            if (detailed_timing) call xicex_timer_stop("repro_sum_allr_minmax")
     620           0 :             max_nsummands   = -arr_gextremes(0,1)
     621           0 :             arr_gmax_exp(:) = -arr_gextremes(1:nflds,1)
     622           0 :             arr_gmin_exp(:) =  arr_gextremes(1:nflds,2)
     623             : 
     624             : ! if a field is identically zero, arr_gmin_exp still equals MAXEXPONENT
     625             : !   and arr_gmax_exp still equals MINEXPONENT. In this case, set
     626             : !   arr_gmin_exp = arr_gmax_exp = MINEXPONENT
     627           0 :             do ifld=1,nflds
     628           0 :                arr_gmin_exp(ifld) = min(arr_gmax_exp(ifld),arr_gmin_exp(ifld))
     629             :             enddo
     630             : 
     631             : ! if requested, return upper bounds on observed maxima
     632           0 :             if ( present(arr_gbl_max_out) ) then
     633           0 :                do ifld=1,nflds
     634           0 :                   arr_gbl_max_out(ifld) = scale(1.0_r8,arr_gmax_exp(ifld))
     635             :                enddo
     636             :             endif
     637             : 
     638             : ! if requested, return max_nsummands before it is redefined
     639           0 :             if ( present( gbl_max_nsummands_out) ) then
     640           0 :                gbl_max_nsummands_out = max_nsummands
     641             :             endif
     642             : 
     643             : ! determine maximum shift (same as in previous branch, but with calculated
     644             : !  max_nsummands). Shift needs to be small enough that summation does not
     645             : !  exceed maximum number of digits in i8.
     646             : 
     647             : ! summing within each thread first
     648           0 :             max_nsummands = (max_nsummands/omp_nthreads) + 1
     649             : ! then over threads and tasks
     650           0 :             max_nsummands = max(max_nsummands, tasks*omp_nthreads)
     651             : 
     652           0 :             xmax_nsummands = dble(max_nsummands)
     653           0 :             arr_max_shift = digits(0_i8) - (exponent(xmax_nsummands) + 1)
     654           0 :             if (arr_max_shift < 2) then
     655           0 :                write(tmpstr,*) subname//' ERROR: number of summands too large for fixed precision algorithm'
     656           0 :                call abort_ice(tmpstr,file=__FILE__,line=__LINE__)
     657             :             endif
     658             : 
     659             : ! determine maximum number of levels required for each field
     660             : ! ((digits(0_i8) + (arr_gmax_exp(ifld)-arr_gmin_exp(ifld))) / arr_max_shift)
     661             : ! + 1 because first truncation probably does not involve a maximal shift
     662             : ! + 1 to guarantee that the integer division rounds up (not down)
     663             : ! (setting lower bound on max_level*nflds to be 64 to improve OpenMP
     664             : !  performance for loopb in ice_reprosum_int)
     665           0 :             max_level = (64/nflds) + 1
     666           0 :             do ifld=1,nflds
     667           0 :                max_levels(ifld) = 2 + &
     668             :                 ((digits(0_i8) + (arr_gmax_exp(ifld)-arr_gmin_exp(ifld))) &   ! LCOV_EXCL_LINE
     669           0 :                 / arr_max_shift)
     670           0 :                if ( present(arr_max_levels) .and. (.not. validate) ) then
     671             : ! if validate true, then computation with arr_max_levels failed
     672             : !  previously
     673           0 :                   if ( arr_max_levels(ifld) > 0 ) then
     674           0 :                      max_levels(ifld) = &
     675           0 :                         min(arr_max_levels(ifld),max_levels(ifld))
     676             :                   endif
     677             :                endif
     678           0 :                if (max_level < max_levels(ifld)) &
     679           0 :                   max_level = max_levels(ifld)
     680             :             enddo
     681             : 
     682             : ! if requested, return calculated levels
     683           0 :             if ( present(arr_max_levels_out) ) then
     684           0 :                do ifld=1,nflds
     685           0 :                   arr_max_levels_out(ifld) = max_levels(ifld)
     686             :                enddo
     687             :             endif
     688             : 
     689             : ! calculate sum
     690           0 :             validate = .false.
     691             :             call ice_reprosum_int(arr, arr_gsum, nsummands, dsummands, nflds, &
     692             :                                   arr_max_shift, arr_gmax_exp, max_levels, &   ! LCOV_EXCL_LINE
     693             :                                   max_level, validate, recompute, &   ! LCOV_EXCL_LINE
     694           0 :                                   omp_nthreads, mpi_comm)
     695             : 
     696             :          endif
     697             : 
     698             : !         if (detailed_timing) call xicex_timer_stop('ice_reprosum_int')
     699             : 
     700             :       endif
     701             : 
     702             : ! compare fixed and floating point results
     703           0 :       if ( present(rel_diff) ) then
     704           0 :          if (ice_reprosum_reldiffmax >= 0.0_r8) then
     705             : 
     706             : !            if (detailed_timing) then
     707             : !               call xicex_timer_start('xicex_nonreprosum_prebarrier')
     708             : !               call xicex_mpi_barrier(mpi_comm,subname)
     709             : !               call xicex_timer_stop ('xicex_nonreprosum_prebarrier')
     710             : !            endif
     711             : 
     712             : !            if (detailed_timing) call xicex_timer_start('nonrepro_sum')
     713             : ! record statistic
     714           0 :             nonrepro_sum = 1
     715             : ! compute nonreproducible sum
     716           0 :             arr_lsum(:) = 0._r8
     717             : !$omp parallel do      &
     718             : !$omp default(shared)  &   ! LCOV_EXCL_LINE
     719           0 : !$omp private(ifld, isum)
     720           0 :             do ifld=1,nflds
     721           0 :                do isum=1,nsummands
     722           0 :                   arr_lsum(ifld) = arr(isum,ifld) + arr_lsum(ifld)
     723             :                end do
     724             :             end do
     725             : 
     726             : #ifdef SERIAL_REMOVE_MPI
     727             :             arr_gsum_fast = arr_lsum
     728             : #else
     729             :             call mpi_allreduce (arr_lsum, arr_gsum_fast, nflds, &
     730           0 :                                 MPI_REAL8, MPI_SUM, mpi_comm, ierr)
     731             : #endif
     732             : 
     733             : !            if (detailed_timing) call xicex_timer_stop('nonrepro_sum')
     734             : 
     735             : ! determine differences
     736             : !$omp parallel do      &
     737             : !$omp default(shared)  &   ! LCOV_EXCL_LINE
     738           0 : !$omp private(ifld, abs_diff)
     739           0 :             do ifld=1,nflds
     740           0 :                abs_diff = abs(arr_gsum_fast(ifld)-arr_gsum(ifld))
     741           0 :                if (abs(arr_gsum(ifld)) > abs_diff) then
     742           0 :                   rel_diff(1,ifld) = abs_diff/abs(arr_gsum(ifld))
     743             :                else
     744           0 :                   rel_diff(1,ifld) = abs_diff
     745             :                endif
     746           0 :                rel_diff(2,ifld) = abs_diff
     747             :             enddo
     748             :          else
     749           0 :             rel_diff(:,:) = 0.0_r8
     750             :          endif
     751             :       endif
     752             : 
     753             : ! return statistics
     754           0 :       if ( present(repro_sum_stats) ) then
     755           0 :          repro_sum_stats(1) = repro_sum_stats(1) + repro_sum_fast
     756           0 :          repro_sum_stats(2) = repro_sum_stats(2) + repro_sum_slow
     757           0 :          repro_sum_stats(3) = repro_sum_stats(3) + repro_sum_both
     758           0 :          repro_sum_stats(4) = repro_sum_stats(4) + nonrepro_sum
     759           0 :          repro_sum_stats(5) = repro_sum_stats(5) + gbl_max_red
     760             :       endif
     761             : 
     762             : 
     763           0 :    end subroutine ice_reprosum_calc
     764             : 
     765             : !========================================================================
     766             : !
     767             : ! Purpose:
     768             : !> Compute the global sum of each field in "arr" using the indicated
     769             : !> communicator with a reproducible yet scalable implementation based
     770             : !> on a fixed point algorithm. The accuracy of the fixed point algorithm
     771             : !> is controlled by the number of "levels" of integer expansion, the
     772             : !> maximum value of which is specified by max_level.
     773             : !
     774             : !----------------------------------------------------------------------
     775             : 
     776           0 :    subroutine ice_reprosum_int (arr, arr_gsum, nsummands, dsummands, nflds, &
     777             :                                 arr_max_shift, arr_gmax_exp, max_levels,    &   ! LCOV_EXCL_LINE
     778             :                                 max_level, validate, recompute,             &   ! LCOV_EXCL_LINE
     779             :                                 omp_nthreads, mpi_comm                      )
     780             : 
     781             : !----------------------------------------------------------------------
     782             : 
     783             : ! Arguments
     784             : 
     785             :       integer,  intent(in) :: nsummands     !< number of local summands
     786             :       integer,  intent(in) :: dsummands     !< declared first dimension
     787             :       integer,  intent(in) :: nflds         !< number of fields
     788             :       integer,  intent(in) :: arr_max_shift !< maximum safe exponent for
     789             :                                             !<  value < 1 (so that sum
     790             :                                             !<  does not overflow)
     791             :       integer,  intent(in) :: arr_gmax_exp(nflds)
     792             :                                             !< exponents of global maxima
     793             :       integer,  intent(in) :: max_levels(nflds)
     794             :                                             !< maximum number of levels
     795             :                                             !<  of integer expansion
     796             :       integer,  intent(in) :: max_level     !< maximum value in
     797             :                                             !<  max_levels
     798             :       integer,  intent(in) :: omp_nthreads  !< number of OpenMP threads
     799             :       integer,  intent(in) :: mpi_comm      !< MPI subcommunicator
     800             : 
     801             :       real(r8), intent(in) :: arr(dsummands,nflds)
     802             :                                              !< input array
     803             : 
     804             :       logical,  intent(in):: validate
     805             :          !< flag indicating that accuracy of solution generated from
     806             :          !< arr_gmax_exp and max_levels should be tested
     807             : 
     808             :       logical,  intent(out):: recompute
     809             :          !< flag indicating that either the upper bounds are inaccurate,
     810             :          !<  or max_levels and arr_gmax_exp do not generate accurate
     811             :          !<  enough sums
     812             : 
     813             :       real(r8), intent(out):: arr_gsum(nflds)      !< global means
     814             : 
     815             : ! Local workspace
     816             : 
     817             :       integer, parameter  :: max_jlevel = &
     818             :                                 1 + (digits(0_i8)/digits(0.0_r8))
     819             : 
     820           0 :       integer(i8) :: i8_arr_tlsum_level(0:max_level,nflds,omp_nthreads)
     821             :                                    ! integer vector representing local
     822             :                                    !  sum (per thread, per field)
     823           0 :       integer(i8) :: i8_arr_lsum_level((max_level+3)*nflds)
     824             :                                    ! integer vector representing local
     825             :                                    !  sum
     826             :       integer(i8) :: i8_arr_level  ! integer part of summand for current
     827             :                                    !  expansion level
     828           0 :       integer(i8) :: i8_arr_gsum_level((max_level+3)*nflds)
     829             :                                    ! integer vector representing global
     830             :                                    !  sum
     831             :       integer(i8) :: IX_8          ! integer representation of current
     832             :                                    !  jlevels of X_8 ('part' of
     833             :                                    !  i8_arr_gsum_level)
     834             :       integer(i8) :: i8_sign       ! sign global sum
     835             :       integer(i8) :: i8_radix      ! radix for i8 variables
     836             : 
     837           0 :       integer :: max_error(nflds,omp_nthreads)
     838             :                                    ! accurate upper bound on data?
     839           0 :       integer :: not_exact(nflds,omp_nthreads)
     840             :                                    ! max_levels sufficient to
     841             :                                    !  capture all digits?
     842           0 :       integer :: isum_beg(omp_nthreads), isum_end(omp_nthreads)
     843             :                                    ! range of summand indices for each
     844             :                                    !  OpenMP thread
     845             :       integer :: ifld, isum, ithread
     846             :                                    ! loop variables
     847             :       integer :: arr_exp           ! exponent of summand
     848             :       integer :: arr_shift         ! exponent used to generate integer
     849             :                                    !  for current expansion level
     850             :       integer :: ilevel            ! current integer expansion level
     851           0 :       integer :: offset(nflds)     ! beginning location in
     852             :                                    !  i8_arr_{g,l}sum_level for integer
     853             :                                    !  expansion of current ifld
     854             :       integer :: voffset           ! modification to offset used to
     855             :                                    !  include validation metrics
     856             :       integer :: ioffset           ! offset(ifld)
     857             :       integer :: jlevel            ! number of floating point 'pieces'
     858             :                                    !  extracted from a given i8 integer
     859             :       integer :: ierr              ! MPI error return
     860             :       integer :: LX(max_jlevel)    ! exponent of X_8 (see below)
     861             :       integer :: veclth            ! total length of i8_arr_lsum_level
     862             :       integer :: sum_digits        ! lower bound on number of significant
     863             :                                    !  in integer expansion of sum
     864             :       integer :: curr_exp          ! exponent of partial sum during
     865             :                                    !  reconstruction from integer vector
     866             :       integer :: corr_exp          ! exponent of current summand in
     867             :                                    !  reconstruction from integer vector
     868             : 
     869           0 :       real(r8) :: arr_frac         ! fraction of summand
     870           0 :       real(r8) :: arr_remainder    ! part of summand remaining after
     871             :                                    !  current level of integer expansion
     872           0 :       real(r8) :: X_8(max_jlevel)  ! r8 vector representation of current
     873             :                                    !  i8_arr_gsum_level
     874           0 :       real(r8) :: RX_8             ! r8 representation of difference
     875             :                                    !  between current i8_arr_gsum_level
     876             :                                    !  and current jlevels of X_8
     877             :                                    !  (== IX_8). Also used in final
     878             :                                    !  scaling step
     879             : 
     880             :       logical :: first             ! flag used to indicate that just
     881             :                                    !  beginning reconstruction of sum
     882             :                                    !  from integer vector
     883             : 
     884             :       character(len=*),parameter :: subname = '(ice_reprosum_int)'
     885             : 
     886             : !-----------------------------------------------------------------------
     887             : ! Save radix of i8 variables in an i8 variable
     888           0 :       i8_radix = radix(IX_8)
     889             : 
     890             : ! If validating upper bounds, reserve space for validation metrics
     891             : ! In both cases, reserve an extra level for overflow from the top level
     892           0 :       if (validate) then
     893           0 :         voffset = 3
     894             :       else
     895           0 :         voffset = 1
     896             :       endif
     897             : 
     898             : ! compute offsets for each field
     899           0 :       offset(1) = voffset
     900           0 :       do ifld=2,nflds
     901           0 :          offset(ifld) = offset(ifld-1) &
     902           0 :                         + (max_levels(ifld-1) + voffset)
     903             :       enddo
     904           0 :       veclth = offset(nflds) + max_levels(nflds)
     905             : 
     906             : ! split summand index range over OpenMP threads
     907           0 :       call split_indices(nsummands, omp_nthreads, isum_beg, isum_end)
     908             : 
     909             : ! convert local summands to vector of integers and sum
     910             : ! (Using scale instead of set_exponent because arr_remainder may not be
     911             : !  "normal" after level 1 calculation)
     912           0 :       i8_arr_lsum_level(:) = 0_i8
     913             : 
     914             : !$omp parallel do      &
     915             : !$omp default(shared)  &   ! LCOV_EXCL_LINE
     916             : !$omp private(ithread, ifld, ioffset, isum, arr_frac, arr_exp, &   ! LCOV_EXCL_LINE
     917           0 : !$omp         arr_shift, ilevel, i8_arr_level, arr_remainder, RX_8, IX_8)
     918           0 :       do ithread=1,omp_nthreads
     919             : !       if (detailed_timing) call xicex_timer_start('repro_sum_loopb')
     920           0 :        do ifld=1,nflds
     921           0 :           ioffset = offset(ifld)
     922             : 
     923           0 :           max_error(ifld,ithread) = 0
     924           0 :           not_exact(ifld,ithread) = 0
     925             : 
     926           0 :           i8_arr_tlsum_level(:,ifld,ithread) = 0_i8
     927           0 :           do isum=isum_beg(ithread),isum_end(ithread)
     928           0 :             arr_remainder = 0.0_r8
     929             : 
     930           0 :             if (arr(isum,ifld) .ne. 0.0_r8) then
     931           0 :                arr_exp   = exponent(arr(isum,ifld))
     932           0 :                arr_frac  = fraction(arr(isum,ifld))
     933             : 
     934             : ! test that global maximum upper bound is an upper bound
     935           0 :                if (arr_exp > arr_gmax_exp(ifld)) then
     936           0 :                   max_error(ifld,ithread) = 1
     937           0 :                   exit
     938             :                endif
     939             : 
     940             : ! calculate first shift
     941           0 :                arr_shift = arr_max_shift - (arr_gmax_exp(ifld)-arr_exp)
     942             : 
     943             : ! determine first (probably) nonzero level (assuming initial fraction is
     944             : !  'normal' - algorithm still works if this is not true)
     945             : !  NOTE: this is critical; scale will set to zero if min exponent is too small.
     946           0 :                if (arr_shift < 1) then
     947           0 :                   ilevel = (1 + (arr_gmax_exp(ifld)-arr_exp))/arr_max_shift
     948           0 :                   arr_shift = ilevel*arr_max_shift - (arr_gmax_exp(ifld)-arr_exp)
     949             : 
     950           0 :                   do while (arr_shift < 1)
     951           0 :                      arr_shift = arr_shift + arr_max_shift
     952           0 :                      ilevel = ilevel + 1
     953             :                   enddo
     954             :                else
     955           0 :                   ilevel = 1
     956             :                endif
     957             : 
     958           0 :                if (ilevel .le. max_levels(ifld)) then
     959             : ! apply first shift/truncate, add it to the relevant running
     960             : ! sum, and calculate the remainder.
     961           0 :                   arr_remainder = scale(arr_frac,arr_shift)
     962           0 :                   i8_arr_level = int(arr_remainder,i8)
     963             :                   i8_arr_tlsum_level(ilevel,ifld,ithread) = &
     964           0 :                      i8_arr_tlsum_level(ilevel,ifld,ithread) + i8_arr_level
     965           0 :                   arr_remainder = arr_remainder - i8_arr_level
     966             : 
     967             : ! while the remainder is non-zero, continue to shift, truncate,
     968             : ! sum, and calculate new remainder
     969             :                   do while ((arr_remainder .ne. 0.0_r8) &
     970           0 :                      .and. (ilevel < max_levels(ifld)))
     971           0 :                      ilevel = ilevel + 1
     972           0 :                      arr_remainder = scale(arr_remainder,arr_max_shift)
     973           0 :                      i8_arr_level = int(arr_remainder,i8)
     974             :                      i8_arr_tlsum_level(ilevel,ifld,ithread) = &
     975           0 :                         i8_arr_tlsum_level(ilevel,ifld,ithread) + i8_arr_level
     976           0 :                      arr_remainder = arr_remainder - i8_arr_level
     977             :                   enddo
     978             : 
     979             :                endif
     980             :             endif
     981             : 
     982           0 :             if (arr_remainder .ne. 0.0_r8) then
     983           0 :                not_exact(ifld,ithread) = 1
     984             :             endif
     985             : 
     986             :           enddo
     987             : 
     988             : ! postprocess integer vector to eliminate potential for overlap in the following
     989             : ! sums over threads and processes: if value larger than or equal to
     990             : ! (radix(IX_8)**arr_max_shift), add this 'overlap' to next larger integer in
     991             : ! vector, resulting in nonoverlapping ranges for each component. Note that
     992             : ! "ilevel-1==0" corresponds to an extra level used to guarantee that the sums
     993             : ! over threads and processes do not overflow for ilevel==1.
     994           0 :           do ilevel=max_levels(ifld),1,-1
     995           0 :              RX_8 = i8_arr_tlsum_level(ilevel,ifld,ithread)
     996           0 :              IX_8 = int(scale(RX_8,-arr_max_shift),i8)
     997           0 :              if (IX_8 .ne. 0_i8) then
     998             :                 i8_arr_tlsum_level(ilevel-1,ifld,ithread) = &
     999           0 :                    i8_arr_tlsum_level(ilevel-1,ifld,ithread) + IX_8
    1000           0 :                 IX_8 = IX_8*(i8_radix**arr_max_shift)
    1001             :                 i8_arr_tlsum_level(ilevel,ifld,ithread)   = &
    1002           0 :                    i8_arr_tlsum_level(ilevel,ifld,ithread) - IX_8
    1003             :              endif
    1004             :           enddo
    1005             :        enddo
    1006             : !       if (detailed_timing) call xicex_timer_stop('repro_sum_loopb')
    1007             :       enddo
    1008             : 
    1009             : ! sum contributions from different threads
    1010           0 :       do ifld=1,nflds
    1011           0 :          ioffset = offset(ifld)
    1012           0 :          do ithread = 1,omp_nthreads
    1013           0 :             do ilevel = 0,max_levels(ifld)
    1014           0 :                i8_arr_lsum_level(ioffset+ilevel) = &
    1015             :                   i8_arr_lsum_level(ioffset+ilevel) &   ! LCOV_EXCL_LINE
    1016           0 :                   + i8_arr_tlsum_level(ilevel,ifld,ithread)
    1017             :             enddo
    1018             :          enddo
    1019             :       enddo
    1020             : 
    1021             : ! record if upper bound was inaccurate or if level expansion stopped
    1022             : ! before full accuracy was achieved
    1023           0 :       if (validate) then
    1024           0 :          do ifld=1,nflds
    1025           0 :             ioffset = offset(ifld)
    1026           0 :             i8_arr_lsum_level(ioffset-voffset+1) = maxval(max_error(ifld,:))
    1027           0 :             i8_arr_lsum_level(ioffset-voffset+2) = maxval(not_exact(ifld,:))
    1028             :          enddo
    1029             :       endif
    1030             : 
    1031             : ! sum integer vector element-wise
    1032             : #ifdef SERIAL_REMOVE_MPI
    1033             :       i8_arr_gsum_level = i8_arr_lsum_level
    1034             : #else
    1035             : #if defined (NO_I8)
    1036             :      ! Workaround for when i8 is not supported.
    1037             : !      if (detailed_timing) call xicex_timer_start("repro_sum_allr_i4")
    1038             :       call mpi_allreduce (i8_arr_lsum_level, i8_arr_gsum_level, &
    1039             :                           veclth, MPI_INTEGER, MPI_SUM, mpi_comm, ierr)
    1040             : !      if (detailed_timing) call xicex_timer_stop("repro_sum_allr_i4")
    1041             : #else
    1042             : !      if (detailed_timing) call xicex_timer_start("repro_sum_allr_i8")
    1043             :       call mpi_allreduce (i8_arr_lsum_level, i8_arr_gsum_level, &
    1044           0 :                           veclth, MPI_INTEGER8, MPI_SUM, mpi_comm, ierr)
    1045             : !      if (detailed_timing) call xicex_timer_stop("repro_sum_allr_i8")
    1046             : #endif
    1047             : #endif
    1048             : 
    1049             : ! Construct global sum from integer vector representation:
    1050             : !  1) arr_max_shift is the shift applied to fraction(arr_gmax) .
    1051             : !   When shifting back, need to "add back in" true arr_gmax exponent. This was
    1052             : !   removed implicitly by working only with the fraction .
    1053             : !  2) want to add levels into sum in reverse order (smallest to largest). However,
    1054             : !   even this can generate floating point rounding errors if signs of integers
    1055             : !   alternate. To avoid this, do some arithmetic with integer vectors so that all
    1056             : !   components have the same sign. This should keep relative difference between
    1057             : !   using different integer sizes (e.g. i8 and i4) to machine epsilon
    1058             : !  3) assignment to X_8 will usually lose accuracy since maximum integer
    1059             : !   size is greater than the max number of 'digits' in r8 value (if xmax_nsummands
    1060             : !   correction not very large). Calculate remainder and add in first (since
    1061             : !   smaller). One correction is sufficient for r8 (53 digits) and i8 (63 digits).
    1062             : !   For r4 (24 digits) may need to correct twice. Code is written in a general
    1063             : !   fashion, to work no matter how many corrections are necessary (assuming
    1064             : !   max_jlevel parameter calculation is correct).
    1065             : 
    1066           0 :       recompute = .false.
    1067           0 :       do ifld=1,nflds
    1068           0 :          arr_gsum(ifld) = 0.0_r8
    1069           0 :          ioffset = offset(ifld)
    1070             : 
    1071             : ! if validate is .true., test whether the summand upper bound
    1072             : !  was exceeded on any of the processes
    1073           0 :          if (validate) then
    1074           0 :             if (i8_arr_gsum_level(ioffset-voffset+1) .ne. 0_i8) then
    1075           0 :                recompute = .true.
    1076             :             endif
    1077             :          endif
    1078             : 
    1079           0 :          if (.not. recompute) then
    1080             : 
    1081             : ! preprocess integer vector:
    1082             : !  a) if value larger than or equal to (radix(IX_8)**arr_max_shift), add this 'overlap'
    1083             : !     to next larger integer in vector, resulting in nonoverlapping ranges for each
    1084             : !     component. Note that have "ilevel-1=0" level here as described above.
    1085           0 :            do ilevel=max_levels(ifld),1,-1
    1086           0 :              RX_8 = i8_arr_gsum_level(ioffset+ilevel)
    1087           0 :              IX_8 = int(scale(RX_8,-arr_max_shift),i8)
    1088           0 :              if (IX_8 .ne. 0_i8) then
    1089           0 :                i8_arr_gsum_level(ioffset+ilevel-1) = i8_arr_gsum_level(ioffset+ilevel-1) &
    1090           0 :                                                    + IX_8
    1091           0 :                IX_8 = IX_8*(i8_radix**arr_max_shift)
    1092           0 :                i8_arr_gsum_level(ioffset+ilevel)   = i8_arr_gsum_level(ioffset+ilevel)   &
    1093           0 :                                                    - IX_8
    1094             :              endif
    1095             :            enddo
    1096             : !  b) subtract +/- 1 from larger and add +/- 1 to smaller when necessary
    1097             : !     so that all vector components have the same sign (eliminating loss
    1098             : !     of accuracy arising from difference of large values when
    1099             : !     reconstructing r8 sum from integer vector)
    1100           0 :            ilevel = 0
    1101           0 :            do while ((i8_arr_gsum_level(ioffset+ilevel) .eq. 0_i8) &
    1102           0 :                      .and. (ilevel < max_levels(ifld)))
    1103           0 :              ilevel = ilevel + 1
    1104             :            enddo
    1105             : !
    1106           0 :            if (ilevel < max_levels(ifld)) then
    1107           0 :               if (i8_arr_gsum_level(ioffset+ilevel) > 0_i8) then
    1108           0 :                  i8_sign = 1_i8
    1109             :               else
    1110           0 :                  i8_sign = -1_i8
    1111             :               endif
    1112           0 :               do jlevel=ilevel,max_levels(ifld)-1
    1113           0 :                  if (sign(1_i8,i8_arr_gsum_level(ioffset+jlevel)) &
    1114           0 :                      .ne. sign(1_i8,i8_arr_gsum_level(ioffset+jlevel+1))) then
    1115           0 :                     i8_arr_gsum_level(ioffset+jlevel)   = i8_arr_gsum_level(ioffset+jlevel) &
    1116           0 :                                                         - i8_sign
    1117           0 :                     i8_arr_gsum_level(ioffset+jlevel+1) = i8_arr_gsum_level(ioffset+jlevel+1) &
    1118           0 :                                                         + i8_sign*(i8_radix**arr_max_shift)
    1119             :                  endif
    1120             :               enddo
    1121             :             endif
    1122             : 
    1123             : ! start with maximum shift, and work up to larger values
    1124           0 :             arr_shift = arr_gmax_exp(ifld) &
    1125           0 :                         - max_levels(ifld)*arr_max_shift
    1126           0 :             curr_exp = 0
    1127           0 :             first = .true.
    1128           0 :             do ilevel=max_levels(ifld),0,-1
    1129             : 
    1130           0 :                if (i8_arr_gsum_level(ioffset+ilevel) .ne. 0_i8) then
    1131           0 :                   jlevel = 1
    1132             : 
    1133             : ! r8 representation of higher order bits in integer
    1134           0 :                   X_8(jlevel) = i8_arr_gsum_level(ioffset+ilevel)
    1135           0 :                   LX(jlevel)  = exponent(X_8(jlevel))
    1136             : 
    1137             : ! calculate remainder
    1138           0 :                   IX_8 = int(X_8(jlevel),i8)
    1139           0 :                   RX_8 = (i8_arr_gsum_level(ioffset+ilevel) - IX_8)
    1140             : 
    1141             : ! repeat using remainder
    1142           0 :                   do while (RX_8 .ne. 0.0_r8)
    1143           0 :                      jlevel = jlevel + 1
    1144           0 :                      X_8(jlevel) = RX_8
    1145           0 :                      LX(jlevel) = exponent(RX_8)
    1146           0 :                      IX_8 = IX_8 + int(RX_8,i8)
    1147           0 :                      RX_8 = (i8_arr_gsum_level(ioffset+ilevel) - IX_8)
    1148             :                   enddo
    1149             : 
    1150             : ! add in contributions, smaller to larger, rescaling for each
    1151             : ! addition to guarantee that exponent of working summand is always
    1152             : ! larger than minexponent
    1153           0 :                   do while (jlevel > 0)
    1154           0 :                      if (first) then
    1155           0 :                         curr_exp = LX(jlevel) + arr_shift
    1156           0 :                         arr_gsum(ifld) = fraction(X_8(jlevel))
    1157           0 :                         first = .false.
    1158             :                      else
    1159           0 :                         corr_exp = curr_exp - (LX(jlevel) + arr_shift)
    1160           0 :                         arr_gsum(ifld) = fraction(X_8(jlevel)) &
    1161           0 :                                        + scale(arr_gsum(ifld),corr_exp)
    1162           0 :                         curr_exp = LX(jlevel) + arr_shift
    1163             :                      endif
    1164           0 :                      jlevel = jlevel - 1
    1165             :                   enddo
    1166             : 
    1167             :                endif
    1168             : 
    1169           0 :                arr_shift = arr_shift + arr_max_shift
    1170             :             enddo
    1171             : 
    1172             : ! apply final exponent correction, scaling first if exponent is too small
    1173             : ! to apply directly
    1174           0 :             corr_exp = curr_exp + exponent(arr_gsum(ifld))
    1175           0 :             if (corr_exp .ge. MINEXPONENT(1._r8)) then
    1176           0 :                arr_gsum(ifld) = set_exponent(arr_gsum(ifld),corr_exp)
    1177             :             else
    1178           0 :                RX_8 = set_exponent(arr_gsum(ifld), &
    1179           0 :                                    corr_exp-MINEXPONENT(1._r8))
    1180           0 :                arr_gsum(ifld) = scale(RX_8,MINEXPONENT(1._r8))
    1181             :             endif
    1182             : 
    1183             : ! if validate is .true. and some precision lost, test whether 'too much'
    1184             : !  was lost, due to too loose an upper bound, too stringent a limit on number
    1185             : !  of levels of expansion, cancellation, .... Calculated by comparing lower
    1186             : !  bound on number of sigificant digits with number of digits in 1.0_r8 .
    1187           0 :             if (validate) then
    1188           0 :                if (i8_arr_gsum_level(ioffset-voffset+2) .ne. 0_i8) then
    1189             : 
    1190             : ! find first nonzero level and use exponent for this level, then assume all
    1191             : ! subsequent levels contribute arr_max_shift digits.
    1192           0 :                   sum_digits = 0
    1193           0 :                   do ilevel=0,max_levels(ifld)
    1194           0 :                      if (sum_digits .eq. 0) then
    1195           0 :                         if (i8_arr_gsum_level(ioffset+ilevel) .ne. 0_i8) then
    1196           0 :                            X_8(1) = i8_arr_gsum_level(ioffset+ilevel)
    1197           0 :                            LX(1)  = exponent(X_8(1))
    1198           0 :                            sum_digits = LX(1)
    1199             :                         endif
    1200             :                      else
    1201           0 :                         sum_digits = sum_digits + arr_max_shift
    1202             :                      endif
    1203             :                   enddo
    1204             : 
    1205           0 :                   if (sum_digits < digits(1.0_r8)) then
    1206           0 :                      recompute = .true.
    1207             :                   endif
    1208             :                endif
    1209             :             endif
    1210             : 
    1211             :          endif
    1212             : 
    1213             :       enddo
    1214             : 
    1215             : 
    1216           0 :    end subroutine ice_reprosum_int
    1217             : 
    1218             : !========================================================================
    1219             : !
    1220             : ! Purpose:
    1221             : !> Test whether distributed sum exceeds tolerance and print out a
    1222             : !> warning message.
    1223             : !
    1224             : !----------------------------------------------------------------------
    1225             : 
    1226           0 :    logical function ice_reprosum_tolExceeded (name, nflds, master, &
    1227           0 :                                               logunit, rel_diff    )
    1228             : !----------------------------------------------------------------------
    1229             : 
    1230             : ! Arguments
    1231             : 
    1232             :       character(len=*), intent(in) :: name    !< distributed sum identifier
    1233             :       integer,  intent(in) :: nflds           !< number of fields
    1234             :       logical,  intent(in) :: master          !< process that will write
    1235             :                                               !<  warning messages?
    1236             :       integer, optional, intent(in) :: logunit!< unit warning messages
    1237             :                                               !<  written to
    1238             :       real(r8), intent(in) :: rel_diff(2,nflds)
    1239             :                                               !< relative and absolute
    1240             :                                               !<  differences between fixed
    1241             :                                               !<  and floating point sums
    1242             : 
    1243             : ! Local workspace
    1244             : 
    1245             :       integer  :: llogunit                    ! local log unit
    1246             :       integer  :: ifld                        ! field index
    1247             :       integer  :: exceeds_limit               ! number of fields whose
    1248             :                                               !  sum exceeds tolerance
    1249           0 :       real(r8) :: max_rel_diff                ! maximum relative difference
    1250             :       integer  :: max_rel_diff_idx            ! field index for max. rel. diff.
    1251           0 :       real(r8) :: max_abs_diff                ! maximum absolute difference
    1252             :       integer  :: max_abs_diff_idx            ! field index for max. abs. diff.
    1253             :       character(len=*),parameter :: subname = '(ice_reprosum_tolExceeded)'
    1254             : 
    1255             : !-----------------------------------------------------------------------
    1256             : 
    1257           0 :       ice_reprosum_tolExceeded = .false.
    1258           0 :       if (ice_reprosum_reldiffmax < 0.0_r8) return
    1259             : 
    1260           0 :       if ( present(logunit) ) then
    1261           0 :          llogunit = logunit
    1262             :       else
    1263           0 :          llogunit = nu_diag
    1264             :       endif
    1265             : 
    1266             :       ! check that "fast" reproducible sum is accurate enough.
    1267           0 :       exceeds_limit = 0
    1268           0 :       max_rel_diff = 0.0_r8
    1269           0 :       max_abs_diff = 0.0_r8
    1270           0 :       do ifld=1,nflds
    1271           0 :          if (rel_diff(1,ifld) > ice_reprosum_reldiffmax) then
    1272           0 :             exceeds_limit = exceeds_limit + 1
    1273           0 :             if (rel_diff(1,ifld) > max_rel_diff) then
    1274           0 :                max_rel_diff = rel_diff(1,ifld)
    1275           0 :                max_rel_diff_idx = ifld
    1276             :             endif
    1277           0 :             if (rel_diff(2,ifld) > max_abs_diff) then
    1278           0 :                max_abs_diff = rel_diff(2,ifld)
    1279           0 :                max_abs_diff_idx = ifld
    1280             :             endif
    1281             :          endif
    1282             :       enddo
    1283             : 
    1284           0 :       if (exceeds_limit > 0) then
    1285           0 :          if (master) then
    1286           0 :             write(llogunit,*) subname,trim(name), &
    1287             :                             ': difference in fixed and floating point sums ', &   ! LCOV_EXCL_LINE
    1288             :                             ' exceeds tolerance in ', exceeds_limit, &   ! LCOV_EXCL_LINE
    1289           0 :                             ' fields.'
    1290           0 :             write(llogunit,*) subname,'  Maximum relative diff: (rel)', &
    1291             :                             rel_diff(1,max_rel_diff_idx), ' (abs) ', &   ! LCOV_EXCL_LINE
    1292           0 :                             rel_diff(2,max_rel_diff_idx)
    1293           0 :             write(llogunit,*) subname,'  Maximum absolute diff: (rel)', &
    1294             :                             rel_diff(1,max_abs_diff_idx), ' (abs) ', &   ! LCOV_EXCL_LINE
    1295           0 :                             rel_diff(2,max_abs_diff_idx)
    1296             :          endif
    1297           0 :          ice_reprosum_tolExceeded = .true.
    1298             :        endif
    1299             : 
    1300             : 
    1301           0 :    end function ice_reprosum_tolExceeded
    1302             : 
    1303             : !========================================================================
    1304             : !
    1305             : ! Purpose:
    1306             : !> Compute the global sum of each field in "arr" using the indicated
    1307             : !> communicator with a reproducible yet scalable implementation based
    1308             : !> on He and Ding's implementation of the double-double algorithm.
    1309             : !
    1310             : !----------------------------------------------------------------------
    1311             : 
    1312           0 :    subroutine ice_reprosum_ddpdd (arr, arr_gsum, nsummands, dsummands,  &
    1313             :                                   nflds, mpi_comm                       )
    1314             : !----------------------------------------------------------------------
    1315             : 
    1316             : ! Arguments
    1317             : 
    1318             :       integer,  intent(in) :: nsummands  !< number of local summands
    1319             :       integer,  intent(in) :: dsummands  !< declared first dimension
    1320             :       integer,  intent(in) :: nflds      !< number of fields
    1321             :       real(r8), intent(in) :: arr(dsummands,nflds)
    1322             :                                          !< input array
    1323             :       integer,  intent(in) :: mpi_comm   !< MPI subcommunicator
    1324             : 
    1325             :       real(r8), intent(out):: arr_gsum(nflds)
    1326             :                                          !< global sums
    1327             : 
    1328             : 
    1329             : ! Local workspace
    1330             : 
    1331             :       integer :: old_cw                  ! for x86 processors, save
    1332             :                                          !  current arithmetic mode
    1333             :       integer :: ifld, isum              ! loop variables
    1334             :       integer :: ierr                    ! MPI error return
    1335             : 
    1336           0 :       real(r8)    :: e, t1, t2           ! temporaries
    1337           0 :       complex(r8) :: arr_lsum_dd(nflds)  ! local sums (in double-double
    1338             :                                          !  format)
    1339           0 :       complex(r8) :: arr_gsum_dd(nflds)  ! global sums (in double-double
    1340             :                                          !  format)
    1341             : 
    1342             :       integer, save :: mpi_sumdd
    1343             :       logical, save :: first_time = .true.
    1344             :       character(len=*),parameter :: subname = '(ice_reprosum_ddpdd)'
    1345             : 
    1346             : !-----------------------------------------------------------------------
    1347             : 
    1348           0 :       call ice_shr_reprosumx86_fix_start (old_cw)
    1349             : 
    1350           0 :       if (first_time) then
    1351             : #ifdef SERIAL_REMOVE_MPI
    1352             :          mpi_sumdd = 0
    1353             : #else
    1354           0 :          call mpi_op_create(ddpdd, .true., mpi_sumdd, ierr)
    1355             : #endif
    1356           0 :          first_time = .false.
    1357             :       endif
    1358             : 
    1359           0 :       do ifld=1,nflds
    1360           0 :          arr_lsum_dd(ifld) = (0.0_r8,0.0_r8)
    1361             : 
    1362           0 :          do isum=1,nsummands
    1363             : 
    1364             :             ! Compute arr(isum,ifld) + arr_lsum_dd(ifld) using Knuth''s
    1365             :             ! trick.
    1366           0 :             t1 = arr(isum,ifld) + real(arr_lsum_dd(ifld))
    1367           0 :             e  = t1 - arr(isum,ifld)
    1368           0 :             t2 = ((real(arr_lsum_dd(ifld)) - e) &
    1369             :                   + (arr(isum,ifld) - (t1 - e))) &   ! LCOV_EXCL_LINE
    1370           0 :                  + aimag(arr_lsum_dd(ifld))
    1371             : 
    1372             :             ! The result is t1 + t2, after normalization.
    1373           0 :             arr_lsum_dd(ifld) = cmplx ( t1 + t2, t2 - ((t1 + t2) - t1), r8 )
    1374             :          enddo
    1375             : 
    1376             :       enddo
    1377             : 
    1378             : #ifdef SERIAL_REMOVE_MPI
    1379             :       arr_gsum_dd = arr_lsum_dd
    1380             : #else
    1381             :       call mpi_allreduce (arr_lsum_dd, arr_gsum_dd, nflds, &
    1382           0 :                           MPI_COMPLEX16, mpi_sumdd, mpi_comm, ierr)
    1383             : #endif
    1384           0 :       do ifld=1,nflds
    1385           0 :          arr_gsum(ifld) = real(arr_gsum_dd(ifld))
    1386             :       enddo
    1387             : 
    1388           0 :       call ice_shr_reprosumx86_fix_end (old_cw)
    1389             : 
    1390           0 :    end subroutine ice_reprosum_ddpdd
    1391             : 
    1392             : !-----------------------------------------------------------------------
    1393             : 
    1394           0 :    subroutine DDPDD (dda, ddb, len, itype)
    1395             : !----------------------------------------------------------------------
    1396             : !
    1397             : ! Purpose:
    1398             : ! Modification of original codes written by David H. Bailey
    1399             : ! This subroutine computes ddb(i) = dda(i)+ddb(i)
    1400             : !
    1401             : !----------------------------------------------------------------------
    1402             : 
    1403             : ! Arguments
    1404             : 
    1405             :       integer, intent(in)        :: len       ! array length
    1406             :       complex(r8), intent(in)    :: dda(len)  ! input
    1407             :       complex(r8), intent(inout) :: ddb(len)  ! result
    1408             :       integer, intent(in)        :: itype     ! unused
    1409             : 
    1410             : ! Local workspace
    1411             : 
    1412           0 :       real(r8) e, t1, t2
    1413             :       integer i
    1414             :       character(len=*),parameter :: subname = '(ice_reprosum_mod:DDPDD)'
    1415             : 
    1416             : !-----------------------------------------------------------------------
    1417             : 
    1418           0 :       do i = 1, len
    1419             : !   Compute dda + ddb using Knuth's trick.
    1420           0 :          t1 = real(dda(i)) + real(ddb(i))
    1421           0 :          e  = t1 - real(dda(i))
    1422           0 :          t2 = ((real(ddb(i)) - e) + (real(dda(i)) - (t1 - e))) &
    1423           0 :               + aimag(dda(i)) + aimag(ddb(i))
    1424             : 
    1425             : !   The result is t1 + t2, after normalization.
    1426           0 :          ddb(i) = cmplx ( t1 + t2, t2 - ((t1 + t2) - t1), r8 )
    1427             :       enddo
    1428             : 
    1429             : 
    1430           0 :    end subroutine DDPDD
    1431             : 
    1432             : !-----------------------------------------------------------------------
    1433             : 
    1434           0 :   subroutine split_indices(total,num_pieces,ibeg,iend)
    1435             : !----------------------------------------------------------------------
    1436             : !
    1437             : ! Purpose:
    1438             : ! Split range into 'num_pieces'
    1439             : !
    1440             : !----------------------------------------------------------------------
    1441             : 
    1442             : ! Arguments
    1443             : 
    1444             :     integer, intent(in) :: total
    1445             :     integer, intent(in) :: num_pieces
    1446             :     integer, intent(out) :: ibeg(num_pieces), iend(num_pieces)
    1447             : 
    1448             : ! Local workspace
    1449             : 
    1450             :     integer :: itmp1, itmp2, ioffset, i
    1451             :     character(len=*),parameter :: subname = '(ice_reprosum_mod:split_indices)'
    1452             : 
    1453             : !-----------------------------------------------------------------------
    1454             : 
    1455           0 :     itmp1 = total/num_pieces
    1456           0 :     itmp2 = mod(total,num_pieces)
    1457           0 :     ioffset = 0
    1458           0 :     do i=1,itmp2
    1459           0 :        ibeg(i) = ioffset + 1
    1460           0 :        iend(i) = ioffset + (itmp1+1)
    1461           0 :        ioffset = iend(i)
    1462             :     enddo
    1463           0 :     do i=itmp2+1,num_pieces
    1464           0 :        ibeg(i) = ioffset + 1
    1465           0 :        if (ibeg(i) > total) then
    1466           0 :           iend(i) = ibeg(i) - 1
    1467             :        else
    1468           0 :           iend(i) = ioffset + itmp1
    1469           0 :           ioffset = iend(i)
    1470             :        endif
    1471             :     enddo
    1472             : 
    1473           0 :   end subroutine split_indices
    1474             : 
    1475             : !========================================================================
    1476             : 
    1477             : end module ice_reprosum

Generated by: LCOV version 1.14-6-g40580cd