LCOV - code coverage report
Current view: top level - cicecore/cicedyn/infrastructure/comm/serial - ice_reprosum.F90 (source / functions) Hit Total Coverage
Test: 231018-211459:8916b9ff2c:1:quick Lines: 0 398 0.00 %
Date: 2023-10-18 15:30:36 Functions: 0 7 0.00 %

          Line data    Source code
       1             : 
       2             : !> Reproducible sum method from P. Worley
       3             : #define SERIAL_REMOVE_MPI
       4             : 
       5             : MODULE ice_reprosum
       6             : 
       7             : !-----------------------------------------------------------------------
       8             : !
       9             : ! Purpose:
      10             : !> Compute reproducible global sums of a set of arrays across an MPI
      11             : !> subcommunicator
      12             : !
      13             : ! Methods:
      14             : !> Compute using either or both a scalable, reproducible algorithm and a
      15             : !> scalable, nonreproducible algorithm:
      16             : !> * Reproducible (scalable):
      17             : !>    Convert to fixed point (integer vector representation) to enable
      18             : !>    reproducibility when using MPI_Allreduce
      19             : !> * Alternative usually reproducible (scalable):
      20             : !>    Use parallel double-double algorithm due to Helen He and
      21             : !>    Chris Ding, based on David Bailey's/Don Knuth's DDPDD algorithm
      22             : !> * Nonreproducible (scalable):
      23             : !>    Floating point and MPI_Allreduce based.
      24             : !> If computing both reproducible and nonreproducible sums, compare
      25             : !> these and report relative difference (if absolute difference
      26             : !> less than sum) or absolute difference back to calling routine.
      27             : !
      28             : ! Author: P. Worley (based on suggestions from J. White for fixed
      29             : !                    point algorithm and on He/Ding paper for ddpdd
      30             : !                    algorithm)
      31             : !
      32             : ! Modified by T.Craig for CICE, March 2019 based on the public version in
      33             : ! Oasis3-MCT_4.0.
      34             : !
      35             : !-----------------------------------------------------------------------
      36             : 
      37             : !-----------------------------------------------------------------------
      38             : !- use statements ------------------------------------------------------
      39             : !-----------------------------------------------------------------------
      40             : #ifndef SERIAL_REMOVE_MPI
      41             : use mpi   ! MPI Fortran module
      42             : #endif
      43             : #if defined (NO_I8)
      44             :    ! Workaround for when shr_kind_i8 is not supported.
      45             :    use ice_kinds_mod, only: r8 => dbl_kind, i8 => int_kind
      46             : #else
      47             :    use ice_kinds_mod, only: r8 => dbl_kind, i8 => int8_kind
      48             : #endif
      49             :    use ice_kinds_mod, only: char_len_long
      50             :    use ice_fileunits, only: nu_diag
      51             :    use ice_exit,  only: abort_ice
      52             : 
      53             : !  internal timers not yet implemented, need to revisit if needed
      54             : !   use ice_mpi,   only: xicex_mpi_barrier
      55             : !   use ice_timer, only: xicex_timer_start, xicex_timer_stop
      56             : 
      57             : !-----------------------------------------------------------------------
      58             : !- module boilerplate --------------------------------------------------
      59             : !-----------------------------------------------------------------------
      60             :    implicit none
      61             :    private
      62             : 
      63             : !-----------------------------------------------------------------------
      64             : ! Public interfaces ----------------------------------------------------
      65             : !-----------------------------------------------------------------------
      66             :    public :: &
      67             :       ice_reprosum_setopts,        &! set runtime options   ! LCOV_EXCL_LINE
      68             :       ice_reprosum_calc,           &! calculate distributed sum   ! LCOV_EXCL_LINE
      69             :       ice_reprosum_tolExceeded      ! utility function to check relative
      70             :                                       !  differences against the tolerance
      71             : 
      72             : !-----------------------------------------------------------------------
      73             : ! Public data ----------------------------------------------------------
      74             : !-----------------------------------------------------------------------
      75             :    logical, public     :: ice_reprosum_recompute = .false.
      76             : 
      77             :    real(r8), public    :: ice_reprosum_reldiffmax = -1.0_r8
      78             : 
      79             : !-----------------------------------------------------------------------
      80             : ! Private interfaces ---------------------------------------------------
      81             : !-----------------------------------------------------------------------
      82             :    private :: &
      83             :       ddpdd,           &! double-double sum routine   ! LCOV_EXCL_LINE
      84             :       split_indices     ! split indices among OMP threads
      85             : 
      86             : !-----------------------------------------------------------------------
      87             : ! Private data ---------------------------------------------------------
      88             : !-----------------------------------------------------------------------
      89             : 
      90             :    logical :: repro_sum_use_ddpdd = .false.
      91             :    logical :: detailed_timing = .false.
      92             :    character(len=char_len_long) :: tmpstr
      93             : 
      94             :    CONTAINS
      95             : 
      96             : !========================================================================
      97             : !-----------------------------------------------------------------------
      98             : ! Purpose:
      99             : !> Set runtime options
     100             : ! Author: P. Worley
     101             : !-----------------------------------------------------------------------
     102             : 
     103           0 :    subroutine ice_reprosum_setopts(repro_sum_use_ddpdd_in,    &
     104             :                                    repro_sum_rel_diff_max_in, &   ! LCOV_EXCL_LINE
     105             :                                    repro_sum_recompute_in,    &   ! LCOV_EXCL_LINE
     106             :                                    repro_sum_master,          &   ! LCOV_EXCL_LINE
     107             :                                    repro_sum_logunit          )
     108             : 
     109             : !------------------------------Arguments--------------------------------
     110             :       logical, intent(in), optional :: repro_sum_use_ddpdd_in
     111             :       !< Use DDPDD algorithm instead of fixed precision algorithm
     112             :       real(r8), intent(in), optional :: repro_sum_rel_diff_max_in
     113             :       !< maximum permissible difference between reproducible and
     114             :       !< nonreproducible sums
     115             :       logical, intent(in), optional  :: repro_sum_recompute_in
     116             :       !< recompute using different algorithm when difference between
     117             :       !< reproducible and nonreproducible sums is too great
     118             :       logical, intent(in), optional  :: repro_sum_master
     119             :       !< flag indicating whether this process should output
     120             :       !< log messages
     121             :       integer, intent(in), optional  :: repro_sum_logunit
     122             :       !< unit number for log messages
     123             : !---------------------------Local Workspace-----------------------------
     124             :       integer llogunit           ! unit number for log messages
     125             :       logical master             ! local master?
     126             :       logical,save :: firstcall = .true.  ! first call
     127             :       character(len=*),parameter :: subname = '(ice_reprosum_setopts)'
     128             : !-----------------------------------------------------------------------
     129             : 
     130           0 :       if ( present(repro_sum_master) ) then
     131           0 :          master = repro_sum_master
     132             :       else
     133           0 :          master = .false.
     134             :       endif
     135             : 
     136           0 :       if ( present(repro_sum_logunit) ) then
     137           0 :          llogunit = repro_sum_logunit
     138             :       else
     139           0 :          llogunit = nu_diag
     140             :       endif
     141             : 
     142           0 :       if (.not. firstcall) then
     143           0 :          write(tmpstr,*) subname//' ERROR: can only be called once'
     144           0 :          call abort_ice(tmpstr,file=__FILE__,line=__LINE__)
     145             :       endif
     146           0 :       firstcall = .false.
     147             : 
     148           0 :       if ( present(repro_sum_use_ddpdd_in) ) then
     149           0 :          repro_sum_use_ddpdd = repro_sum_use_ddpdd_in
     150             :       endif
     151           0 :       if ( present(repro_sum_rel_diff_max_in) ) then
     152           0 :          ice_reprosum_reldiffmax = repro_sum_rel_diff_max_in
     153             :       endif
     154           0 :       if ( present(repro_sum_recompute_in) ) then
     155           0 :          ice_reprosum_recompute = repro_sum_recompute_in
     156             :       endif
     157           0 :       if (master) then
     158           0 :          if ( repro_sum_use_ddpdd ) then
     159           0 :             write(llogunit,*) subname, &
     160             :               'Using double-double-based (scalable) usually reproducible ', &   ! LCOV_EXCL_LINE
     161           0 :               'distributed sum algorithm'
     162             :          else
     163           0 :             write(llogunit,*) subname, &
     164             :               'Using fixed-point-based (scalable) reproducible ', &   ! LCOV_EXCL_LINE
     165           0 :               'distributed sum algorithm'
     166             :          endif
     167             : 
     168           0 :          if (ice_reprosum_reldiffmax >= 0._r8) then
     169           0 :             write(llogunit,*) subname, &
     170             :               ' with a maximum relative error tolerance of ', &   ! LCOV_EXCL_LINE
     171           0 :               ice_reprosum_reldiffmax
     172           0 :             if (ice_reprosum_recompute) then
     173           0 :                write(llogunit,*) subname, &
     174             :                  'If tolerance exceeded, sum is recomputed using ', &   ! LCOV_EXCL_LINE
     175           0 :                  'a serial algorithm.'
     176             :             else
     177           0 :                write(llogunit,*) subname, &
     178             :                  'If tolerance exceeded, fixed-precision is sum used ', &   ! LCOV_EXCL_LINE
     179           0 :                  'but a warning is output.'
     180             :             endif
     181             :          else
     182           0 :             write(llogunit,*) subname, &
     183           0 :               'and not comparing with floating point algorithms.'
     184             :          endif
     185             : 
     186             :       endif
     187           0 :    end subroutine ice_reprosum_setopts
     188             : 
     189             : !========================================================================
     190             : 
     191             : !
     192             : ! Purpose:
     193             : !> Compute the global sum of each field in "arr" using the indicated
     194             : !> communicator with a reproducible yet scalable implementation based
     195             : !> on a fixed point algorithm. An alternative is to use an "almost
     196             : !> always reproducible" floating point algorithm.
     197             : !
     198             : ! The accuracy of the fixed point algorithm is controlled by the
     199             : ! number of "levels" of integer expansion. The algorithm will calculate
     200             : ! the number of levels that is required for the sum to be essentially
     201             : ! exact. The optional parameter arr_max_levels can be used to override
     202             : ! the calculated value. The optional parameter arr_max_levels_out can be
     203             : ! used to return the values used.
     204             : !
     205             : ! The algorithm also requires an upper bound on
     206             : ! the maximum summand (in absolute value) for each field, and will
     207             : ! calculate this internally. However, if the optional parameters
     208             : ! arr_max_levels and arr_gbl_max are both set, then the algorithm will
     209             : ! use the values in arr_gbl_max for the upper bounds instead. If these
     210             : ! are not upper bounds, or if the upper bounds are not tight enough
     211             : ! to achieve the requisite accuracy, and if the optional parameter
     212             : ! repro_sum_validate is NOT set to .false., the algorithm will repeat the
     213             : ! computation with appropriate upper bounds. If only arr_gbl_max is present,
     214             : ! then the maxima are computed internally (and the specified values are
     215             : ! ignored). The optional parameter arr_gbl_max_out can be
     216             : ! used to return the values used.
     217             : !
     218             : ! Finally, the algorithm requires an upper bound on the number of
     219             : ! local summands across all processes. This will be calculated internally,
     220             : ! using an MPI collective, but the value in the optional argument
     221             : ! gbl_max_nsummands will be used instead if (1) it is present, (2)
     222             : ! it is > 0, and (3) the maximum value and required number of levels
     223             : ! are also specified. (If the maximum value is calculated, the same
     224             : ! MPI collective is used to determine the maximum number of local
     225             : ! summands.) The accuracy of the user-specified value is not checked.
     226             : ! However, if set to < 1, the value will instead be calculated. If the
     227             : ! optional parameter gbl_max_nsummands_out is present, then the value
     228             : ! used (gbl_max_nsummands if >= 1; calculated otherwise) will be
     229             : ! returned.
     230             : !
     231             : ! If requested (by setting ice_reprosum_reldiffmax >= 0.0 and passing in
     232             : ! the optional rel_diff parameter), results are compared with a
     233             : ! nonreproducible floating point algorithm.
     234             : !
     235             : ! Note that the cost of the algorithm is not strongly correlated with
     236             : ! the number of levels, which primarily shows up as a (modest) increase
     237             : ! in cost of the MPI_Allreduce as a function of vector length. Rather the
     238             : ! cost is more a function of (a) the number of integers required to
     239             : ! represent an individual summand and (b) the number of MPI_Allreduce
     240             : ! calls. The number of integers required to represent an individual
     241             : ! summand is 1 or 2 when using 8-byte integers for 8-byte real summands
     242             : ! when the number of local summands is not too large. As the number of
     243             : ! local summands increases, the number of integers required increases.
     244             : ! The number of MPI_Allreduce calls is either 2 (specifying nothing) or
     245             : ! 1 (specifying gbl_max_nsummands, arr_max_levels, and arr_gbl_max
     246             : ! correctly). When specifying arr_max_levels and arr_gbl_max
     247             : ! incorrectly, 3 or 4 MPI_Allreduce calls will be required.
     248             : !
     249             : ! The alternative algorithm is a minor modification of a parallel
     250             : ! implementation of David Bailey's routine DDPDD by Helen He
     251             : ! and Chris Ding. Bailey uses the Knuth trick to implement quadruple
     252             : ! precision summation of double precision values with 10 double
     253             : ! precision operations. The advantage of this algorithm is that
     254             : ! it requires a single MPI_Allreduce and is less expensive per summand
     255             : ! than is the fixed precision algorithm. The disadvantage is that it
     256             : ! is not guaranteed to be reproducible (though it is reproducible
     257             : ! much more often than is the standard algorithm). This alternative
     258             : ! is used when the optional parameter ddpdd_sum is set to .true. It is
     259             : ! also used if the fixed precision algorithm radix assumption does not
     260             : ! hold.
     261             : 
     262             : !----------------------------------------------------------------------
     263           0 :    subroutine ice_reprosum_calc (arr, arr_gsum, nsummands, dsummands,     &
     264             :                                  nflds, ddpdd_sum,                        &   ! LCOV_EXCL_LINE
     265             :                                  arr_gbl_max, arr_gbl_max_out,            &   ! LCOV_EXCL_LINE
     266             :                                  arr_max_levels, arr_max_levels_out,      &   ! LCOV_EXCL_LINE
     267             :                                  gbl_max_nsummands, gbl_max_nsummands_out,&   ! LCOV_EXCL_LINE
     268             :                                  gbl_count, repro_sum_validate,           &   ! LCOV_EXCL_LINE
     269           0 :                                  repro_sum_stats, rel_diff, commid        )
     270             : !----------------------------------------------------------------------
     271             : 
     272             : ! Arguments
     273             : 
     274             :       integer,  intent(in) :: nsummands  !< number of local summands
     275             :       integer,  intent(in) :: dsummands  !< declared first dimension
     276             :       integer,  intent(in) :: nflds      !< number of fields
     277             :       real(r8), intent(in) :: arr(dsummands,nflds)
     278             :                                          !< input array
     279             : 
     280             :       real(r8), intent(out):: arr_gsum(nflds)
     281             :                                          !< global means
     282             : 
     283             :       logical,  intent(in),    optional :: ddpdd_sum
     284             :                                          !< use ddpdd algorithm instead
     285             :                                          !< of fixed precision algorithm
     286             : 
     287             :       real(r8), intent(in),    optional :: arr_gbl_max(nflds)
     288             :                                          !< upper bound on max(abs(arr))
     289             : 
     290             :       real(r8), intent(out),   optional :: arr_gbl_max_out(nflds)
     291             :                                          !< calculated upper bound on
     292             :                                          !< max(abs(arr))
     293             : 
     294             :       integer,  intent(in),    optional :: arr_max_levels(nflds)
     295             :                                          !< maximum number of levels of
     296             :                                          !< integer expansion to use
     297             : 
     298             :       integer,  intent(out),   optional :: arr_max_levels_out(nflds)
     299             :                                          !< output of number of levels of
     300             :                                          !< integer expansion to used
     301             : 
     302             :       integer,  intent(in),    optional :: gbl_max_nsummands
     303             :                                          !< maximum of nsummand over all
     304             :                                          !< processes
     305             : 
     306             :       integer,  intent(out),   optional :: gbl_max_nsummands_out
     307             :                                          !< calculated maximum nsummands
     308             :                                          !< over all processes
     309             : 
     310             :       integer,  intent(in),    optional :: gbl_count
     311             :                                          !< was total number of summands;
     312             :                                          !< now is ignored; use
     313             :                                          !< gbl_max_nsummands instead
     314             : 
     315             :       logical,  intent(in),    optional :: repro_sum_validate
     316             :          !< flag enabling/disabling testing that gmax and  max_levels are
     317             :          !< accurate/sufficient. Default is enabled.
     318             : 
     319             :       integer,  intent(inout), optional :: repro_sum_stats(5)
     320             :                                    !< increment running totals for
     321             :                                    !<  (1) one-reduction repro_sum
     322             :                                    !<  (2) two-reduction repro_sum
     323             :                                    !<  (3) both types in one call
     324             :                                    !<  (4) nonrepro_sum
     325             :                                    !<  (5) global max nsummands reduction
     326             : 
     327             :       real(r8), intent(out),   optional :: rel_diff(2,nflds)
     328             :                                          !< relative and absolute
     329             :                                          !<  differences between fixed
     330             :                                          !<  and floating point sums
     331             : 
     332             :       integer,  intent(in),    optional :: commid
     333             :                                          !< MPI communicator
     334             : 
     335             : ! Local workspace
     336             : 
     337             :       logical :: use_ddpdd_sum           ! flag indicating whether to
     338             :                                          !  use ice_reprosum_ddpdd or not
     339             :       logical :: recompute               ! flag indicating need to
     340             :                                          !  determine gmax/gmin before
     341             :                                          !  computing sum
     342             :       logical :: validate                ! flag indicating need to
     343             :                                          !  verify gmax and max_levels
     344             :                                          !  are accurate/sufficient
     345             :       integer :: omp_nthreads            ! number of OpenMP threads
     346             :       integer :: mpi_comm                ! MPI subcommunicator
     347             :       integer :: tasks                   ! number of MPI processes
     348             :       integer :: mype                    ! MPI task rank
     349             :       integer :: ierr                    ! MPI error return
     350             :       integer :: ifld, isum, ithread     ! loop variables
     351             :       integer :: max_nsummands           ! max nsummands over all processes
     352             :                                          !  or threads (used in both ways)
     353             : 
     354           0 :       integer, allocatable :: isum_beg(:), isum_end(:)
     355             :                                          ! range of summand indices for each
     356             :                                          !  OpenMP thread
     357           0 :       integer, allocatable :: arr_tlmin_exp(:,:)
     358             :                                          ! per thread local exponent minima
     359           0 :       integer, allocatable :: arr_tlmax_exp(:,:)
     360             :                                          ! per thread local exponent maxima
     361             :       integer :: arr_exp, arr_exp_tlmin, arr_exp_tlmax
     362             :                                          ! summand exponent and working min/max
     363           0 :       integer :: arr_lmin_exp(nflds)     ! local exponent minima
     364           0 :       integer :: arr_lmax_exp(nflds)     ! local exponent maxima
     365           0 :       integer :: arr_lextremes(0:nflds,2)! local exponent extrema
     366           0 :       integer :: arr_gextremes(0:nflds,2)! global exponent extrema
     367             : 
     368           0 :       integer :: arr_gmax_exp(nflds)     ! global exponents maxima
     369           0 :       integer :: arr_gmin_exp(nflds)     ! global exponents minima
     370             :       integer :: arr_max_shift           ! maximum safe exponent for
     371             :                                          !  value < 1 (so that sum does
     372             :                                          !  not overflow)
     373           0 :       integer :: max_levels(nflds)       ! maximum number of levels of
     374             :                                          !  integer expansion to use
     375             :       integer :: max_level               ! maximum value in max_levels
     376             :       integer :: gbl_max_red             ! global max local sum reduction? (0/1)
     377             :       integer :: repro_sum_fast          ! 1 reduction repro_sum? (0/1)
     378             :       integer :: repro_sum_slow          ! 2 reduction repro_sum? (0/1)
     379             :       integer :: repro_sum_both          ! both fast and slow? (0/1)
     380             :       integer :: nonrepro_sum            ! nonrepro_sum? (0/1)
     381             : 
     382             :       real(r8) :: xmax_nsummands         ! dble of max_nsummands
     383           0 :       real(r8) :: arr_lsum(nflds)        ! local sums
     384           0 :       real(r8) :: arr_gsum_fast(nflds)   ! global sum calculated using
     385             :                                          !  fast, nonreproducible,
     386             :                                          !  floating point alg.
     387             :       real(r8) :: abs_diff               ! absolute difference between
     388             :                                          !  fixed and floating point
     389             :                                          !  sums
     390             : #ifdef _OPENMP
     391             :       integer omp_get_max_threads
     392             :       external omp_get_max_threads
     393             : #endif
     394             :       character(len=*),parameter :: subname = '(ice_reprosum_calc)'
     395             : 
     396             : !-----------------------------------------------------------------------
     397             : 
     398             : ! check whether should use ice_reprosum_ddpdd algorithm
     399           0 :       use_ddpdd_sum = repro_sum_use_ddpdd
     400           0 :       if ( present(ddpdd_sum) ) then
     401           0 :          use_ddpdd_sum = ddpdd_sum
     402             :       endif
     403             : 
     404             : ! check whether intrinsic-based algorithm will work on this system
     405             : ! (requires floating point and integer bases to be the same)
     406             : ! If not, always use ddpdd.
     407           0 :       use_ddpdd_sum = use_ddpdd_sum .or. (radix(0._r8) /= radix(0_i8))
     408             : 
     409             : ! initialize local statistics variables
     410           0 :       gbl_max_red = 0
     411           0 :       repro_sum_fast = 0
     412           0 :       repro_sum_slow = 0
     413           0 :       repro_sum_both = 0
     414           0 :       nonrepro_sum = 0
     415             : 
     416             : ! set MPI communicator
     417           0 :       if ( present(commid) ) then
     418           0 :          mpi_comm = commid
     419             :       else
     420             : #ifdef SERIAL_REMOVE_MPI
     421           0 :          mpi_comm = 0
     422             : #else
     423             :          mpi_comm = MPI_COMM_WORLD
     424             : #endif
     425             :       endif
     426             : 
     427             : !      if (detailed_timing) then
     428             : !         call xicex_timer_start('xicex_reprosum_prebarrier')
     429             : !         call xicex_mpi_barrier(mpi_comm,subname)
     430             : !         call xicex_timer_stop ('xicex_reprosum_prebarrier')
     431             : !      endif
     432             : 
     433           0 :       if ( use_ddpdd_sum ) then
     434             : 
     435             : !         if (detailed_timing) call xicex_timer_start('ice_reprosum_ddpdd')
     436             : 
     437             :          call ice_reprosum_ddpdd(arr, arr_gsum, nsummands, dsummands, &
     438           0 :                                  nflds, mpi_comm)
     439           0 :          repro_sum_fast = 1
     440             : 
     441             : !         if (detailed_timing) call xicex_timer_stop('ice_reprosum_ddpdd')
     442             : 
     443             :       else
     444             : 
     445             : !         if (detailed_timing) call xicex_timer_start('ice_reprosum_int')
     446             : 
     447             : ! get number of MPI tasks
     448             : #ifdef SERIAL_REMOVE_MPI
     449           0 :          tasks = 1
     450           0 :          mype = 0
     451             : #else
     452             :          call mpi_comm_size(mpi_comm, tasks, ierr)
     453             :          call mpi_comm_rank(mpi_comm, mype, ierr)
     454             : #endif
     455             : 
     456             : ! get number of OpenMP threads
     457             : #ifdef _OPENMP
     458             :          omp_nthreads = omp_get_max_threads()
     459             : #else
     460           0 :          omp_nthreads = 1
     461             : #endif
     462             : 
     463             : ! see if have sufficient information to not require max/min allreduce
     464           0 :          recompute = .true.
     465           0 :          validate = .false.
     466           0 :          if ( present(arr_gbl_max) .and. present(arr_max_levels) ) then
     467           0 :             recompute = .false.
     468             : 
     469             : ! setting lower bound on max_level*nflds to be 64 to improve OpenMP
     470             : ! performance for loopb in ice_reprosum_int
     471           0 :             max_level = (64/nflds) + 1
     472           0 :             do ifld=1,nflds
     473           0 :                if ((arr_gbl_max(ifld) .ge. 0.0_r8) .and. &
     474           0 :                    (arr_max_levels(ifld) > 0)) then
     475             : 
     476           0 :                   arr_gmax_exp(ifld)  = exponent(arr_gbl_max(ifld))
     477           0 :                   if (max_level < arr_max_levels(ifld)) &
     478           0 :                      max_level = arr_max_levels(ifld)
     479             : 
     480             :                else
     481           0 :                   recompute = .true.
     482             :                endif
     483             :             enddo
     484             : 
     485           0 :             if (.not. recompute) then
     486             : 
     487             : ! determine maximum number of summands in local phases of the
     488             : ! algorithm
     489             : !               if (detailed_timing) call xicex_timer_start("repro_sum_allr_max")
     490           0 :                if ( present(gbl_max_nsummands) ) then
     491           0 :                   if (gbl_max_nsummands < 1) then
     492             : #ifdef SERIAL_REMOVE_MPI
     493           0 :                      max_nsummands = nsummands
     494             : #else
     495             :                      call mpi_allreduce (nsummands, max_nsummands, 1, &
     496             :                                          MPI_INTEGER, MPI_MAX, mpi_comm, ierr)
     497             : #endif
     498           0 :                      gbl_max_red = 1
     499             :                   else
     500           0 :                      max_nsummands = gbl_max_nsummands
     501             :                   endif
     502             :                else
     503             : #ifdef SERIAL_REMOVE_MPI
     504           0 :                   max_nsummands = nsummands
     505             : #else
     506             :                   call mpi_allreduce (nsummands, max_nsummands, 1, &
     507             :                                       MPI_INTEGER, MPI_MAX, mpi_comm, ierr)
     508             : #endif
     509           0 :                   gbl_max_red = 1
     510             :                endif
     511             : !               if (detailed_timing) call xicex_timer_stop("repro_sum_allr_max")
     512             : 
     513             : ! determine maximum shift. Shift needs to be small enough that summation
     514             : !  does not exceed maximum number of digits in i8.
     515             : 
     516             : ! if requested, return max_nsummands before it is redefined
     517           0 :                if ( present( gbl_max_nsummands_out) ) then
     518           0 :                   gbl_max_nsummands_out = max_nsummands
     519             :                endif
     520             : 
     521             : ! summing within each thread first
     522           0 :                max_nsummands = (max_nsummands/omp_nthreads) + 1
     523             : ! then over threads and tasks
     524           0 :                max_nsummands = max(max_nsummands, tasks*omp_nthreads)
     525             : 
     526           0 :                xmax_nsummands = dble(max_nsummands)
     527           0 :                arr_max_shift = digits(0_i8) - (exponent(xmax_nsummands) + 1)
     528           0 :                if (arr_max_shift < 2) then
     529           0 :                   write(tmpstr,*) subname//' ERROR: number of summands too large for fixed precision algorithm'
     530           0 :                   call abort_ice(tmpstr,file=__FILE__,line=__LINE__)
     531             :                endif
     532             : 
     533             : ! calculate sum
     534           0 :                if (present(repro_sum_validate)) then
     535           0 :                   validate = repro_sum_validate
     536             :                else
     537           0 :                   validate = .true.
     538             :                endif
     539             :                call ice_reprosum_int(arr, arr_gsum, nsummands, dsummands, &
     540             :                                      nflds, arr_max_shift, arr_gmax_exp, &   ! LCOV_EXCL_LINE
     541             :                                      arr_max_levels, max_level, validate, &   ! LCOV_EXCL_LINE
     542           0 :                                      recompute, omp_nthreads, mpi_comm)
     543             : 
     544             : ! record statistics, etc.
     545           0 :                repro_sum_fast = 1
     546           0 :                if (recompute) then
     547           0 :                   repro_sum_both = 1
     548             :                else
     549             : ! if requested, return specified levels and upper bounds on maxima
     550           0 :                   if ( present(arr_max_levels_out) ) then
     551           0 :                      do ifld=1,nflds
     552           0 :                         arr_max_levels_out(ifld) = arr_max_levels(ifld)
     553             :                      enddo
     554             :                   endif
     555           0 :                   if ( present(arr_gbl_max_out) ) then
     556           0 :                      do ifld=1,nflds
     557           0 :                        arr_gbl_max_out(ifld) = arr_gbl_max(ifld)
     558             :                      enddo
     559             :                   endif
     560             :                endif
     561             :             endif
     562             :          endif
     563             : 
     564             : ! do not have sufficient information; calculate global max/min and
     565             : ! use to compute required number of levels
     566           0 :          if (recompute) then
     567             : 
     568             : ! record statistic
     569           0 :             repro_sum_slow = 1
     570             : 
     571             : ! determine maximum and minimum (non-zero) summand values and
     572             : ! maximum number of local summands
     573             : 
     574             : ! allocate thread-specific work space
     575           0 :             allocate(arr_tlmax_exp(nflds,omp_nthreads))
     576           0 :             allocate(arr_tlmin_exp(nflds,omp_nthreads))
     577           0 :             allocate(isum_beg(omp_nthreads))
     578           0 :             allocate(isum_end(omp_nthreads))
     579             : 
     580             : ! split summand index range over OpenMP threads
     581           0 :             call split_indices(nsummands, omp_nthreads, isum_beg, isum_end)
     582             : 
     583             : !$omp parallel do      &
     584             : !$omp default(shared)  &   ! LCOV_EXCL_LINE
     585             : !$omp private(ithread, ifld, isum, arr_exp, arr_exp_tlmin, arr_exp_tlmax)
     586           0 :             do ithread=1,omp_nthreads
     587             : !               if (detailed_timing) call xicex_timer_start('repro_sum_loopa')
     588           0 :                do ifld=1,nflds
     589           0 :                   arr_exp_tlmin = MAXEXPONENT(1._r8)
     590           0 :                   arr_exp_tlmax = MINEXPONENT(1._r8)
     591           0 :                   do isum=isum_beg(ithread),isum_end(ithread)
     592           0 :                      if (arr(isum,ifld) .ne. 0.0_r8) then
     593           0 :                         arr_exp = exponent(arr(isum,ifld))
     594           0 :                         arr_exp_tlmin = min(arr_exp,arr_exp_tlmin)
     595           0 :                         arr_exp_tlmax = max(arr_exp,arr_exp_tlmax)
     596             :                      endif
     597             :                   end do
     598           0 :                   arr_tlmin_exp(ifld,ithread) = arr_exp_tlmin
     599           0 :                   arr_tlmax_exp(ifld,ithread) = arr_exp_tlmax
     600             :                end do
     601             : !               if (detailed_timing) call xicex_timer_stop('repro_sum_loopa')
     602             :             end do
     603             : 
     604           0 :             do ifld=1,nflds
     605           0 :                arr_lmax_exp(ifld) = maxval(arr_tlmax_exp(ifld,:))
     606           0 :                arr_lmin_exp(ifld) = minval(arr_tlmin_exp(ifld,:))
     607             :             end do
     608           0 :             deallocate(arr_tlmin_exp,arr_tlmax_exp,isum_beg,isum_end)
     609             : 
     610           0 :             arr_lextremes(0,:) = -nsummands
     611           0 :             arr_lextremes(1:nflds,1) = -arr_lmax_exp(:)
     612           0 :             arr_lextremes(1:nflds,2) = arr_lmin_exp(:)
     613             : !            if (detailed_timing) call xicex_timer_start("repro_sum_allr_minmax")
     614             : #ifdef SERIAL_REMOVE_MPI
     615           0 :             arr_gextremes = arr_lextremes
     616             : #else
     617             :             call mpi_allreduce (arr_lextremes, arr_gextremes, 2*(nflds+1), &
     618             :                                 MPI_INTEGER, MPI_MIN, mpi_comm, ierr)
     619             : #endif
     620             : !            if (detailed_timing) call xicex_timer_stop("repro_sum_allr_minmax")
     621           0 :             max_nsummands   = -arr_gextremes(0,1)
     622           0 :             arr_gmax_exp(:) = -arr_gextremes(1:nflds,1)
     623           0 :             arr_gmin_exp(:) =  arr_gextremes(1:nflds,2)
     624             : 
     625             : ! if a field is identically zero, arr_gmin_exp still equals MAXEXPONENT
     626             : !   and arr_gmax_exp still equals MINEXPONENT. In this case, set
     627             : !   arr_gmin_exp = arr_gmax_exp = MINEXPONENT
     628           0 :             do ifld=1,nflds
     629           0 :                arr_gmin_exp(ifld) = min(arr_gmax_exp(ifld),arr_gmin_exp(ifld))
     630             :             enddo
     631             : 
     632             : ! if requested, return upper bounds on observed maxima
     633           0 :             if ( present(arr_gbl_max_out) ) then
     634           0 :                do ifld=1,nflds
     635           0 :                   arr_gbl_max_out(ifld) = scale(1.0_r8,arr_gmax_exp(ifld))
     636             :                enddo
     637             :             endif
     638             : 
     639             : ! if requested, return max_nsummands before it is redefined
     640           0 :             if ( present( gbl_max_nsummands_out) ) then
     641           0 :                gbl_max_nsummands_out = max_nsummands
     642             :             endif
     643             : 
     644             : ! determine maximum shift (same as in previous branch, but with calculated
     645             : !  max_nsummands). Shift needs to be small enough that summation does not
     646             : !  exceed maximum number of digits in i8.
     647             : 
     648             : ! summing within each thread first
     649           0 :             max_nsummands = (max_nsummands/omp_nthreads) + 1
     650             : ! then over threads and tasks
     651           0 :             max_nsummands = max(max_nsummands, tasks*omp_nthreads)
     652             : 
     653           0 :             xmax_nsummands = dble(max_nsummands)
     654           0 :             arr_max_shift = digits(0_i8) - (exponent(xmax_nsummands) + 1)
     655           0 :             if (arr_max_shift < 2) then
     656           0 :                write(tmpstr,*) subname//' ERROR: number of summands too large for fixed precision algorithm'
     657           0 :                call abort_ice(tmpstr,file=__FILE__,line=__LINE__)
     658             :             endif
     659             : 
     660             : ! determine maximum number of levels required for each field
     661             : ! ((digits(0_i8) + (arr_gmax_exp(ifld)-arr_gmin_exp(ifld))) / arr_max_shift)
     662             : ! + 1 because first truncation probably does not involve a maximal shift
     663             : ! + 1 to guarantee that the integer division rounds up (not down)
     664             : ! (setting lower bound on max_level*nflds to be 64 to improve OpenMP
     665             : !  performance for loopb in ice_reprosum_int)
     666           0 :             max_level = (64/nflds) + 1
     667           0 :             do ifld=1,nflds
     668             :                max_levels(ifld) = 2 + &
     669             :                 ((digits(0_i8) + (arr_gmax_exp(ifld)-arr_gmin_exp(ifld))) &   ! LCOV_EXCL_LINE
     670           0 :                 / arr_max_shift)
     671           0 :                if ( present(arr_max_levels) .and. (.not. validate) ) then
     672             : ! if validate true, then computation with arr_max_levels failed
     673             : !  previously
     674           0 :                   if ( arr_max_levels(ifld) > 0 ) then
     675             :                      max_levels(ifld) = &
     676           0 :                         min(arr_max_levels(ifld),max_levels(ifld))
     677             :                   endif
     678             :                endif
     679           0 :                if (max_level < max_levels(ifld)) &
     680           0 :                   max_level = max_levels(ifld)
     681             :             enddo
     682             : 
     683             : ! if requested, return calculated levels
     684           0 :             if ( present(arr_max_levels_out) ) then
     685           0 :                do ifld=1,nflds
     686           0 :                   arr_max_levels_out(ifld) = max_levels(ifld)
     687             :                enddo
     688             :             endif
     689             : 
     690             : ! calculate sum
     691           0 :             validate = .false.
     692             :             call ice_reprosum_int(arr, arr_gsum, nsummands, dsummands, nflds, &
     693             :                                   arr_max_shift, arr_gmax_exp, max_levels, &   ! LCOV_EXCL_LINE
     694             :                                   max_level, validate, recompute, &   ! LCOV_EXCL_LINE
     695           0 :                                   omp_nthreads, mpi_comm)
     696             : 
     697             :          endif
     698             : 
     699             : !         if (detailed_timing) call xicex_timer_stop('ice_reprosum_int')
     700             : 
     701             :       endif
     702             : 
     703             : ! compare fixed and floating point results
     704           0 :       if ( present(rel_diff) ) then
     705           0 :          if (ice_reprosum_reldiffmax >= 0.0_r8) then
     706             : 
     707             : !            if (detailed_timing) then
     708             : !               call xicex_timer_start('xicex_nonreprosum_prebarrier')
     709             : !               call xicex_mpi_barrier(mpi_comm,subname)
     710             : !               call xicex_timer_stop ('xicex_nonreprosum_prebarrier')
     711             : !            endif
     712             : 
     713             : !            if (detailed_timing) call xicex_timer_start('nonrepro_sum')
     714             : ! record statistic
     715           0 :             nonrepro_sum = 1
     716             : ! compute nonreproducible sum
     717           0 :             arr_lsum(:) = 0._r8
     718             : !$omp parallel do      &
     719             : !$omp default(shared)  &   ! LCOV_EXCL_LINE
     720             : !$omp private(ifld, isum)
     721           0 :             do ifld=1,nflds
     722           0 :                do isum=1,nsummands
     723           0 :                   arr_lsum(ifld) = arr(isum,ifld) + arr_lsum(ifld)
     724             :                end do
     725             :             end do
     726             : 
     727             : #ifdef SERIAL_REMOVE_MPI
     728           0 :             arr_gsum_fast = arr_lsum
     729             : #else
     730             :             call mpi_allreduce (arr_lsum, arr_gsum_fast, nflds, &
     731             :                                 MPI_REAL8, MPI_SUM, mpi_comm, ierr)
     732             : #endif
     733             : 
     734             : !            if (detailed_timing) call xicex_timer_stop('nonrepro_sum')
     735             : 
     736             : ! determine differences
     737             : !$omp parallel do      &
     738             : !$omp default(shared)  &   ! LCOV_EXCL_LINE
     739             : !$omp private(ifld, abs_diff)
     740           0 :             do ifld=1,nflds
     741           0 :                abs_diff = abs(arr_gsum_fast(ifld)-arr_gsum(ifld))
     742           0 :                if (abs(arr_gsum(ifld)) > abs_diff) then
     743           0 :                   rel_diff(1,ifld) = abs_diff/abs(arr_gsum(ifld))
     744             :                else
     745           0 :                   rel_diff(1,ifld) = abs_diff
     746             :                endif
     747           0 :                rel_diff(2,ifld) = abs_diff
     748             :             enddo
     749             :          else
     750           0 :             rel_diff(:,:) = 0.0_r8
     751             :          endif
     752             :       endif
     753             : 
     754             : ! return statistics
     755           0 :       if ( present(repro_sum_stats) ) then
     756           0 :          repro_sum_stats(1) = repro_sum_stats(1) + repro_sum_fast
     757           0 :          repro_sum_stats(2) = repro_sum_stats(2) + repro_sum_slow
     758           0 :          repro_sum_stats(3) = repro_sum_stats(3) + repro_sum_both
     759           0 :          repro_sum_stats(4) = repro_sum_stats(4) + nonrepro_sum
     760           0 :          repro_sum_stats(5) = repro_sum_stats(5) + gbl_max_red
     761             :       endif
     762             : 
     763             : 
     764           0 :    end subroutine ice_reprosum_calc
     765             : 
     766             : !========================================================================
     767             : !
     768             : ! Purpose:
     769             : !> Compute the global sum of each field in "arr" using the indicated
     770             : !> communicator with a reproducible yet scalable implementation based
     771             : !> on a fixed point algorithm. The accuracy of the fixed point algorithm
     772             : !> is controlled by the number of "levels" of integer expansion, the
     773             : !> maximum value of which is specified by max_level.
     774             : !
     775             : !----------------------------------------------------------------------
     776             : 
     777           0 :    subroutine ice_reprosum_int (arr, arr_gsum, nsummands, dsummands, nflds, &
     778             :                                 arr_max_shift, arr_gmax_exp, max_levels,    &   ! LCOV_EXCL_LINE
     779             :                                 max_level, validate, recompute,             &   ! LCOV_EXCL_LINE
     780             :                                 omp_nthreads, mpi_comm                      )
     781             : 
     782             : !----------------------------------------------------------------------
     783             : 
     784             : ! Arguments
     785             : 
     786             :       integer,  intent(in) :: nsummands     !< number of local summands
     787             :       integer,  intent(in) :: dsummands     !< declared first dimension
     788             :       integer,  intent(in) :: nflds         !< number of fields
     789             :       integer,  intent(in) :: arr_max_shift !< maximum safe exponent for
     790             :                                             !<  value < 1 (so that sum
     791             :                                             !<  does not overflow)
     792             :       integer,  intent(in) :: arr_gmax_exp(nflds)
     793             :                                             !< exponents of global maxima
     794             :       integer,  intent(in) :: max_levels(nflds)
     795             :                                             !< maximum number of levels
     796             :                                             !<  of integer expansion
     797             :       integer,  intent(in) :: max_level     !< maximum value in
     798             :                                             !<  max_levels
     799             :       integer,  intent(in) :: omp_nthreads  !< number of OpenMP threads
     800             :       integer,  intent(in) :: mpi_comm      !< MPI subcommunicator
     801             : 
     802             :       real(r8), intent(in) :: arr(dsummands,nflds)
     803             :                                              !< input array
     804             : 
     805             :       logical,  intent(in):: validate
     806             :          !< flag indicating that accuracy of solution generated from
     807             :          !< arr_gmax_exp and max_levels should be tested
     808             : 
     809             :       logical,  intent(out):: recompute
     810             :          !< flag indicating that either the upper bounds are inaccurate,
     811             :          !<  or max_levels and arr_gmax_exp do not generate accurate
     812             :          !<  enough sums
     813             : 
     814             :       real(r8), intent(out):: arr_gsum(nflds)      !< global means
     815             : 
     816             : ! Local workspace
     817             : 
     818             :       integer, parameter  :: max_jlevel = &
     819             :                                 1 + (digits(0_i8)/digits(0.0_r8))
     820             : 
     821           0 :       integer(i8) :: i8_arr_tlsum_level(0:max_level,nflds,omp_nthreads)
     822             :                                    ! integer vector representing local
     823             :                                    !  sum (per thread, per field)
     824           0 :       integer(i8) :: i8_arr_lsum_level((max_level+3)*nflds)
     825             :                                    ! integer vector representing local
     826             :                                    !  sum
     827             :       integer(i8) :: i8_arr_level  ! integer part of summand for current
     828             :                                    !  expansion level
     829           0 :       integer(i8) :: i8_arr_gsum_level((max_level+3)*nflds)
     830             :                                    ! integer vector representing global
     831             :                                    !  sum
     832             :       integer(i8) :: IX_8          ! integer representation of current
     833             :                                    !  jlevels of X_8 ('part' of
     834             :                                    !  i8_arr_gsum_level)
     835             :       integer(i8) :: i8_sign       ! sign global sum
     836             :       integer(i8) :: i8_radix      ! radix for i8 variables
     837             : 
     838           0 :       integer :: max_error(nflds,omp_nthreads)
     839             :                                    ! accurate upper bound on data?
     840           0 :       integer :: not_exact(nflds,omp_nthreads)
     841             :                                    ! max_levels sufficient to
     842             :                                    !  capture all digits?
     843           0 :       integer :: isum_beg(omp_nthreads), isum_end(omp_nthreads)
     844             :                                    ! range of summand indices for each
     845             :                                    !  OpenMP thread
     846             :       integer :: ifld, isum, ithread
     847             :                                    ! loop variables
     848             :       integer :: arr_exp           ! exponent of summand
     849             :       integer :: arr_shift         ! exponent used to generate integer
     850             :                                    !  for current expansion level
     851             :       integer :: ilevel            ! current integer expansion level
     852           0 :       integer :: offset(nflds)     ! beginning location in
     853             :                                    !  i8_arr_{g,l}sum_level for integer
     854             :                                    !  expansion of current ifld
     855             :       integer :: voffset           ! modification to offset used to
     856             :                                    !  include validation metrics
     857             :       integer :: ioffset           ! offset(ifld)
     858             :       integer :: jlevel            ! number of floating point 'pieces'
     859             :                                    !  extracted from a given i8 integer
     860             :       integer :: ierr              ! MPI error return
     861             :       integer :: LX(max_jlevel)    ! exponent of X_8 (see below)
     862             :       integer :: veclth            ! total length of i8_arr_lsum_level
     863             :       integer :: sum_digits        ! lower bound on number of significant
     864             :                                    !  in integer expansion of sum
     865             :       integer :: curr_exp          ! exponent of partial sum during
     866             :                                    !  reconstruction from integer vector
     867             :       integer :: corr_exp          ! exponent of current summand in
     868             :                                    !  reconstruction from integer vector
     869             : 
     870             :       real(r8) :: arr_frac         ! fraction of summand
     871             :       real(r8) :: arr_remainder    ! part of summand remaining after
     872             :                                    !  current level of integer expansion
     873             :       real(r8) :: X_8(max_jlevel)  ! r8 vector representation of current
     874             :                                    !  i8_arr_gsum_level
     875             :       real(r8) :: RX_8             ! r8 representation of difference
     876             :                                    !  between current i8_arr_gsum_level
     877             :                                    !  and current jlevels of X_8
     878             :                                    !  (== IX_8). Also used in final
     879             :                                    !  scaling step
     880             : 
     881             :       logical :: first             ! flag used to indicate that just
     882             :                                    !  beginning reconstruction of sum
     883             :                                    !  from integer vector
     884             : 
     885             :       character(len=*),parameter :: subname = '(ice_reprosum_int)'
     886             : 
     887             : !-----------------------------------------------------------------------
     888             : ! Save radix of i8 variables in an i8 variable
     889           0 :       i8_radix = radix(IX_8)
     890             : 
     891             : ! If validating upper bounds, reserve space for validation metrics
     892             : ! In both cases, reserve an extra level for overflow from the top level
     893           0 :       if (validate) then
     894           0 :         voffset = 3
     895             :       else
     896           0 :         voffset = 1
     897             :       endif
     898             : 
     899             : ! compute offsets for each field
     900           0 :       offset(1) = voffset
     901           0 :       do ifld=2,nflds
     902             :          offset(ifld) = offset(ifld-1) &
     903           0 :                         + (max_levels(ifld-1) + voffset)
     904             :       enddo
     905           0 :       veclth = offset(nflds) + max_levels(nflds)
     906             : 
     907             : ! split summand index range over OpenMP threads
     908           0 :       call split_indices(nsummands, omp_nthreads, isum_beg, isum_end)
     909             : 
     910             : ! convert local summands to vector of integers and sum
     911             : ! (Using scale instead of set_exponent because arr_remainder may not be
     912             : !  "normal" after level 1 calculation)
     913           0 :       i8_arr_lsum_level(:) = 0_i8
     914             : 
     915             : !$omp parallel do      &
     916             : !$omp default(shared)  &   ! LCOV_EXCL_LINE
     917             : !$omp private(ithread, ifld, ioffset, isum, arr_frac, arr_exp, &   ! LCOV_EXCL_LINE
     918             : !$omp         arr_shift, ilevel, i8_arr_level, arr_remainder, RX_8, IX_8)
     919           0 :       do ithread=1,omp_nthreads
     920             : !       if (detailed_timing) call xicex_timer_start('repro_sum_loopb')
     921           0 :        do ifld=1,nflds
     922           0 :           ioffset = offset(ifld)
     923             : 
     924           0 :           max_error(ifld,ithread) = 0
     925           0 :           not_exact(ifld,ithread) = 0
     926             : 
     927           0 :           i8_arr_tlsum_level(:,ifld,ithread) = 0_i8
     928           0 :           do isum=isum_beg(ithread),isum_end(ithread)
     929           0 :             arr_remainder = 0.0_r8
     930             : 
     931           0 :             if (arr(isum,ifld) .ne. 0.0_r8) then
     932           0 :                arr_exp   = exponent(arr(isum,ifld))
     933           0 :                arr_frac  = fraction(arr(isum,ifld))
     934             : 
     935             : ! test that global maximum upper bound is an upper bound
     936           0 :                if (arr_exp > arr_gmax_exp(ifld)) then
     937           0 :                   max_error(ifld,ithread) = 1
     938           0 :                   exit
     939             :                endif
     940             : 
     941             : ! calculate first shift
     942           0 :                arr_shift = arr_max_shift - (arr_gmax_exp(ifld)-arr_exp)
     943             : 
     944             : ! determine first (probably) nonzero level (assuming initial fraction is
     945             : !  'normal' - algorithm still works if this is not true)
     946             : !  NOTE: this is critical; scale will set to zero if min exponent is too small.
     947           0 :                if (arr_shift < 1) then
     948           0 :                   ilevel = (1 + (arr_gmax_exp(ifld)-arr_exp))/arr_max_shift
     949           0 :                   arr_shift = ilevel*arr_max_shift - (arr_gmax_exp(ifld)-arr_exp)
     950             : 
     951           0 :                   do while (arr_shift < 1)
     952           0 :                      arr_shift = arr_shift + arr_max_shift
     953           0 :                      ilevel = ilevel + 1
     954             :                   enddo
     955             :                else
     956           0 :                   ilevel = 1
     957             :                endif
     958             : 
     959           0 :                if (ilevel .le. max_levels(ifld)) then
     960             : ! apply first shift/truncate, add it to the relevant running
     961             : ! sum, and calculate the remainder.
     962           0 :                   arr_remainder = scale(arr_frac,arr_shift)
     963           0 :                   i8_arr_level = int(arr_remainder,i8)
     964             :                   i8_arr_tlsum_level(ilevel,ifld,ithread) = &
     965           0 :                      i8_arr_tlsum_level(ilevel,ifld,ithread) + i8_arr_level
     966           0 :                   arr_remainder = arr_remainder - i8_arr_level
     967             : 
     968             : ! while the remainder is non-zero, continue to shift, truncate,
     969             : ! sum, and calculate new remainder
     970             :                   do while ((arr_remainder .ne. 0.0_r8) &
     971           0 :                      .and. (ilevel < max_levels(ifld)))
     972           0 :                      ilevel = ilevel + 1
     973           0 :                      arr_remainder = scale(arr_remainder,arr_max_shift)
     974           0 :                      i8_arr_level = int(arr_remainder,i8)
     975             :                      i8_arr_tlsum_level(ilevel,ifld,ithread) = &
     976           0 :                         i8_arr_tlsum_level(ilevel,ifld,ithread) + i8_arr_level
     977           0 :                      arr_remainder = arr_remainder - i8_arr_level
     978             :                   enddo
     979             : 
     980             :                endif
     981             :             endif
     982             : 
     983           0 :             if (arr_remainder .ne. 0.0_r8) then
     984           0 :                not_exact(ifld,ithread) = 1
     985             :             endif
     986             : 
     987             :           enddo
     988             : 
     989             : ! postprocess integer vector to eliminate potential for overlap in the following
     990             : ! sums over threads and processes: if value larger than or equal to
     991             : ! (radix(IX_8)**arr_max_shift), add this 'overlap' to next larger integer in
     992             : ! vector, resulting in nonoverlapping ranges for each component. Note that
     993             : ! "ilevel-1==0" corresponds to an extra level used to guarantee that the sums
     994             : ! over threads and processes do not overflow for ilevel==1.
     995           0 :           do ilevel=max_levels(ifld),1,-1
     996           0 :              RX_8 = i8_arr_tlsum_level(ilevel,ifld,ithread)
     997           0 :              IX_8 = int(scale(RX_8,-arr_max_shift),i8)
     998           0 :              if (IX_8 .ne. 0_i8) then
     999             :                 i8_arr_tlsum_level(ilevel-1,ifld,ithread) = &
    1000           0 :                    i8_arr_tlsum_level(ilevel-1,ifld,ithread) + IX_8
    1001           0 :                 IX_8 = IX_8*(i8_radix**arr_max_shift)
    1002             :                 i8_arr_tlsum_level(ilevel,ifld,ithread)   = &
    1003           0 :                    i8_arr_tlsum_level(ilevel,ifld,ithread) - IX_8
    1004             :              endif
    1005             :           enddo
    1006             :        enddo
    1007             : !       if (detailed_timing) call xicex_timer_stop('repro_sum_loopb')
    1008             :       enddo
    1009             : 
    1010             : ! sum contributions from different threads
    1011           0 :       do ifld=1,nflds
    1012           0 :          ioffset = offset(ifld)
    1013           0 :          do ithread = 1,omp_nthreads
    1014           0 :             do ilevel = 0,max_levels(ifld)
    1015             :                i8_arr_lsum_level(ioffset+ilevel) = &
    1016             :                   i8_arr_lsum_level(ioffset+ilevel) &   ! LCOV_EXCL_LINE
    1017           0 :                   + i8_arr_tlsum_level(ilevel,ifld,ithread)
    1018             :             enddo
    1019             :          enddo
    1020             :       enddo
    1021             : 
    1022             : ! record if upper bound was inaccurate or if level expansion stopped
    1023             : ! before full accuracy was achieved
    1024           0 :       if (validate) then
    1025           0 :          do ifld=1,nflds
    1026           0 :             ioffset = offset(ifld)
    1027           0 :             i8_arr_lsum_level(ioffset-voffset+1) = maxval(max_error(ifld,:))
    1028           0 :             i8_arr_lsum_level(ioffset-voffset+2) = maxval(not_exact(ifld,:))
    1029             :          enddo
    1030             :       endif
    1031             : 
    1032             : ! sum integer vector element-wise
    1033             : #ifdef SERIAL_REMOVE_MPI
    1034           0 :       i8_arr_gsum_level = i8_arr_lsum_level
    1035             : #else
    1036             : #if defined (NO_I8)
    1037             :      ! Workaround for when i8 is not supported.
    1038             : !      if (detailed_timing) call xicex_timer_start("repro_sum_allr_i4")
    1039             :       call mpi_allreduce (i8_arr_lsum_level, i8_arr_gsum_level, &
    1040             :                           veclth, MPI_INTEGER, MPI_SUM, mpi_comm, ierr)
    1041             : !      if (detailed_timing) call xicex_timer_stop("repro_sum_allr_i4")
    1042             : #else
    1043             : !      if (detailed_timing) call xicex_timer_start("repro_sum_allr_i8")
    1044             :       call mpi_allreduce (i8_arr_lsum_level, i8_arr_gsum_level, &
    1045             :                           veclth, MPI_INTEGER8, MPI_SUM, mpi_comm, ierr)
    1046             : !      if (detailed_timing) call xicex_timer_stop("repro_sum_allr_i8")
    1047             : #endif
    1048             : #endif
    1049             : 
    1050             : ! Construct global sum from integer vector representation:
    1051             : !  1) arr_max_shift is the shift applied to fraction(arr_gmax) .
    1052             : !   When shifting back, need to "add back in" true arr_gmax exponent. This was
    1053             : !   removed implicitly by working only with the fraction .
    1054             : !  2) want to add levels into sum in reverse order (smallest to largest). However,
    1055             : !   even this can generate floating point rounding errors if signs of integers
    1056             : !   alternate. To avoid this, do some arithmetic with integer vectors so that all
    1057             : !   components have the same sign. This should keep relative difference between
    1058             : !   using different integer sizes (e.g. i8 and i4) to machine epsilon
    1059             : !  3) assignment to X_8 will usually lose accuracy since maximum integer
    1060             : !   size is greater than the max number of 'digits' in r8 value (if xmax_nsummands
    1061             : !   correction not very large). Calculate remainder and add in first (since
    1062             : !   smaller). One correction is sufficient for r8 (53 digits) and i8 (63 digits).
    1063             : !   For r4 (24 digits) may need to correct twice. Code is written in a general
    1064             : !   fashion, to work no matter how many corrections are necessary (assuming
    1065             : !   max_jlevel parameter calculation is correct).
    1066             : 
    1067           0 :       recompute = .false.
    1068           0 :       do ifld=1,nflds
    1069           0 :          arr_gsum(ifld) = 0.0_r8
    1070           0 :          ioffset = offset(ifld)
    1071             : 
    1072             : ! if validate is .true., test whether the summand upper bound
    1073             : !  was exceeded on any of the processes
    1074           0 :          if (validate) then
    1075           0 :             if (i8_arr_gsum_level(ioffset-voffset+1) .ne. 0_i8) then
    1076           0 :                recompute = .true.
    1077             :             endif
    1078             :          endif
    1079             : 
    1080           0 :          if (.not. recompute) then
    1081             : 
    1082             : ! preprocess integer vector:
    1083             : !  a) if value larger than or equal to (radix(IX_8)**arr_max_shift), add this 'overlap'
    1084             : !     to next larger integer in vector, resulting in nonoverlapping ranges for each
    1085             : !     component. Note that have "ilevel-1=0" level here as described above.
    1086           0 :            do ilevel=max_levels(ifld),1,-1
    1087           0 :              RX_8 = i8_arr_gsum_level(ioffset+ilevel)
    1088           0 :              IX_8 = int(scale(RX_8,-arr_max_shift),i8)
    1089           0 :              if (IX_8 .ne. 0_i8) then
    1090             :                i8_arr_gsum_level(ioffset+ilevel-1) = i8_arr_gsum_level(ioffset+ilevel-1) &
    1091           0 :                                                    + IX_8
    1092           0 :                IX_8 = IX_8*(i8_radix**arr_max_shift)
    1093             :                i8_arr_gsum_level(ioffset+ilevel)   = i8_arr_gsum_level(ioffset+ilevel)   &
    1094           0 :                                                    - IX_8
    1095             :              endif
    1096             :            enddo
    1097             : !  b) subtract +/- 1 from larger and add +/- 1 to smaller when necessary
    1098             : !     so that all vector components have the same sign (eliminating loss
    1099             : !     of accuracy arising from difference of large values when
    1100             : !     reconstructing r8 sum from integer vector)
    1101           0 :            ilevel = 0
    1102             :            do while ((i8_arr_gsum_level(ioffset+ilevel) .eq. 0_i8) &
    1103           0 :                      .and. (ilevel < max_levels(ifld)))
    1104           0 :              ilevel = ilevel + 1
    1105             :            enddo
    1106             : !
    1107           0 :            if (ilevel < max_levels(ifld)) then
    1108           0 :               if (i8_arr_gsum_level(ioffset+ilevel) > 0_i8) then
    1109           0 :                  i8_sign = 1_i8
    1110             :               else
    1111           0 :                  i8_sign = -1_i8
    1112             :               endif
    1113           0 :               do jlevel=ilevel,max_levels(ifld)-1
    1114           0 :                  if (sign(1_i8,i8_arr_gsum_level(ioffset+jlevel)) &
    1115           0 :                      .ne. sign(1_i8,i8_arr_gsum_level(ioffset+jlevel+1))) then
    1116             :                     i8_arr_gsum_level(ioffset+jlevel)   = i8_arr_gsum_level(ioffset+jlevel) &
    1117           0 :                                                         - i8_sign
    1118             :                     i8_arr_gsum_level(ioffset+jlevel+1) = i8_arr_gsum_level(ioffset+jlevel+1) &
    1119           0 :                                                         + i8_sign*(i8_radix**arr_max_shift)
    1120             :                  endif
    1121             :               enddo
    1122             :             endif
    1123             : 
    1124             : ! start with maximum shift, and work up to larger values
    1125             :             arr_shift = arr_gmax_exp(ifld) &
    1126           0 :                         - max_levels(ifld)*arr_max_shift
    1127           0 :             curr_exp = 0
    1128           0 :             first = .true.
    1129           0 :             do ilevel=max_levels(ifld),0,-1
    1130             : 
    1131           0 :                if (i8_arr_gsum_level(ioffset+ilevel) .ne. 0_i8) then
    1132           0 :                   jlevel = 1
    1133             : 
    1134             : ! r8 representation of higher order bits in integer
    1135           0 :                   X_8(jlevel) = i8_arr_gsum_level(ioffset+ilevel)
    1136           0 :                   LX(jlevel)  = exponent(X_8(jlevel))
    1137             : 
    1138             : ! calculate remainder
    1139           0 :                   IX_8 = int(X_8(jlevel),i8)
    1140           0 :                   RX_8 = (i8_arr_gsum_level(ioffset+ilevel) - IX_8)
    1141             : 
    1142             : ! repeat using remainder
    1143           0 :                   do while (RX_8 .ne. 0.0_r8)
    1144           0 :                      jlevel = jlevel + 1
    1145           0 :                      X_8(jlevel) = RX_8
    1146           0 :                      LX(jlevel) = exponent(RX_8)
    1147           0 :                      IX_8 = IX_8 + int(RX_8,i8)
    1148           0 :                      RX_8 = (i8_arr_gsum_level(ioffset+ilevel) - IX_8)
    1149             :                   enddo
    1150             : 
    1151             : ! add in contributions, smaller to larger, rescaling for each
    1152             : ! addition to guarantee that exponent of working summand is always
    1153             : ! larger than minexponent
    1154           0 :                   do while (jlevel > 0)
    1155           0 :                      if (first) then
    1156           0 :                         curr_exp = LX(jlevel) + arr_shift
    1157           0 :                         arr_gsum(ifld) = fraction(X_8(jlevel))
    1158           0 :                         first = .false.
    1159             :                      else
    1160           0 :                         corr_exp = curr_exp - (LX(jlevel) + arr_shift)
    1161             :                         arr_gsum(ifld) = fraction(X_8(jlevel)) &
    1162           0 :                                        + scale(arr_gsum(ifld),corr_exp)
    1163           0 :                         curr_exp = LX(jlevel) + arr_shift
    1164             :                      endif
    1165           0 :                      jlevel = jlevel - 1
    1166             :                   enddo
    1167             : 
    1168             :                endif
    1169             : 
    1170           0 :                arr_shift = arr_shift + arr_max_shift
    1171             :             enddo
    1172             : 
    1173             : ! apply final exponent correction, scaling first if exponent is too small
    1174             : ! to apply directly
    1175           0 :             corr_exp = curr_exp + exponent(arr_gsum(ifld))
    1176           0 :             if (corr_exp .ge. MINEXPONENT(1._r8)) then
    1177           0 :                arr_gsum(ifld) = set_exponent(arr_gsum(ifld),corr_exp)
    1178             :             else
    1179             :                RX_8 = set_exponent(arr_gsum(ifld), &
    1180           0 :                                    corr_exp-MINEXPONENT(1._r8))
    1181           0 :                arr_gsum(ifld) = scale(RX_8,MINEXPONENT(1._r8))
    1182             :             endif
    1183             : 
    1184             : ! if validate is .true. and some precision lost, test whether 'too much'
    1185             : !  was lost, due to too loose an upper bound, too stringent a limit on number
    1186             : !  of levels of expansion, cancellation, .... Calculated by comparing lower
    1187             : !  bound on number of sigificant digits with number of digits in 1.0_r8 .
    1188           0 :             if (validate) then
    1189           0 :                if (i8_arr_gsum_level(ioffset-voffset+2) .ne. 0_i8) then
    1190             : 
    1191             : ! find first nonzero level and use exponent for this level, then assume all
    1192             : ! subsequent levels contribute arr_max_shift digits.
    1193           0 :                   sum_digits = 0
    1194           0 :                   do ilevel=0,max_levels(ifld)
    1195           0 :                      if (sum_digits .eq. 0) then
    1196           0 :                         if (i8_arr_gsum_level(ioffset+ilevel) .ne. 0_i8) then
    1197           0 :                            X_8(1) = i8_arr_gsum_level(ioffset+ilevel)
    1198           0 :                            LX(1)  = exponent(X_8(1))
    1199           0 :                            sum_digits = LX(1)
    1200             :                         endif
    1201             :                      else
    1202           0 :                         sum_digits = sum_digits + arr_max_shift
    1203             :                      endif
    1204             :                   enddo
    1205             : 
    1206           0 :                   if (sum_digits < digits(1.0_r8)) then
    1207           0 :                      recompute = .true.
    1208             :                   endif
    1209             :                endif
    1210             :             endif
    1211             : 
    1212             :          endif
    1213             : 
    1214             :       enddo
    1215             : 
    1216             : 
    1217           0 :    end subroutine ice_reprosum_int
    1218             : 
    1219             : !========================================================================
    1220             : !
    1221             : ! Purpose:
    1222             : !> Test whether distributed sum exceeds tolerance and print out a
    1223             : !> warning message.
    1224             : !
    1225             : !----------------------------------------------------------------------
    1226             : 
    1227           0 :    logical function ice_reprosum_tolExceeded (name, nflds, master, &
    1228           0 :                                               logunit, rel_diff    )
    1229             : !----------------------------------------------------------------------
    1230             : 
    1231             : ! Arguments
    1232             : 
    1233             :       character(len=*), intent(in) :: name    !< distributed sum identifier
    1234             :       integer,  intent(in) :: nflds           !< number of fields
    1235             :       logical,  intent(in) :: master          !< process that will write
    1236             :                                               !<  warning messages?
    1237             :       integer, optional, intent(in) :: logunit!< unit warning messages
    1238             :                                               !<  written to
    1239             :       real(r8), intent(in) :: rel_diff(2,nflds)
    1240             :                                               !< relative and absolute
    1241             :                                               !<  differences between fixed
    1242             :                                               !<  and floating point sums
    1243             : 
    1244             : ! Local workspace
    1245             : 
    1246             :       integer  :: llogunit                    ! local log unit
    1247             :       integer  :: ifld                        ! field index
    1248             :       integer  :: exceeds_limit               ! number of fields whose
    1249             :                                               !  sum exceeds tolerance
    1250             :       real(r8) :: max_rel_diff                ! maximum relative difference
    1251             :       integer  :: max_rel_diff_idx            ! field index for max. rel. diff.
    1252             :       real(r8) :: max_abs_diff                ! maximum absolute difference
    1253             :       integer  :: max_abs_diff_idx            ! field index for max. abs. diff.
    1254             :       character(len=*),parameter :: subname = '(ice_reprosum_tolExceeded)'
    1255             : 
    1256             : !-----------------------------------------------------------------------
    1257             : 
    1258           0 :       ice_reprosum_tolExceeded = .false.
    1259           0 :       if (ice_reprosum_reldiffmax < 0.0_r8) return
    1260             : 
    1261           0 :       if ( present(logunit) ) then
    1262           0 :          llogunit = logunit
    1263             :       else
    1264           0 :          llogunit = nu_diag
    1265             :       endif
    1266             : 
    1267             :       ! check that "fast" reproducible sum is accurate enough.
    1268           0 :       exceeds_limit = 0
    1269           0 :       max_rel_diff = 0.0_r8
    1270           0 :       max_abs_diff = 0.0_r8
    1271           0 :       do ifld=1,nflds
    1272           0 :          if (rel_diff(1,ifld) > ice_reprosum_reldiffmax) then
    1273           0 :             exceeds_limit = exceeds_limit + 1
    1274           0 :             if (rel_diff(1,ifld) > max_rel_diff) then
    1275           0 :                max_rel_diff = rel_diff(1,ifld)
    1276           0 :                max_rel_diff_idx = ifld
    1277             :             endif
    1278           0 :             if (rel_diff(2,ifld) > max_abs_diff) then
    1279           0 :                max_abs_diff = rel_diff(2,ifld)
    1280           0 :                max_abs_diff_idx = ifld
    1281             :             endif
    1282             :          endif
    1283             :       enddo
    1284             : 
    1285           0 :       if (exceeds_limit > 0) then
    1286           0 :          if (master) then
    1287           0 :             write(llogunit,*) subname,trim(name), &
    1288             :                             ': difference in fixed and floating point sums ', &   ! LCOV_EXCL_LINE
    1289             :                             ' exceeds tolerance in ', exceeds_limit, &   ! LCOV_EXCL_LINE
    1290           0 :                             ' fields.'
    1291           0 :             write(llogunit,*) subname,'  Maximum relative diff: (rel)', &
    1292             :                             rel_diff(1,max_rel_diff_idx), ' (abs) ', &   ! LCOV_EXCL_LINE
    1293           0 :                             rel_diff(2,max_rel_diff_idx)
    1294           0 :             write(llogunit,*) subname,'  Maximum absolute diff: (rel)', &
    1295             :                             rel_diff(1,max_abs_diff_idx), ' (abs) ', &   ! LCOV_EXCL_LINE
    1296           0 :                             rel_diff(2,max_abs_diff_idx)
    1297             :          endif
    1298           0 :          ice_reprosum_tolExceeded = .true.
    1299             :        endif
    1300             : 
    1301             : 
    1302           0 :    end function ice_reprosum_tolExceeded
    1303             : 
    1304             : !========================================================================
    1305             : !
    1306             : ! Purpose:
    1307             : !> Compute the global sum of each field in "arr" using the indicated
    1308             : !> communicator with a reproducible yet scalable implementation based
    1309             : !> on He and Ding's implementation of the double-double algorithm.
    1310             : !
    1311             : !----------------------------------------------------------------------
    1312             : 
    1313           0 :    subroutine ice_reprosum_ddpdd (arr, arr_gsum, nsummands, dsummands,  &
    1314             :                                   nflds, mpi_comm                       )
    1315             : !----------------------------------------------------------------------
    1316             : 
    1317             : ! Arguments
    1318             : 
    1319             :       integer,  intent(in) :: nsummands  !< number of local summands
    1320             :       integer,  intent(in) :: dsummands  !< declared first dimension
    1321             :       integer,  intent(in) :: nflds      !< number of fields
    1322             :       real(r8), intent(in) :: arr(dsummands,nflds)
    1323             :                                          !< input array
    1324             :       integer,  intent(in) :: mpi_comm   !< MPI subcommunicator
    1325             : 
    1326             :       real(r8), intent(out):: arr_gsum(nflds)
    1327             :                                          !< global sums
    1328             : 
    1329             : 
    1330             : ! Local workspace
    1331             : 
    1332             :       integer :: old_cw                  ! for x86 processors, save
    1333             :                                          !  current arithmetic mode
    1334             :       integer :: ifld, isum              ! loop variables
    1335             :       integer :: ierr                    ! MPI error return
    1336             : 
    1337             :       real(r8)    :: e, t1, t2           ! temporaries
    1338           0 :       complex(r8) :: arr_lsum_dd(nflds)  ! local sums (in double-double
    1339             :                                          !  format)
    1340           0 :       complex(r8) :: arr_gsum_dd(nflds)  ! global sums (in double-double
    1341             :                                          !  format)
    1342             : 
    1343             :       integer, save :: mpi_sumdd
    1344             :       logical, save :: first_time = .true.
    1345             :       character(len=*),parameter :: subname = '(ice_reprosum_ddpdd)'
    1346             : 
    1347             : !-----------------------------------------------------------------------
    1348             : 
    1349           0 :       call ice_shr_reprosumx86_fix_start (old_cw)
    1350             : 
    1351           0 :       if (first_time) then
    1352             : #ifdef SERIAL_REMOVE_MPI
    1353           0 :          mpi_sumdd = 0
    1354             : #else
    1355             :          call mpi_op_create(ddpdd, .true., mpi_sumdd, ierr)
    1356             : #endif
    1357           0 :          first_time = .false.
    1358             :       endif
    1359             : 
    1360           0 :       do ifld=1,nflds
    1361           0 :          arr_lsum_dd(ifld) = (0.0_r8,0.0_r8)
    1362             : 
    1363           0 :          do isum=1,nsummands
    1364             : 
    1365             :             ! Compute arr(isum,ifld) + arr_lsum_dd(ifld) using Knuth''s
    1366             :             ! trick.
    1367           0 :             t1 = arr(isum,ifld) + real(arr_lsum_dd(ifld))
    1368           0 :             e  = t1 - arr(isum,ifld)
    1369             :             t2 = ((real(arr_lsum_dd(ifld)) - e) &
    1370             :                   + (arr(isum,ifld) - (t1 - e))) &   ! LCOV_EXCL_LINE
    1371           0 :                  + aimag(arr_lsum_dd(ifld))
    1372             : 
    1373             :             ! The result is t1 + t2, after normalization.
    1374           0 :             arr_lsum_dd(ifld) = cmplx ( t1 + t2, t2 - ((t1 + t2) - t1), r8 )
    1375             :          enddo
    1376             : 
    1377             :       enddo
    1378             : 
    1379             : #ifdef SERIAL_REMOVE_MPI
    1380           0 :       arr_gsum_dd = arr_lsum_dd
    1381             : #else
    1382             :       call mpi_allreduce (arr_lsum_dd, arr_gsum_dd, nflds, &
    1383             :                           MPI_COMPLEX16, mpi_sumdd, mpi_comm, ierr)
    1384             : #endif
    1385           0 :       do ifld=1,nflds
    1386           0 :          arr_gsum(ifld) = real(arr_gsum_dd(ifld))
    1387             :       enddo
    1388             : 
    1389           0 :       call ice_shr_reprosumx86_fix_end (old_cw)
    1390             : 
    1391           0 :    end subroutine ice_reprosum_ddpdd
    1392             : 
    1393             : !-----------------------------------------------------------------------
    1394             : 
    1395           0 :    subroutine DDPDD (dda, ddb, len, itype)
    1396             : !----------------------------------------------------------------------
    1397             : !
    1398             : ! Purpose:
    1399             : ! Modification of original codes written by David H. Bailey
    1400             : ! This subroutine computes ddb(i) = dda(i)+ddb(i)
    1401             : !
    1402             : !----------------------------------------------------------------------
    1403             : 
    1404             : ! Arguments
    1405             : 
    1406             :       integer, intent(in)        :: len       ! array length
    1407             :       complex(r8), intent(in)    :: dda(len)  ! input
    1408             :       complex(r8), intent(inout) :: ddb(len)  ! result
    1409             :       integer, intent(in)        :: itype     ! unused
    1410             : 
    1411             : ! Local workspace
    1412             : 
    1413             :       real(r8) e, t1, t2
    1414             :       integer i
    1415             :       character(len=*),parameter :: subname = '(ice_reprosum_mod:DDPDD)'
    1416             : 
    1417             : !-----------------------------------------------------------------------
    1418             : 
    1419           0 :       do i = 1, len
    1420             : !   Compute dda + ddb using Knuth's trick.
    1421           0 :          t1 = real(dda(i)) + real(ddb(i))
    1422           0 :          e  = t1 - real(dda(i))
    1423             :          t2 = ((real(ddb(i)) - e) + (real(dda(i)) - (t1 - e))) &
    1424           0 :               + aimag(dda(i)) + aimag(ddb(i))
    1425             : 
    1426             : !   The result is t1 + t2, after normalization.
    1427           0 :          ddb(i) = cmplx ( t1 + t2, t2 - ((t1 + t2) - t1), r8 )
    1428             :       enddo
    1429             : 
    1430             : 
    1431           0 :    end subroutine DDPDD
    1432             : 
    1433             : !-----------------------------------------------------------------------
    1434             : 
    1435           0 :   subroutine split_indices(total,num_pieces,ibeg,iend)
    1436             : !----------------------------------------------------------------------
    1437             : !
    1438             : ! Purpose:
    1439             : ! Split range into 'num_pieces'
    1440             : !
    1441             : !----------------------------------------------------------------------
    1442             : 
    1443             : ! Arguments
    1444             : 
    1445             :     integer, intent(in) :: total
    1446             :     integer, intent(in) :: num_pieces
    1447             :     integer, intent(out) :: ibeg(num_pieces), iend(num_pieces)
    1448             : 
    1449             : ! Local workspace
    1450             : 
    1451             :     integer :: itmp1, itmp2, ioffset, i
    1452             :     character(len=*),parameter :: subname = '(ice_reprosum_mod:split_indices)'
    1453             : 
    1454             : !-----------------------------------------------------------------------
    1455             : 
    1456           0 :     itmp1 = total/num_pieces
    1457           0 :     itmp2 = mod(total,num_pieces)
    1458           0 :     ioffset = 0
    1459           0 :     do i=1,itmp2
    1460           0 :        ibeg(i) = ioffset + 1
    1461           0 :        iend(i) = ioffset + (itmp1+1)
    1462           0 :        ioffset = iend(i)
    1463             :     enddo
    1464           0 :     do i=itmp2+1,num_pieces
    1465           0 :        ibeg(i) = ioffset + 1
    1466           0 :        if (ibeg(i) > total) then
    1467           0 :           iend(i) = ibeg(i) - 1
    1468             :        else
    1469           0 :           iend(i) = ioffset + itmp1
    1470           0 :           ioffset = iend(i)
    1471             :        endif
    1472             :     enddo
    1473             : 
    1474           0 :   end subroutine split_indices
    1475             : 
    1476             : !========================================================================
    1477             : 
    1478             : end module ice_reprosum

Generated by: LCOV version 1.14-6-g40580cd