Line data Source code
1 :
2 : !> Reproducible sum method from P. Worley
3 :
4 : MODULE ice_reprosum
5 :
6 : !-----------------------------------------------------------------------
7 : !
8 : ! Purpose:
9 : !> Compute reproducible global sums of a set of arrays across an MPI
10 : !> subcommunicator
11 : !
12 : ! Methods:
13 : !> Compute using either or both a scalable, reproducible algorithm and a
14 : !> scalable, nonreproducible algorithm:
15 : !> * Reproducible (scalable):
16 : !> Convert to fixed point (integer vector representation) to enable
17 : !> reproducibility when using MPI_Allreduce
18 : !> * Alternative usually reproducible (scalable):
19 : !> Use parallel double-double algorithm due to Helen He and
20 : !> Chris Ding, based on David Bailey's/Don Knuth's DDPDD algorithm
21 : !> * Nonreproducible (scalable):
22 : !> Floating point and MPI_Allreduce based.
23 : !> If computing both reproducible and nonreproducible sums, compare
24 : !> these and report relative difference (if absolute difference
25 : !> less than sum) or absolute difference back to calling routine.
26 : !
27 : ! Author: P. Worley (based on suggestions from J. White for fixed
28 : ! point algorithm and on He/Ding paper for ddpdd
29 : ! algorithm)
30 : !
31 : ! Modified by T.Craig for CICE, March 2019 based on the public version in
32 : ! Oasis3-MCT_4.0.
33 : !
34 : !-----------------------------------------------------------------------
35 :
36 : !-----------------------------------------------------------------------
37 : !- use statements ------------------------------------------------------
38 : !-----------------------------------------------------------------------
39 : #ifndef SERIAL_REMOVE_MPI
40 : use mpi ! MPI Fortran module
41 : #endif
42 : #if defined (NO_I8)
43 : ! Workaround for when shr_kind_i8 is not supported.
44 : use ice_kinds_mod, only: r8 => dbl_kind, i8 => int_kind
45 : #else
46 : use ice_kinds_mod, only: r8 => dbl_kind, i8 => int8_kind
47 : #endif
48 : use ice_kinds_mod, only: char_len_long
49 : use ice_fileunits, only: nu_diag
50 : use ice_exit, only: abort_ice
51 :
52 : ! internal timers not yet implemented, need to revisit if needed
53 : ! use ice_mpi, only: xicex_mpi_barrier
54 : ! use ice_timer, only: xicex_timer_start, xicex_timer_stop
55 :
56 : !-----------------------------------------------------------------------
57 : !- module boilerplate --------------------------------------------------
58 : !-----------------------------------------------------------------------
59 : implicit none
60 : private
61 :
62 : !-----------------------------------------------------------------------
63 : ! Public interfaces ----------------------------------------------------
64 : !-----------------------------------------------------------------------
65 : public :: &
66 : ice_reprosum_setopts, &! set runtime options ! LCOV_EXCL_LINE
67 : ice_reprosum_calc, &! calculate distributed sum ! LCOV_EXCL_LINE
68 : ice_reprosum_tolExceeded ! utility function to check relative
69 : ! differences against the tolerance
70 :
71 : !-----------------------------------------------------------------------
72 : ! Public data ----------------------------------------------------------
73 : !-----------------------------------------------------------------------
74 : logical, public :: ice_reprosum_recompute = .false.
75 :
76 : real(r8), public :: ice_reprosum_reldiffmax = -1.0_r8
77 :
78 : !-----------------------------------------------------------------------
79 : ! Private interfaces ---------------------------------------------------
80 : !-----------------------------------------------------------------------
81 : private :: &
82 : ddpdd, &! double-double sum routine ! LCOV_EXCL_LINE
83 : split_indices ! split indices among OMP threads
84 :
85 : !-----------------------------------------------------------------------
86 : ! Private data ---------------------------------------------------------
87 : !-----------------------------------------------------------------------
88 :
89 : logical :: repro_sum_use_ddpdd = .false.
90 : logical :: detailed_timing = .false.
91 : character(len=char_len_long) :: tmpstr
92 :
93 : CONTAINS
94 :
95 : !========================================================================
96 : !-----------------------------------------------------------------------
97 : ! Purpose:
98 : !> Set runtime options
99 : ! Author: P. Worley
100 : !-----------------------------------------------------------------------
101 :
102 0 : subroutine ice_reprosum_setopts(repro_sum_use_ddpdd_in, &
103 : repro_sum_rel_diff_max_in, & ! LCOV_EXCL_LINE
104 : repro_sum_recompute_in, & ! LCOV_EXCL_LINE
105 : repro_sum_master, & ! LCOV_EXCL_LINE
106 : repro_sum_logunit )
107 :
108 : !------------------------------Arguments--------------------------------
109 : logical, intent(in), optional :: repro_sum_use_ddpdd_in
110 : !< Use DDPDD algorithm instead of fixed precision algorithm
111 : real(r8), intent(in), optional :: repro_sum_rel_diff_max_in
112 : !< maximum permissible difference between reproducible and
113 : !< nonreproducible sums
114 : logical, intent(in), optional :: repro_sum_recompute_in
115 : !< recompute using different algorithm when difference between
116 : !< reproducible and nonreproducible sums is too great
117 : logical, intent(in), optional :: repro_sum_master
118 : !< flag indicating whether this process should output
119 : !< log messages
120 : integer, intent(in), optional :: repro_sum_logunit
121 : !< unit number for log messages
122 : !---------------------------Local Workspace-----------------------------
123 : integer llogunit ! unit number for log messages
124 : logical master ! local master?
125 : logical,save :: firstcall = .true. ! first call
126 : character(len=*),parameter :: subname = '(ice_reprosum_setopts)'
127 : !-----------------------------------------------------------------------
128 :
129 0 : if ( present(repro_sum_master) ) then
130 0 : master = repro_sum_master
131 : else
132 0 : master = .false.
133 : endif
134 :
135 0 : if ( present(repro_sum_logunit) ) then
136 0 : llogunit = repro_sum_logunit
137 : else
138 0 : llogunit = nu_diag
139 : endif
140 :
141 0 : if (.not. firstcall) then
142 0 : write(tmpstr,*) subname//' ERROR: can only be called once'
143 0 : call abort_ice(tmpstr,file=__FILE__,line=__LINE__)
144 : endif
145 0 : firstcall = .false.
146 :
147 0 : if ( present(repro_sum_use_ddpdd_in) ) then
148 0 : repro_sum_use_ddpdd = repro_sum_use_ddpdd_in
149 : endif
150 0 : if ( present(repro_sum_rel_diff_max_in) ) then
151 0 : ice_reprosum_reldiffmax = repro_sum_rel_diff_max_in
152 : endif
153 0 : if ( present(repro_sum_recompute_in) ) then
154 0 : ice_reprosum_recompute = repro_sum_recompute_in
155 : endif
156 0 : if (master) then
157 0 : if ( repro_sum_use_ddpdd ) then
158 0 : write(llogunit,*) subname, &
159 : 'Using double-double-based (scalable) usually reproducible ', & ! LCOV_EXCL_LINE
160 0 : 'distributed sum algorithm'
161 : else
162 0 : write(llogunit,*) subname, &
163 : 'Using fixed-point-based (scalable) reproducible ', & ! LCOV_EXCL_LINE
164 0 : 'distributed sum algorithm'
165 : endif
166 :
167 0 : if (ice_reprosum_reldiffmax >= 0._r8) then
168 0 : write(llogunit,*) subname, &
169 : ' with a maximum relative error tolerance of ', & ! LCOV_EXCL_LINE
170 0 : ice_reprosum_reldiffmax
171 0 : if (ice_reprosum_recompute) then
172 0 : write(llogunit,*) subname, &
173 : 'If tolerance exceeded, sum is recomputed using ', & ! LCOV_EXCL_LINE
174 0 : 'a serial algorithm.'
175 : else
176 0 : write(llogunit,*) subname, &
177 : 'If tolerance exceeded, fixed-precision is sum used ', & ! LCOV_EXCL_LINE
178 0 : 'but a warning is output.'
179 : endif
180 : else
181 0 : write(llogunit,*) subname, &
182 0 : 'and not comparing with floating point algorithms.'
183 : endif
184 :
185 : endif
186 0 : end subroutine ice_reprosum_setopts
187 :
188 : !========================================================================
189 :
190 : !
191 : ! Purpose:
192 : !> Compute the global sum of each field in "arr" using the indicated
193 : !> communicator with a reproducible yet scalable implementation based
194 : !> on a fixed point algorithm. An alternative is to use an "almost
195 : !> always reproducible" floating point algorithm.
196 : !
197 : ! The accuracy of the fixed point algorithm is controlled by the
198 : ! number of "levels" of integer expansion. The algorithm will calculate
199 : ! the number of levels that is required for the sum to be essentially
200 : ! exact. The optional parameter arr_max_levels can be used to override
201 : ! the calculated value. The optional parameter arr_max_levels_out can be
202 : ! used to return the values used.
203 : !
204 : ! The algorithm also requires an upper bound on
205 : ! the maximum summand (in absolute value) for each field, and will
206 : ! calculate this internally. However, if the optional parameters
207 : ! arr_max_levels and arr_gbl_max are both set, then the algorithm will
208 : ! use the values in arr_gbl_max for the upper bounds instead. If these
209 : ! are not upper bounds, or if the upper bounds are not tight enough
210 : ! to achieve the requisite accuracy, and if the optional parameter
211 : ! repro_sum_validate is NOT set to .false., the algorithm will repeat the
212 : ! computation with appropriate upper bounds. If only arr_gbl_max is present,
213 : ! then the maxima are computed internally (and the specified values are
214 : ! ignored). The optional parameter arr_gbl_max_out can be
215 : ! used to return the values used.
216 : !
217 : ! Finally, the algorithm requires an upper bound on the number of
218 : ! local summands across all processes. This will be calculated internally,
219 : ! using an MPI collective, but the value in the optional argument
220 : ! gbl_max_nsummands will be used instead if (1) it is present, (2)
221 : ! it is > 0, and (3) the maximum value and required number of levels
222 : ! are also specified. (If the maximum value is calculated, the same
223 : ! MPI collective is used to determine the maximum number of local
224 : ! summands.) The accuracy of the user-specified value is not checked.
225 : ! However, if set to < 1, the value will instead be calculated. If the
226 : ! optional parameter gbl_max_nsummands_out is present, then the value
227 : ! used (gbl_max_nsummands if >= 1; calculated otherwise) will be
228 : ! returned.
229 : !
230 : ! If requested (by setting ice_reprosum_reldiffmax >= 0.0 and passing in
231 : ! the optional rel_diff parameter), results are compared with a
232 : ! nonreproducible floating point algorithm.
233 : !
234 : ! Note that the cost of the algorithm is not strongly correlated with
235 : ! the number of levels, which primarily shows up as a (modest) increase
236 : ! in cost of the MPI_Allreduce as a function of vector length. Rather the
237 : ! cost is more a function of (a) the number of integers required to
238 : ! represent an individual summand and (b) the number of MPI_Allreduce
239 : ! calls. The number of integers required to represent an individual
240 : ! summand is 1 or 2 when using 8-byte integers for 8-byte real summands
241 : ! when the number of local summands is not too large. As the number of
242 : ! local summands increases, the number of integers required increases.
243 : ! The number of MPI_Allreduce calls is either 2 (specifying nothing) or
244 : ! 1 (specifying gbl_max_nsummands, arr_max_levels, and arr_gbl_max
245 : ! correctly). When specifying arr_max_levels and arr_gbl_max
246 : ! incorrectly, 3 or 4 MPI_Allreduce calls will be required.
247 : !
248 : ! The alternative algorithm is a minor modification of a parallel
249 : ! implementation of David Bailey's routine DDPDD by Helen He
250 : ! and Chris Ding. Bailey uses the Knuth trick to implement quadruple
251 : ! precision summation of double precision values with 10 double
252 : ! precision operations. The advantage of this algorithm is that
253 : ! it requires a single MPI_Allreduce and is less expensive per summand
254 : ! than is the fixed precision algorithm. The disadvantage is that it
255 : ! is not guaranteed to be reproducible (though it is reproducible
256 : ! much more often than is the standard algorithm). This alternative
257 : ! is used when the optional parameter ddpdd_sum is set to .true. It is
258 : ! also used if the fixed precision algorithm radix assumption does not
259 : ! hold.
260 :
261 : !----------------------------------------------------------------------
262 0 : subroutine ice_reprosum_calc (arr, arr_gsum, nsummands, dsummands, &
263 : nflds, ddpdd_sum, & ! LCOV_EXCL_LINE
264 : arr_gbl_max, arr_gbl_max_out, & ! LCOV_EXCL_LINE
265 : arr_max_levels, arr_max_levels_out, & ! LCOV_EXCL_LINE
266 : gbl_max_nsummands, gbl_max_nsummands_out,& ! LCOV_EXCL_LINE
267 : gbl_count, repro_sum_validate, & ! LCOV_EXCL_LINE
268 0 : repro_sum_stats, rel_diff, commid )
269 : !----------------------------------------------------------------------
270 :
271 : ! Arguments
272 :
273 : integer, intent(in) :: nsummands !< number of local summands
274 : integer, intent(in) :: dsummands !< declared first dimension
275 : integer, intent(in) :: nflds !< number of fields
276 : real(r8), intent(in) :: arr(dsummands,nflds)
277 : !< input array
278 :
279 : real(r8), intent(out):: arr_gsum(nflds)
280 : !< global means
281 :
282 : logical, intent(in), optional :: ddpdd_sum
283 : !< use ddpdd algorithm instead
284 : !< of fixed precision algorithm
285 :
286 : real(r8), intent(in), optional :: arr_gbl_max(nflds)
287 : !< upper bound on max(abs(arr))
288 :
289 : real(r8), intent(out), optional :: arr_gbl_max_out(nflds)
290 : !< calculated upper bound on
291 : !< max(abs(arr))
292 :
293 : integer, intent(in), optional :: arr_max_levels(nflds)
294 : !< maximum number of levels of
295 : !< integer expansion to use
296 :
297 : integer, intent(out), optional :: arr_max_levels_out(nflds)
298 : !< output of number of levels of
299 : !< integer expansion to used
300 :
301 : integer, intent(in), optional :: gbl_max_nsummands
302 : !< maximum of nsummand over all
303 : !< processes
304 :
305 : integer, intent(out), optional :: gbl_max_nsummands_out
306 : !< calculated maximum nsummands
307 : !< over all processes
308 :
309 : integer, intent(in), optional :: gbl_count
310 : !< was total number of summands;
311 : !< now is ignored; use
312 : !< gbl_max_nsummands instead
313 :
314 : logical, intent(in), optional :: repro_sum_validate
315 : !< flag enabling/disabling testing that gmax and max_levels are
316 : !< accurate/sufficient. Default is enabled.
317 :
318 : integer, intent(inout), optional :: repro_sum_stats(5)
319 : !< increment running totals for
320 : !< (1) one-reduction repro_sum
321 : !< (2) two-reduction repro_sum
322 : !< (3) both types in one call
323 : !< (4) nonrepro_sum
324 : !< (5) global max nsummands reduction
325 :
326 : real(r8), intent(out), optional :: rel_diff(2,nflds)
327 : !< relative and absolute
328 : !< differences between fixed
329 : !< and floating point sums
330 :
331 : integer, intent(in), optional :: commid
332 : !< MPI communicator
333 :
334 : ! Local workspace
335 :
336 : logical :: use_ddpdd_sum ! flag indicating whether to
337 : ! use ice_reprosum_ddpdd or not
338 : logical :: recompute ! flag indicating need to
339 : ! determine gmax/gmin before
340 : ! computing sum
341 : logical :: validate ! flag indicating need to
342 : ! verify gmax and max_levels
343 : ! are accurate/sufficient
344 : integer :: omp_nthreads ! number of OpenMP threads
345 : integer :: mpi_comm ! MPI subcommunicator
346 : integer :: tasks ! number of MPI processes
347 : integer :: mype ! MPI task rank
348 : integer :: ierr ! MPI error return
349 : integer :: ifld, isum, ithread ! loop variables
350 : integer :: max_nsummands ! max nsummands over all processes
351 : ! or threads (used in both ways)
352 :
353 0 : integer, allocatable :: isum_beg(:), isum_end(:)
354 : ! range of summand indices for each
355 : ! OpenMP thread
356 0 : integer, allocatable :: arr_tlmin_exp(:,:)
357 : ! per thread local exponent minima
358 0 : integer, allocatable :: arr_tlmax_exp(:,:)
359 : ! per thread local exponent maxima
360 : integer :: arr_exp, arr_exp_tlmin, arr_exp_tlmax
361 : ! summand exponent and working min/max
362 0 : integer :: arr_lmin_exp(nflds) ! local exponent minima
363 0 : integer :: arr_lmax_exp(nflds) ! local exponent maxima
364 0 : integer :: arr_lextremes(0:nflds,2)! local exponent extrema
365 0 : integer :: arr_gextremes(0:nflds,2)! global exponent extrema
366 :
367 0 : integer :: arr_gmax_exp(nflds) ! global exponents maxima
368 0 : integer :: arr_gmin_exp(nflds) ! global exponents minima
369 : integer :: arr_max_shift ! maximum safe exponent for
370 : ! value < 1 (so that sum does
371 : ! not overflow)
372 0 : integer :: max_levels(nflds) ! maximum number of levels of
373 : ! integer expansion to use
374 : integer :: max_level ! maximum value in max_levels
375 : integer :: gbl_max_red ! global max local sum reduction? (0/1)
376 : integer :: repro_sum_fast ! 1 reduction repro_sum? (0/1)
377 : integer :: repro_sum_slow ! 2 reduction repro_sum? (0/1)
378 : integer :: repro_sum_both ! both fast and slow? (0/1)
379 : integer :: nonrepro_sum ! nonrepro_sum? (0/1)
380 :
381 0 : real(r8) :: xmax_nsummands ! dble of max_nsummands
382 0 : real(r8) :: arr_lsum(nflds) ! local sums
383 0 : real(r8) :: arr_gsum_fast(nflds) ! global sum calculated using
384 : ! fast, nonreproducible,
385 : ! floating point alg.
386 0 : real(r8) :: abs_diff ! absolute difference between
387 : ! fixed and floating point
388 : ! sums
389 : #ifdef _OPENMP
390 : integer omp_get_max_threads
391 : external omp_get_max_threads
392 : #endif
393 : character(len=*),parameter :: subname = '(ice_reprosum_calc)'
394 :
395 : !-----------------------------------------------------------------------
396 :
397 : ! check whether should use ice_reprosum_ddpdd algorithm
398 0 : use_ddpdd_sum = repro_sum_use_ddpdd
399 0 : if ( present(ddpdd_sum) ) then
400 0 : use_ddpdd_sum = ddpdd_sum
401 : endif
402 :
403 : ! check whether intrinsic-based algorithm will work on this system
404 : ! (requires floating point and integer bases to be the same)
405 : ! If not, always use ddpdd.
406 0 : use_ddpdd_sum = use_ddpdd_sum .or. (radix(0._r8) /= radix(0_i8))
407 :
408 : ! initialize local statistics variables
409 0 : gbl_max_red = 0
410 0 : repro_sum_fast = 0
411 0 : repro_sum_slow = 0
412 0 : repro_sum_both = 0
413 0 : nonrepro_sum = 0
414 :
415 : ! set MPI communicator
416 0 : if ( present(commid) ) then
417 0 : mpi_comm = commid
418 : else
419 : #ifdef SERIAL_REMOVE_MPI
420 : mpi_comm = 0
421 : #else
422 0 : mpi_comm = MPI_COMM_WORLD
423 : #endif
424 : endif
425 :
426 : ! if (detailed_timing) then
427 : ! call xicex_timer_start('xicex_reprosum_prebarrier')
428 : ! call xicex_mpi_barrier(mpi_comm,subname)
429 : ! call xicex_timer_stop ('xicex_reprosum_prebarrier')
430 : ! endif
431 :
432 0 : if ( use_ddpdd_sum ) then
433 :
434 : ! if (detailed_timing) call xicex_timer_start('ice_reprosum_ddpdd')
435 :
436 : call ice_reprosum_ddpdd(arr, arr_gsum, nsummands, dsummands, &
437 0 : nflds, mpi_comm)
438 0 : repro_sum_fast = 1
439 :
440 : ! if (detailed_timing) call xicex_timer_stop('ice_reprosum_ddpdd')
441 :
442 : else
443 :
444 : ! if (detailed_timing) call xicex_timer_start('ice_reprosum_int')
445 :
446 : ! get number of MPI tasks
447 : #ifdef SERIAL_REMOVE_MPI
448 : tasks = 1
449 : mype = 0
450 : #else
451 0 : call mpi_comm_size(mpi_comm, tasks, ierr)
452 0 : call mpi_comm_rank(mpi_comm, mype, ierr)
453 : #endif
454 :
455 : ! get number of OpenMP threads
456 : #ifdef _OPENMP
457 0 : omp_nthreads = omp_get_max_threads()
458 : #else
459 0 : omp_nthreads = 1
460 : #endif
461 :
462 : ! see if have sufficient information to not require max/min allreduce
463 0 : recompute = .true.
464 0 : validate = .false.
465 0 : if ( present(arr_gbl_max) .and. present(arr_max_levels) ) then
466 0 : recompute = .false.
467 :
468 : ! setting lower bound on max_level*nflds to be 64 to improve OpenMP
469 : ! performance for loopb in ice_reprosum_int
470 0 : max_level = (64/nflds) + 1
471 0 : do ifld=1,nflds
472 0 : if ((arr_gbl_max(ifld) .ge. 0.0_r8) .and. &
473 0 : (arr_max_levels(ifld) > 0)) then
474 :
475 0 : arr_gmax_exp(ifld) = exponent(arr_gbl_max(ifld))
476 0 : if (max_level < arr_max_levels(ifld)) &
477 0 : max_level = arr_max_levels(ifld)
478 :
479 : else
480 0 : recompute = .true.
481 : endif
482 : enddo
483 :
484 0 : if (.not. recompute) then
485 :
486 : ! determine maximum number of summands in local phases of the
487 : ! algorithm
488 : ! if (detailed_timing) call xicex_timer_start("repro_sum_allr_max")
489 0 : if ( present(gbl_max_nsummands) ) then
490 0 : if (gbl_max_nsummands < 1) then
491 : #ifdef SERIAL_REMOVE_MPI
492 : max_nsummands = nsummands
493 : #else
494 : call mpi_allreduce (nsummands, max_nsummands, 1, &
495 0 : MPI_INTEGER, MPI_MAX, mpi_comm, ierr)
496 : #endif
497 0 : gbl_max_red = 1
498 : else
499 0 : max_nsummands = gbl_max_nsummands
500 : endif
501 : else
502 : #ifdef SERIAL_REMOVE_MPI
503 : max_nsummands = nsummands
504 : #else
505 : call mpi_allreduce (nsummands, max_nsummands, 1, &
506 0 : MPI_INTEGER, MPI_MAX, mpi_comm, ierr)
507 : #endif
508 0 : gbl_max_red = 1
509 : endif
510 : ! if (detailed_timing) call xicex_timer_stop("repro_sum_allr_max")
511 :
512 : ! determine maximum shift. Shift needs to be small enough that summation
513 : ! does not exceed maximum number of digits in i8.
514 :
515 : ! if requested, return max_nsummands before it is redefined
516 0 : if ( present( gbl_max_nsummands_out) ) then
517 0 : gbl_max_nsummands_out = max_nsummands
518 : endif
519 :
520 : ! summing within each thread first
521 0 : max_nsummands = (max_nsummands/omp_nthreads) + 1
522 : ! then over threads and tasks
523 0 : max_nsummands = max(max_nsummands, tasks*omp_nthreads)
524 :
525 0 : xmax_nsummands = dble(max_nsummands)
526 0 : arr_max_shift = digits(0_i8) - (exponent(xmax_nsummands) + 1)
527 0 : if (arr_max_shift < 2) then
528 0 : write(tmpstr,*) subname//' ERROR: number of summands too large for fixed precision algorithm'
529 0 : call abort_ice(tmpstr,file=__FILE__,line=__LINE__)
530 : endif
531 :
532 : ! calculate sum
533 0 : if (present(repro_sum_validate)) then
534 0 : validate = repro_sum_validate
535 : else
536 0 : validate = .true.
537 : endif
538 : call ice_reprosum_int(arr, arr_gsum, nsummands, dsummands, &
539 : nflds, arr_max_shift, arr_gmax_exp, & ! LCOV_EXCL_LINE
540 : arr_max_levels, max_level, validate, & ! LCOV_EXCL_LINE
541 0 : recompute, omp_nthreads, mpi_comm)
542 :
543 : ! record statistics, etc.
544 0 : repro_sum_fast = 1
545 0 : if (recompute) then
546 0 : repro_sum_both = 1
547 : else
548 : ! if requested, return specified levels and upper bounds on maxima
549 0 : if ( present(arr_max_levels_out) ) then
550 0 : do ifld=1,nflds
551 0 : arr_max_levels_out(ifld) = arr_max_levels(ifld)
552 : enddo
553 : endif
554 0 : if ( present(arr_gbl_max_out) ) then
555 0 : do ifld=1,nflds
556 0 : arr_gbl_max_out(ifld) = arr_gbl_max(ifld)
557 : enddo
558 : endif
559 : endif
560 : endif
561 : endif
562 :
563 : ! do not have sufficient information; calculate global max/min and
564 : ! use to compute required number of levels
565 0 : if (recompute) then
566 :
567 : ! record statistic
568 0 : repro_sum_slow = 1
569 :
570 : ! determine maximum and minimum (non-zero) summand values and
571 : ! maximum number of local summands
572 :
573 : ! allocate thread-specific work space
574 0 : allocate(arr_tlmax_exp(nflds,omp_nthreads))
575 0 : allocate(arr_tlmin_exp(nflds,omp_nthreads))
576 0 : allocate(isum_beg(omp_nthreads))
577 0 : allocate(isum_end(omp_nthreads))
578 :
579 : ! split summand index range over OpenMP threads
580 0 : call split_indices(nsummands, omp_nthreads, isum_beg, isum_end)
581 :
582 : !$omp parallel do &
583 : !$omp default(shared) & ! LCOV_EXCL_LINE
584 0 : !$omp private(ithread, ifld, isum, arr_exp, arr_exp_tlmin, arr_exp_tlmax)
585 0 : do ithread=1,omp_nthreads
586 : ! if (detailed_timing) call xicex_timer_start('repro_sum_loopa')
587 0 : do ifld=1,nflds
588 0 : arr_exp_tlmin = MAXEXPONENT(1._r8)
589 0 : arr_exp_tlmax = MINEXPONENT(1._r8)
590 0 : do isum=isum_beg(ithread),isum_end(ithread)
591 0 : if (arr(isum,ifld) .ne. 0.0_r8) then
592 0 : arr_exp = exponent(arr(isum,ifld))
593 0 : arr_exp_tlmin = min(arr_exp,arr_exp_tlmin)
594 0 : arr_exp_tlmax = max(arr_exp,arr_exp_tlmax)
595 : endif
596 : end do
597 0 : arr_tlmin_exp(ifld,ithread) = arr_exp_tlmin
598 0 : arr_tlmax_exp(ifld,ithread) = arr_exp_tlmax
599 : end do
600 : ! if (detailed_timing) call xicex_timer_stop('repro_sum_loopa')
601 : end do
602 :
603 0 : do ifld=1,nflds
604 0 : arr_lmax_exp(ifld) = maxval(arr_tlmax_exp(ifld,:))
605 0 : arr_lmin_exp(ifld) = minval(arr_tlmin_exp(ifld,:))
606 : end do
607 0 : deallocate(arr_tlmin_exp,arr_tlmax_exp,isum_beg,isum_end)
608 :
609 0 : arr_lextremes(0,:) = -nsummands
610 0 : arr_lextremes(1:nflds,1) = -arr_lmax_exp(:)
611 0 : arr_lextremes(1:nflds,2) = arr_lmin_exp(:)
612 : ! if (detailed_timing) call xicex_timer_start("repro_sum_allr_minmax")
613 : #ifdef SERIAL_REMOVE_MPI
614 : arr_gextremes = arr_lextremes
615 : #else
616 : call mpi_allreduce (arr_lextremes, arr_gextremes, 2*(nflds+1), &
617 0 : MPI_INTEGER, MPI_MIN, mpi_comm, ierr)
618 : #endif
619 : ! if (detailed_timing) call xicex_timer_stop("repro_sum_allr_minmax")
620 0 : max_nsummands = -arr_gextremes(0,1)
621 0 : arr_gmax_exp(:) = -arr_gextremes(1:nflds,1)
622 0 : arr_gmin_exp(:) = arr_gextremes(1:nflds,2)
623 :
624 : ! if a field is identically zero, arr_gmin_exp still equals MAXEXPONENT
625 : ! and arr_gmax_exp still equals MINEXPONENT. In this case, set
626 : ! arr_gmin_exp = arr_gmax_exp = MINEXPONENT
627 0 : do ifld=1,nflds
628 0 : arr_gmin_exp(ifld) = min(arr_gmax_exp(ifld),arr_gmin_exp(ifld))
629 : enddo
630 :
631 : ! if requested, return upper bounds on observed maxima
632 0 : if ( present(arr_gbl_max_out) ) then
633 0 : do ifld=1,nflds
634 0 : arr_gbl_max_out(ifld) = scale(1.0_r8,arr_gmax_exp(ifld))
635 : enddo
636 : endif
637 :
638 : ! if requested, return max_nsummands before it is redefined
639 0 : if ( present( gbl_max_nsummands_out) ) then
640 0 : gbl_max_nsummands_out = max_nsummands
641 : endif
642 :
643 : ! determine maximum shift (same as in previous branch, but with calculated
644 : ! max_nsummands). Shift needs to be small enough that summation does not
645 : ! exceed maximum number of digits in i8.
646 :
647 : ! summing within each thread first
648 0 : max_nsummands = (max_nsummands/omp_nthreads) + 1
649 : ! then over threads and tasks
650 0 : max_nsummands = max(max_nsummands, tasks*omp_nthreads)
651 :
652 0 : xmax_nsummands = dble(max_nsummands)
653 0 : arr_max_shift = digits(0_i8) - (exponent(xmax_nsummands) + 1)
654 0 : if (arr_max_shift < 2) then
655 0 : write(tmpstr,*) subname//' ERROR: number of summands too large for fixed precision algorithm'
656 0 : call abort_ice(tmpstr,file=__FILE__,line=__LINE__)
657 : endif
658 :
659 : ! determine maximum number of levels required for each field
660 : ! ((digits(0_i8) + (arr_gmax_exp(ifld)-arr_gmin_exp(ifld))) / arr_max_shift)
661 : ! + 1 because first truncation probably does not involve a maximal shift
662 : ! + 1 to guarantee that the integer division rounds up (not down)
663 : ! (setting lower bound on max_level*nflds to be 64 to improve OpenMP
664 : ! performance for loopb in ice_reprosum_int)
665 0 : max_level = (64/nflds) + 1
666 0 : do ifld=1,nflds
667 0 : max_levels(ifld) = 2 + &
668 : ((digits(0_i8) + (arr_gmax_exp(ifld)-arr_gmin_exp(ifld))) & ! LCOV_EXCL_LINE
669 0 : / arr_max_shift)
670 0 : if ( present(arr_max_levels) .and. (.not. validate) ) then
671 : ! if validate true, then computation with arr_max_levels failed
672 : ! previously
673 0 : if ( arr_max_levels(ifld) > 0 ) then
674 0 : max_levels(ifld) = &
675 0 : min(arr_max_levels(ifld),max_levels(ifld))
676 : endif
677 : endif
678 0 : if (max_level < max_levels(ifld)) &
679 0 : max_level = max_levels(ifld)
680 : enddo
681 :
682 : ! if requested, return calculated levels
683 0 : if ( present(arr_max_levels_out) ) then
684 0 : do ifld=1,nflds
685 0 : arr_max_levels_out(ifld) = max_levels(ifld)
686 : enddo
687 : endif
688 :
689 : ! calculate sum
690 0 : validate = .false.
691 : call ice_reprosum_int(arr, arr_gsum, nsummands, dsummands, nflds, &
692 : arr_max_shift, arr_gmax_exp, max_levels, & ! LCOV_EXCL_LINE
693 : max_level, validate, recompute, & ! LCOV_EXCL_LINE
694 0 : omp_nthreads, mpi_comm)
695 :
696 : endif
697 :
698 : ! if (detailed_timing) call xicex_timer_stop('ice_reprosum_int')
699 :
700 : endif
701 :
702 : ! compare fixed and floating point results
703 0 : if ( present(rel_diff) ) then
704 0 : if (ice_reprosum_reldiffmax >= 0.0_r8) then
705 :
706 : ! if (detailed_timing) then
707 : ! call xicex_timer_start('xicex_nonreprosum_prebarrier')
708 : ! call xicex_mpi_barrier(mpi_comm,subname)
709 : ! call xicex_timer_stop ('xicex_nonreprosum_prebarrier')
710 : ! endif
711 :
712 : ! if (detailed_timing) call xicex_timer_start('nonrepro_sum')
713 : ! record statistic
714 0 : nonrepro_sum = 1
715 : ! compute nonreproducible sum
716 0 : arr_lsum(:) = 0._r8
717 : !$omp parallel do &
718 : !$omp default(shared) & ! LCOV_EXCL_LINE
719 0 : !$omp private(ifld, isum)
720 0 : do ifld=1,nflds
721 0 : do isum=1,nsummands
722 0 : arr_lsum(ifld) = arr(isum,ifld) + arr_lsum(ifld)
723 : end do
724 : end do
725 :
726 : #ifdef SERIAL_REMOVE_MPI
727 : arr_gsum_fast = arr_lsum
728 : #else
729 : call mpi_allreduce (arr_lsum, arr_gsum_fast, nflds, &
730 0 : MPI_REAL8, MPI_SUM, mpi_comm, ierr)
731 : #endif
732 :
733 : ! if (detailed_timing) call xicex_timer_stop('nonrepro_sum')
734 :
735 : ! determine differences
736 : !$omp parallel do &
737 : !$omp default(shared) & ! LCOV_EXCL_LINE
738 0 : !$omp private(ifld, abs_diff)
739 0 : do ifld=1,nflds
740 0 : abs_diff = abs(arr_gsum_fast(ifld)-arr_gsum(ifld))
741 0 : if (abs(arr_gsum(ifld)) > abs_diff) then
742 0 : rel_diff(1,ifld) = abs_diff/abs(arr_gsum(ifld))
743 : else
744 0 : rel_diff(1,ifld) = abs_diff
745 : endif
746 0 : rel_diff(2,ifld) = abs_diff
747 : enddo
748 : else
749 0 : rel_diff(:,:) = 0.0_r8
750 : endif
751 : endif
752 :
753 : ! return statistics
754 0 : if ( present(repro_sum_stats) ) then
755 0 : repro_sum_stats(1) = repro_sum_stats(1) + repro_sum_fast
756 0 : repro_sum_stats(2) = repro_sum_stats(2) + repro_sum_slow
757 0 : repro_sum_stats(3) = repro_sum_stats(3) + repro_sum_both
758 0 : repro_sum_stats(4) = repro_sum_stats(4) + nonrepro_sum
759 0 : repro_sum_stats(5) = repro_sum_stats(5) + gbl_max_red
760 : endif
761 :
762 :
763 0 : end subroutine ice_reprosum_calc
764 :
765 : !========================================================================
766 : !
767 : ! Purpose:
768 : !> Compute the global sum of each field in "arr" using the indicated
769 : !> communicator with a reproducible yet scalable implementation based
770 : !> on a fixed point algorithm. The accuracy of the fixed point algorithm
771 : !> is controlled by the number of "levels" of integer expansion, the
772 : !> maximum value of which is specified by max_level.
773 : !
774 : !----------------------------------------------------------------------
775 :
776 0 : subroutine ice_reprosum_int (arr, arr_gsum, nsummands, dsummands, nflds, &
777 : arr_max_shift, arr_gmax_exp, max_levels, & ! LCOV_EXCL_LINE
778 : max_level, validate, recompute, & ! LCOV_EXCL_LINE
779 : omp_nthreads, mpi_comm )
780 :
781 : !----------------------------------------------------------------------
782 :
783 : ! Arguments
784 :
785 : integer, intent(in) :: nsummands !< number of local summands
786 : integer, intent(in) :: dsummands !< declared first dimension
787 : integer, intent(in) :: nflds !< number of fields
788 : integer, intent(in) :: arr_max_shift !< maximum safe exponent for
789 : !< value < 1 (so that sum
790 : !< does not overflow)
791 : integer, intent(in) :: arr_gmax_exp(nflds)
792 : !< exponents of global maxima
793 : integer, intent(in) :: max_levels(nflds)
794 : !< maximum number of levels
795 : !< of integer expansion
796 : integer, intent(in) :: max_level !< maximum value in
797 : !< max_levels
798 : integer, intent(in) :: omp_nthreads !< number of OpenMP threads
799 : integer, intent(in) :: mpi_comm !< MPI subcommunicator
800 :
801 : real(r8), intent(in) :: arr(dsummands,nflds)
802 : !< input array
803 :
804 : logical, intent(in):: validate
805 : !< flag indicating that accuracy of solution generated from
806 : !< arr_gmax_exp and max_levels should be tested
807 :
808 : logical, intent(out):: recompute
809 : !< flag indicating that either the upper bounds are inaccurate,
810 : !< or max_levels and arr_gmax_exp do not generate accurate
811 : !< enough sums
812 :
813 : real(r8), intent(out):: arr_gsum(nflds) !< global means
814 :
815 : ! Local workspace
816 :
817 : integer, parameter :: max_jlevel = &
818 : 1 + (digits(0_i8)/digits(0.0_r8))
819 :
820 0 : integer(i8) :: i8_arr_tlsum_level(0:max_level,nflds,omp_nthreads)
821 : ! integer vector representing local
822 : ! sum (per thread, per field)
823 0 : integer(i8) :: i8_arr_lsum_level((max_level+3)*nflds)
824 : ! integer vector representing local
825 : ! sum
826 : integer(i8) :: i8_arr_level ! integer part of summand for current
827 : ! expansion level
828 0 : integer(i8) :: i8_arr_gsum_level((max_level+3)*nflds)
829 : ! integer vector representing global
830 : ! sum
831 : integer(i8) :: IX_8 ! integer representation of current
832 : ! jlevels of X_8 ('part' of
833 : ! i8_arr_gsum_level)
834 : integer(i8) :: i8_sign ! sign global sum
835 : integer(i8) :: i8_radix ! radix for i8 variables
836 :
837 0 : integer :: max_error(nflds,omp_nthreads)
838 : ! accurate upper bound on data?
839 0 : integer :: not_exact(nflds,omp_nthreads)
840 : ! max_levels sufficient to
841 : ! capture all digits?
842 0 : integer :: isum_beg(omp_nthreads), isum_end(omp_nthreads)
843 : ! range of summand indices for each
844 : ! OpenMP thread
845 : integer :: ifld, isum, ithread
846 : ! loop variables
847 : integer :: arr_exp ! exponent of summand
848 : integer :: arr_shift ! exponent used to generate integer
849 : ! for current expansion level
850 : integer :: ilevel ! current integer expansion level
851 0 : integer :: offset(nflds) ! beginning location in
852 : ! i8_arr_{g,l}sum_level for integer
853 : ! expansion of current ifld
854 : integer :: voffset ! modification to offset used to
855 : ! include validation metrics
856 : integer :: ioffset ! offset(ifld)
857 : integer :: jlevel ! number of floating point 'pieces'
858 : ! extracted from a given i8 integer
859 : integer :: ierr ! MPI error return
860 : integer :: LX(max_jlevel) ! exponent of X_8 (see below)
861 : integer :: veclth ! total length of i8_arr_lsum_level
862 : integer :: sum_digits ! lower bound on number of significant
863 : ! in integer expansion of sum
864 : integer :: curr_exp ! exponent of partial sum during
865 : ! reconstruction from integer vector
866 : integer :: corr_exp ! exponent of current summand in
867 : ! reconstruction from integer vector
868 :
869 0 : real(r8) :: arr_frac ! fraction of summand
870 0 : real(r8) :: arr_remainder ! part of summand remaining after
871 : ! current level of integer expansion
872 0 : real(r8) :: X_8(max_jlevel) ! r8 vector representation of current
873 : ! i8_arr_gsum_level
874 0 : real(r8) :: RX_8 ! r8 representation of difference
875 : ! between current i8_arr_gsum_level
876 : ! and current jlevels of X_8
877 : ! (== IX_8). Also used in final
878 : ! scaling step
879 :
880 : logical :: first ! flag used to indicate that just
881 : ! beginning reconstruction of sum
882 : ! from integer vector
883 :
884 : character(len=*),parameter :: subname = '(ice_reprosum_int)'
885 :
886 : !-----------------------------------------------------------------------
887 : ! Save radix of i8 variables in an i8 variable
888 0 : i8_radix = radix(IX_8)
889 :
890 : ! If validating upper bounds, reserve space for validation metrics
891 : ! In both cases, reserve an extra level for overflow from the top level
892 0 : if (validate) then
893 0 : voffset = 3
894 : else
895 0 : voffset = 1
896 : endif
897 :
898 : ! compute offsets for each field
899 0 : offset(1) = voffset
900 0 : do ifld=2,nflds
901 0 : offset(ifld) = offset(ifld-1) &
902 0 : + (max_levels(ifld-1) + voffset)
903 : enddo
904 0 : veclth = offset(nflds) + max_levels(nflds)
905 :
906 : ! split summand index range over OpenMP threads
907 0 : call split_indices(nsummands, omp_nthreads, isum_beg, isum_end)
908 :
909 : ! convert local summands to vector of integers and sum
910 : ! (Using scale instead of set_exponent because arr_remainder may not be
911 : ! "normal" after level 1 calculation)
912 0 : i8_arr_lsum_level(:) = 0_i8
913 :
914 : !$omp parallel do &
915 : !$omp default(shared) & ! LCOV_EXCL_LINE
916 : !$omp private(ithread, ifld, ioffset, isum, arr_frac, arr_exp, & ! LCOV_EXCL_LINE
917 0 : !$omp arr_shift, ilevel, i8_arr_level, arr_remainder, RX_8, IX_8)
918 0 : do ithread=1,omp_nthreads
919 : ! if (detailed_timing) call xicex_timer_start('repro_sum_loopb')
920 0 : do ifld=1,nflds
921 0 : ioffset = offset(ifld)
922 :
923 0 : max_error(ifld,ithread) = 0
924 0 : not_exact(ifld,ithread) = 0
925 :
926 0 : i8_arr_tlsum_level(:,ifld,ithread) = 0_i8
927 0 : do isum=isum_beg(ithread),isum_end(ithread)
928 0 : arr_remainder = 0.0_r8
929 :
930 0 : if (arr(isum,ifld) .ne. 0.0_r8) then
931 0 : arr_exp = exponent(arr(isum,ifld))
932 0 : arr_frac = fraction(arr(isum,ifld))
933 :
934 : ! test that global maximum upper bound is an upper bound
935 0 : if (arr_exp > arr_gmax_exp(ifld)) then
936 0 : max_error(ifld,ithread) = 1
937 0 : exit
938 : endif
939 :
940 : ! calculate first shift
941 0 : arr_shift = arr_max_shift - (arr_gmax_exp(ifld)-arr_exp)
942 :
943 : ! determine first (probably) nonzero level (assuming initial fraction is
944 : ! 'normal' - algorithm still works if this is not true)
945 : ! NOTE: this is critical; scale will set to zero if min exponent is too small.
946 0 : if (arr_shift < 1) then
947 0 : ilevel = (1 + (arr_gmax_exp(ifld)-arr_exp))/arr_max_shift
948 0 : arr_shift = ilevel*arr_max_shift - (arr_gmax_exp(ifld)-arr_exp)
949 :
950 0 : do while (arr_shift < 1)
951 0 : arr_shift = arr_shift + arr_max_shift
952 0 : ilevel = ilevel + 1
953 : enddo
954 : else
955 0 : ilevel = 1
956 : endif
957 :
958 0 : if (ilevel .le. max_levels(ifld)) then
959 : ! apply first shift/truncate, add it to the relevant running
960 : ! sum, and calculate the remainder.
961 0 : arr_remainder = scale(arr_frac,arr_shift)
962 0 : i8_arr_level = int(arr_remainder,i8)
963 : i8_arr_tlsum_level(ilevel,ifld,ithread) = &
964 0 : i8_arr_tlsum_level(ilevel,ifld,ithread) + i8_arr_level
965 0 : arr_remainder = arr_remainder - i8_arr_level
966 :
967 : ! while the remainder is non-zero, continue to shift, truncate,
968 : ! sum, and calculate new remainder
969 : do while ((arr_remainder .ne. 0.0_r8) &
970 0 : .and. (ilevel < max_levels(ifld)))
971 0 : ilevel = ilevel + 1
972 0 : arr_remainder = scale(arr_remainder,arr_max_shift)
973 0 : i8_arr_level = int(arr_remainder,i8)
974 : i8_arr_tlsum_level(ilevel,ifld,ithread) = &
975 0 : i8_arr_tlsum_level(ilevel,ifld,ithread) + i8_arr_level
976 0 : arr_remainder = arr_remainder - i8_arr_level
977 : enddo
978 :
979 : endif
980 : endif
981 :
982 0 : if (arr_remainder .ne. 0.0_r8) then
983 0 : not_exact(ifld,ithread) = 1
984 : endif
985 :
986 : enddo
987 :
988 : ! postprocess integer vector to eliminate potential for overlap in the following
989 : ! sums over threads and processes: if value larger than or equal to
990 : ! (radix(IX_8)**arr_max_shift), add this 'overlap' to next larger integer in
991 : ! vector, resulting in nonoverlapping ranges for each component. Note that
992 : ! "ilevel-1==0" corresponds to an extra level used to guarantee that the sums
993 : ! over threads and processes do not overflow for ilevel==1.
994 0 : do ilevel=max_levels(ifld),1,-1
995 0 : RX_8 = i8_arr_tlsum_level(ilevel,ifld,ithread)
996 0 : IX_8 = int(scale(RX_8,-arr_max_shift),i8)
997 0 : if (IX_8 .ne. 0_i8) then
998 : i8_arr_tlsum_level(ilevel-1,ifld,ithread) = &
999 0 : i8_arr_tlsum_level(ilevel-1,ifld,ithread) + IX_8
1000 0 : IX_8 = IX_8*(i8_radix**arr_max_shift)
1001 : i8_arr_tlsum_level(ilevel,ifld,ithread) = &
1002 0 : i8_arr_tlsum_level(ilevel,ifld,ithread) - IX_8
1003 : endif
1004 : enddo
1005 : enddo
1006 : ! if (detailed_timing) call xicex_timer_stop('repro_sum_loopb')
1007 : enddo
1008 :
1009 : ! sum contributions from different threads
1010 0 : do ifld=1,nflds
1011 0 : ioffset = offset(ifld)
1012 0 : do ithread = 1,omp_nthreads
1013 0 : do ilevel = 0,max_levels(ifld)
1014 0 : i8_arr_lsum_level(ioffset+ilevel) = &
1015 : i8_arr_lsum_level(ioffset+ilevel) & ! LCOV_EXCL_LINE
1016 0 : + i8_arr_tlsum_level(ilevel,ifld,ithread)
1017 : enddo
1018 : enddo
1019 : enddo
1020 :
1021 : ! record if upper bound was inaccurate or if level expansion stopped
1022 : ! before full accuracy was achieved
1023 0 : if (validate) then
1024 0 : do ifld=1,nflds
1025 0 : ioffset = offset(ifld)
1026 0 : i8_arr_lsum_level(ioffset-voffset+1) = maxval(max_error(ifld,:))
1027 0 : i8_arr_lsum_level(ioffset-voffset+2) = maxval(not_exact(ifld,:))
1028 : enddo
1029 : endif
1030 :
1031 : ! sum integer vector element-wise
1032 : #ifdef SERIAL_REMOVE_MPI
1033 : i8_arr_gsum_level = i8_arr_lsum_level
1034 : #else
1035 : #if defined (NO_I8)
1036 : ! Workaround for when i8 is not supported.
1037 : ! if (detailed_timing) call xicex_timer_start("repro_sum_allr_i4")
1038 : call mpi_allreduce (i8_arr_lsum_level, i8_arr_gsum_level, &
1039 : veclth, MPI_INTEGER, MPI_SUM, mpi_comm, ierr)
1040 : ! if (detailed_timing) call xicex_timer_stop("repro_sum_allr_i4")
1041 : #else
1042 : ! if (detailed_timing) call xicex_timer_start("repro_sum_allr_i8")
1043 : call mpi_allreduce (i8_arr_lsum_level, i8_arr_gsum_level, &
1044 0 : veclth, MPI_INTEGER8, MPI_SUM, mpi_comm, ierr)
1045 : ! if (detailed_timing) call xicex_timer_stop("repro_sum_allr_i8")
1046 : #endif
1047 : #endif
1048 :
1049 : ! Construct global sum from integer vector representation:
1050 : ! 1) arr_max_shift is the shift applied to fraction(arr_gmax) .
1051 : ! When shifting back, need to "add back in" true arr_gmax exponent. This was
1052 : ! removed implicitly by working only with the fraction .
1053 : ! 2) want to add levels into sum in reverse order (smallest to largest). However,
1054 : ! even this can generate floating point rounding errors if signs of integers
1055 : ! alternate. To avoid this, do some arithmetic with integer vectors so that all
1056 : ! components have the same sign. This should keep relative difference between
1057 : ! using different integer sizes (e.g. i8 and i4) to machine epsilon
1058 : ! 3) assignment to X_8 will usually lose accuracy since maximum integer
1059 : ! size is greater than the max number of 'digits' in r8 value (if xmax_nsummands
1060 : ! correction not very large). Calculate remainder and add in first (since
1061 : ! smaller). One correction is sufficient for r8 (53 digits) and i8 (63 digits).
1062 : ! For r4 (24 digits) may need to correct twice. Code is written in a general
1063 : ! fashion, to work no matter how many corrections are necessary (assuming
1064 : ! max_jlevel parameter calculation is correct).
1065 :
1066 0 : recompute = .false.
1067 0 : do ifld=1,nflds
1068 0 : arr_gsum(ifld) = 0.0_r8
1069 0 : ioffset = offset(ifld)
1070 :
1071 : ! if validate is .true., test whether the summand upper bound
1072 : ! was exceeded on any of the processes
1073 0 : if (validate) then
1074 0 : if (i8_arr_gsum_level(ioffset-voffset+1) .ne. 0_i8) then
1075 0 : recompute = .true.
1076 : endif
1077 : endif
1078 :
1079 0 : if (.not. recompute) then
1080 :
1081 : ! preprocess integer vector:
1082 : ! a) if value larger than or equal to (radix(IX_8)**arr_max_shift), add this 'overlap'
1083 : ! to next larger integer in vector, resulting in nonoverlapping ranges for each
1084 : ! component. Note that have "ilevel-1=0" level here as described above.
1085 0 : do ilevel=max_levels(ifld),1,-1
1086 0 : RX_8 = i8_arr_gsum_level(ioffset+ilevel)
1087 0 : IX_8 = int(scale(RX_8,-arr_max_shift),i8)
1088 0 : if (IX_8 .ne. 0_i8) then
1089 0 : i8_arr_gsum_level(ioffset+ilevel-1) = i8_arr_gsum_level(ioffset+ilevel-1) &
1090 0 : + IX_8
1091 0 : IX_8 = IX_8*(i8_radix**arr_max_shift)
1092 0 : i8_arr_gsum_level(ioffset+ilevel) = i8_arr_gsum_level(ioffset+ilevel) &
1093 0 : - IX_8
1094 : endif
1095 : enddo
1096 : ! b) subtract +/- 1 from larger and add +/- 1 to smaller when necessary
1097 : ! so that all vector components have the same sign (eliminating loss
1098 : ! of accuracy arising from difference of large values when
1099 : ! reconstructing r8 sum from integer vector)
1100 0 : ilevel = 0
1101 0 : do while ((i8_arr_gsum_level(ioffset+ilevel) .eq. 0_i8) &
1102 0 : .and. (ilevel < max_levels(ifld)))
1103 0 : ilevel = ilevel + 1
1104 : enddo
1105 : !
1106 0 : if (ilevel < max_levels(ifld)) then
1107 0 : if (i8_arr_gsum_level(ioffset+ilevel) > 0_i8) then
1108 0 : i8_sign = 1_i8
1109 : else
1110 0 : i8_sign = -1_i8
1111 : endif
1112 0 : do jlevel=ilevel,max_levels(ifld)-1
1113 0 : if (sign(1_i8,i8_arr_gsum_level(ioffset+jlevel)) &
1114 0 : .ne. sign(1_i8,i8_arr_gsum_level(ioffset+jlevel+1))) then
1115 0 : i8_arr_gsum_level(ioffset+jlevel) = i8_arr_gsum_level(ioffset+jlevel) &
1116 0 : - i8_sign
1117 0 : i8_arr_gsum_level(ioffset+jlevel+1) = i8_arr_gsum_level(ioffset+jlevel+1) &
1118 0 : + i8_sign*(i8_radix**arr_max_shift)
1119 : endif
1120 : enddo
1121 : endif
1122 :
1123 : ! start with maximum shift, and work up to larger values
1124 0 : arr_shift = arr_gmax_exp(ifld) &
1125 0 : - max_levels(ifld)*arr_max_shift
1126 0 : curr_exp = 0
1127 0 : first = .true.
1128 0 : do ilevel=max_levels(ifld),0,-1
1129 :
1130 0 : if (i8_arr_gsum_level(ioffset+ilevel) .ne. 0_i8) then
1131 0 : jlevel = 1
1132 :
1133 : ! r8 representation of higher order bits in integer
1134 0 : X_8(jlevel) = i8_arr_gsum_level(ioffset+ilevel)
1135 0 : LX(jlevel) = exponent(X_8(jlevel))
1136 :
1137 : ! calculate remainder
1138 0 : IX_8 = int(X_8(jlevel),i8)
1139 0 : RX_8 = (i8_arr_gsum_level(ioffset+ilevel) - IX_8)
1140 :
1141 : ! repeat using remainder
1142 0 : do while (RX_8 .ne. 0.0_r8)
1143 0 : jlevel = jlevel + 1
1144 0 : X_8(jlevel) = RX_8
1145 0 : LX(jlevel) = exponent(RX_8)
1146 0 : IX_8 = IX_8 + int(RX_8,i8)
1147 0 : RX_8 = (i8_arr_gsum_level(ioffset+ilevel) - IX_8)
1148 : enddo
1149 :
1150 : ! add in contributions, smaller to larger, rescaling for each
1151 : ! addition to guarantee that exponent of working summand is always
1152 : ! larger than minexponent
1153 0 : do while (jlevel > 0)
1154 0 : if (first) then
1155 0 : curr_exp = LX(jlevel) + arr_shift
1156 0 : arr_gsum(ifld) = fraction(X_8(jlevel))
1157 0 : first = .false.
1158 : else
1159 0 : corr_exp = curr_exp - (LX(jlevel) + arr_shift)
1160 0 : arr_gsum(ifld) = fraction(X_8(jlevel)) &
1161 0 : + scale(arr_gsum(ifld),corr_exp)
1162 0 : curr_exp = LX(jlevel) + arr_shift
1163 : endif
1164 0 : jlevel = jlevel - 1
1165 : enddo
1166 :
1167 : endif
1168 :
1169 0 : arr_shift = arr_shift + arr_max_shift
1170 : enddo
1171 :
1172 : ! apply final exponent correction, scaling first if exponent is too small
1173 : ! to apply directly
1174 0 : corr_exp = curr_exp + exponent(arr_gsum(ifld))
1175 0 : if (corr_exp .ge. MINEXPONENT(1._r8)) then
1176 0 : arr_gsum(ifld) = set_exponent(arr_gsum(ifld),corr_exp)
1177 : else
1178 0 : RX_8 = set_exponent(arr_gsum(ifld), &
1179 0 : corr_exp-MINEXPONENT(1._r8))
1180 0 : arr_gsum(ifld) = scale(RX_8,MINEXPONENT(1._r8))
1181 : endif
1182 :
1183 : ! if validate is .true. and some precision lost, test whether 'too much'
1184 : ! was lost, due to too loose an upper bound, too stringent a limit on number
1185 : ! of levels of expansion, cancellation, .... Calculated by comparing lower
1186 : ! bound on number of sigificant digits with number of digits in 1.0_r8 .
1187 0 : if (validate) then
1188 0 : if (i8_arr_gsum_level(ioffset-voffset+2) .ne. 0_i8) then
1189 :
1190 : ! find first nonzero level and use exponent for this level, then assume all
1191 : ! subsequent levels contribute arr_max_shift digits.
1192 0 : sum_digits = 0
1193 0 : do ilevel=0,max_levels(ifld)
1194 0 : if (sum_digits .eq. 0) then
1195 0 : if (i8_arr_gsum_level(ioffset+ilevel) .ne. 0_i8) then
1196 0 : X_8(1) = i8_arr_gsum_level(ioffset+ilevel)
1197 0 : LX(1) = exponent(X_8(1))
1198 0 : sum_digits = LX(1)
1199 : endif
1200 : else
1201 0 : sum_digits = sum_digits + arr_max_shift
1202 : endif
1203 : enddo
1204 :
1205 0 : if (sum_digits < digits(1.0_r8)) then
1206 0 : recompute = .true.
1207 : endif
1208 : endif
1209 : endif
1210 :
1211 : endif
1212 :
1213 : enddo
1214 :
1215 :
1216 0 : end subroutine ice_reprosum_int
1217 :
1218 : !========================================================================
1219 : !
1220 : ! Purpose:
1221 : !> Test whether distributed sum exceeds tolerance and print out a
1222 : !> warning message.
1223 : !
1224 : !----------------------------------------------------------------------
1225 :
1226 0 : logical function ice_reprosum_tolExceeded (name, nflds, master, &
1227 0 : logunit, rel_diff )
1228 : !----------------------------------------------------------------------
1229 :
1230 : ! Arguments
1231 :
1232 : character(len=*), intent(in) :: name !< distributed sum identifier
1233 : integer, intent(in) :: nflds !< number of fields
1234 : logical, intent(in) :: master !< process that will write
1235 : !< warning messages?
1236 : integer, optional, intent(in) :: logunit!< unit warning messages
1237 : !< written to
1238 : real(r8), intent(in) :: rel_diff(2,nflds)
1239 : !< relative and absolute
1240 : !< differences between fixed
1241 : !< and floating point sums
1242 :
1243 : ! Local workspace
1244 :
1245 : integer :: llogunit ! local log unit
1246 : integer :: ifld ! field index
1247 : integer :: exceeds_limit ! number of fields whose
1248 : ! sum exceeds tolerance
1249 0 : real(r8) :: max_rel_diff ! maximum relative difference
1250 : integer :: max_rel_diff_idx ! field index for max. rel. diff.
1251 0 : real(r8) :: max_abs_diff ! maximum absolute difference
1252 : integer :: max_abs_diff_idx ! field index for max. abs. diff.
1253 : character(len=*),parameter :: subname = '(ice_reprosum_tolExceeded)'
1254 :
1255 : !-----------------------------------------------------------------------
1256 :
1257 0 : ice_reprosum_tolExceeded = .false.
1258 0 : if (ice_reprosum_reldiffmax < 0.0_r8) return
1259 :
1260 0 : if ( present(logunit) ) then
1261 0 : llogunit = logunit
1262 : else
1263 0 : llogunit = nu_diag
1264 : endif
1265 :
1266 : ! check that "fast" reproducible sum is accurate enough.
1267 0 : exceeds_limit = 0
1268 0 : max_rel_diff = 0.0_r8
1269 0 : max_abs_diff = 0.0_r8
1270 0 : do ifld=1,nflds
1271 0 : if (rel_diff(1,ifld) > ice_reprosum_reldiffmax) then
1272 0 : exceeds_limit = exceeds_limit + 1
1273 0 : if (rel_diff(1,ifld) > max_rel_diff) then
1274 0 : max_rel_diff = rel_diff(1,ifld)
1275 0 : max_rel_diff_idx = ifld
1276 : endif
1277 0 : if (rel_diff(2,ifld) > max_abs_diff) then
1278 0 : max_abs_diff = rel_diff(2,ifld)
1279 0 : max_abs_diff_idx = ifld
1280 : endif
1281 : endif
1282 : enddo
1283 :
1284 0 : if (exceeds_limit > 0) then
1285 0 : if (master) then
1286 0 : write(llogunit,*) subname,trim(name), &
1287 : ': difference in fixed and floating point sums ', & ! LCOV_EXCL_LINE
1288 : ' exceeds tolerance in ', exceeds_limit, & ! LCOV_EXCL_LINE
1289 0 : ' fields.'
1290 0 : write(llogunit,*) subname,' Maximum relative diff: (rel)', &
1291 : rel_diff(1,max_rel_diff_idx), ' (abs) ', & ! LCOV_EXCL_LINE
1292 0 : rel_diff(2,max_rel_diff_idx)
1293 0 : write(llogunit,*) subname,' Maximum absolute diff: (rel)', &
1294 : rel_diff(1,max_abs_diff_idx), ' (abs) ', & ! LCOV_EXCL_LINE
1295 0 : rel_diff(2,max_abs_diff_idx)
1296 : endif
1297 0 : ice_reprosum_tolExceeded = .true.
1298 : endif
1299 :
1300 :
1301 0 : end function ice_reprosum_tolExceeded
1302 :
1303 : !========================================================================
1304 : !
1305 : ! Purpose:
1306 : !> Compute the global sum of each field in "arr" using the indicated
1307 : !> communicator with a reproducible yet scalable implementation based
1308 : !> on He and Ding's implementation of the double-double algorithm.
1309 : !
1310 : !----------------------------------------------------------------------
1311 :
1312 0 : subroutine ice_reprosum_ddpdd (arr, arr_gsum, nsummands, dsummands, &
1313 : nflds, mpi_comm )
1314 : !----------------------------------------------------------------------
1315 :
1316 : ! Arguments
1317 :
1318 : integer, intent(in) :: nsummands !< number of local summands
1319 : integer, intent(in) :: dsummands !< declared first dimension
1320 : integer, intent(in) :: nflds !< number of fields
1321 : real(r8), intent(in) :: arr(dsummands,nflds)
1322 : !< input array
1323 : integer, intent(in) :: mpi_comm !< MPI subcommunicator
1324 :
1325 : real(r8), intent(out):: arr_gsum(nflds)
1326 : !< global sums
1327 :
1328 :
1329 : ! Local workspace
1330 :
1331 : integer :: old_cw ! for x86 processors, save
1332 : ! current arithmetic mode
1333 : integer :: ifld, isum ! loop variables
1334 : integer :: ierr ! MPI error return
1335 :
1336 0 : real(r8) :: e, t1, t2 ! temporaries
1337 0 : complex(r8) :: arr_lsum_dd(nflds) ! local sums (in double-double
1338 : ! format)
1339 0 : complex(r8) :: arr_gsum_dd(nflds) ! global sums (in double-double
1340 : ! format)
1341 :
1342 : integer, save :: mpi_sumdd
1343 : logical, save :: first_time = .true.
1344 : character(len=*),parameter :: subname = '(ice_reprosum_ddpdd)'
1345 :
1346 : !-----------------------------------------------------------------------
1347 :
1348 0 : call ice_shr_reprosumx86_fix_start (old_cw)
1349 :
1350 0 : if (first_time) then
1351 : #ifdef SERIAL_REMOVE_MPI
1352 : mpi_sumdd = 0
1353 : #else
1354 0 : call mpi_op_create(ddpdd, .true., mpi_sumdd, ierr)
1355 : #endif
1356 0 : first_time = .false.
1357 : endif
1358 :
1359 0 : do ifld=1,nflds
1360 0 : arr_lsum_dd(ifld) = (0.0_r8,0.0_r8)
1361 :
1362 0 : do isum=1,nsummands
1363 :
1364 : ! Compute arr(isum,ifld) + arr_lsum_dd(ifld) using Knuth''s
1365 : ! trick.
1366 0 : t1 = arr(isum,ifld) + real(arr_lsum_dd(ifld))
1367 0 : e = t1 - arr(isum,ifld)
1368 0 : t2 = ((real(arr_lsum_dd(ifld)) - e) &
1369 : + (arr(isum,ifld) - (t1 - e))) & ! LCOV_EXCL_LINE
1370 0 : + aimag(arr_lsum_dd(ifld))
1371 :
1372 : ! The result is t1 + t2, after normalization.
1373 0 : arr_lsum_dd(ifld) = cmplx ( t1 + t2, t2 - ((t1 + t2) - t1), r8 )
1374 : enddo
1375 :
1376 : enddo
1377 :
1378 : #ifdef SERIAL_REMOVE_MPI
1379 : arr_gsum_dd = arr_lsum_dd
1380 : #else
1381 : call mpi_allreduce (arr_lsum_dd, arr_gsum_dd, nflds, &
1382 0 : MPI_COMPLEX16, mpi_sumdd, mpi_comm, ierr)
1383 : #endif
1384 0 : do ifld=1,nflds
1385 0 : arr_gsum(ifld) = real(arr_gsum_dd(ifld))
1386 : enddo
1387 :
1388 0 : call ice_shr_reprosumx86_fix_end (old_cw)
1389 :
1390 0 : end subroutine ice_reprosum_ddpdd
1391 :
1392 : !-----------------------------------------------------------------------
1393 :
1394 0 : subroutine DDPDD (dda, ddb, len, itype)
1395 : !----------------------------------------------------------------------
1396 : !
1397 : ! Purpose:
1398 : ! Modification of original codes written by David H. Bailey
1399 : ! This subroutine computes ddb(i) = dda(i)+ddb(i)
1400 : !
1401 : !----------------------------------------------------------------------
1402 :
1403 : ! Arguments
1404 :
1405 : integer, intent(in) :: len ! array length
1406 : complex(r8), intent(in) :: dda(len) ! input
1407 : complex(r8), intent(inout) :: ddb(len) ! result
1408 : integer, intent(in) :: itype ! unused
1409 :
1410 : ! Local workspace
1411 :
1412 0 : real(r8) e, t1, t2
1413 : integer i
1414 : character(len=*),parameter :: subname = '(ice_reprosum_mod:DDPDD)'
1415 :
1416 : !-----------------------------------------------------------------------
1417 :
1418 0 : do i = 1, len
1419 : ! Compute dda + ddb using Knuth's trick.
1420 0 : t1 = real(dda(i)) + real(ddb(i))
1421 0 : e = t1 - real(dda(i))
1422 0 : t2 = ((real(ddb(i)) - e) + (real(dda(i)) - (t1 - e))) &
1423 0 : + aimag(dda(i)) + aimag(ddb(i))
1424 :
1425 : ! The result is t1 + t2, after normalization.
1426 0 : ddb(i) = cmplx ( t1 + t2, t2 - ((t1 + t2) - t1), r8 )
1427 : enddo
1428 :
1429 :
1430 0 : end subroutine DDPDD
1431 :
1432 : !-----------------------------------------------------------------------
1433 :
1434 0 : subroutine split_indices(total,num_pieces,ibeg,iend)
1435 : !----------------------------------------------------------------------
1436 : !
1437 : ! Purpose:
1438 : ! Split range into 'num_pieces'
1439 : !
1440 : !----------------------------------------------------------------------
1441 :
1442 : ! Arguments
1443 :
1444 : integer, intent(in) :: total
1445 : integer, intent(in) :: num_pieces
1446 : integer, intent(out) :: ibeg(num_pieces), iend(num_pieces)
1447 :
1448 : ! Local workspace
1449 :
1450 : integer :: itmp1, itmp2, ioffset, i
1451 : character(len=*),parameter :: subname = '(ice_reprosum_mod:split_indices)'
1452 :
1453 : !-----------------------------------------------------------------------
1454 :
1455 0 : itmp1 = total/num_pieces
1456 0 : itmp2 = mod(total,num_pieces)
1457 0 : ioffset = 0
1458 0 : do i=1,itmp2
1459 0 : ibeg(i) = ioffset + 1
1460 0 : iend(i) = ioffset + (itmp1+1)
1461 0 : ioffset = iend(i)
1462 : enddo
1463 0 : do i=itmp2+1,num_pieces
1464 0 : ibeg(i) = ioffset + 1
1465 0 : if (ibeg(i) > total) then
1466 0 : iend(i) = ibeg(i) - 1
1467 : else
1468 0 : iend(i) = ioffset + itmp1
1469 0 : ioffset = iend(i)
1470 : endif
1471 : enddo
1472 :
1473 0 : end subroutine split_indices
1474 :
1475 : !========================================================================
1476 :
1477 : end module ice_reprosum
|