Line data Source code
1 :
2 : !> Reproducible sum method from P. Worley
3 : #define SERIAL_REMOVE_MPI
4 :
5 : MODULE ice_reprosum
6 :
7 : !-----------------------------------------------------------------------
8 : !
9 : ! Purpose:
10 : !> Compute reproducible global sums of a set of arrays across an MPI
11 : !> subcommunicator
12 : !
13 : ! Methods:
14 : !> Compute using either or both a scalable, reproducible algorithm and a
15 : !> scalable, nonreproducible algorithm:
16 : !> * Reproducible (scalable):
17 : !> Convert to fixed point (integer vector representation) to enable
18 : !> reproducibility when using MPI_Allreduce
19 : !> * Alternative usually reproducible (scalable):
20 : !> Use parallel double-double algorithm due to Helen He and
21 : !> Chris Ding, based on David Bailey's/Don Knuth's DDPDD algorithm
22 : !> * Nonreproducible (scalable):
23 : !> Floating point and MPI_Allreduce based.
24 : !> If computing both reproducible and nonreproducible sums, compare
25 : !> these and report relative difference (if absolute difference
26 : !> less than sum) or absolute difference back to calling routine.
27 : !
28 : ! Author: P. Worley (based on suggestions from J. White for fixed
29 : ! point algorithm and on He/Ding paper for ddpdd
30 : ! algorithm)
31 : !
32 : ! Modified by T.Craig for CICE, March 2019 based on the public version in
33 : ! Oasis3-MCT_4.0.
34 : !
35 : !-----------------------------------------------------------------------
36 :
37 : !-----------------------------------------------------------------------
38 : !- use statements ------------------------------------------------------
39 : !-----------------------------------------------------------------------
40 : #ifndef SERIAL_REMOVE_MPI
41 : use mpi ! MPI Fortran module
42 : #endif
43 : #if defined (NO_I8)
44 : ! Workaround for when shr_kind_i8 is not supported.
45 : use ice_kinds_mod, only: r8 => dbl_kind, i8 => int_kind
46 : #else
47 : use ice_kinds_mod, only: r8 => dbl_kind, i8 => int8_kind
48 : #endif
49 : use ice_kinds_mod, only: char_len_long
50 : use ice_fileunits, only: nu_diag
51 : use ice_exit, only: abort_ice
52 :
53 : ! internal timers not yet implemented, need to revisit if needed
54 : ! use ice_mpi, only: xicex_mpi_barrier
55 : ! use ice_timer, only: xicex_timer_start, xicex_timer_stop
56 :
57 : !-----------------------------------------------------------------------
58 : !- module boilerplate --------------------------------------------------
59 : !-----------------------------------------------------------------------
60 : implicit none
61 : private
62 :
63 : !-----------------------------------------------------------------------
64 : ! Public interfaces ----------------------------------------------------
65 : !-----------------------------------------------------------------------
66 : public :: &
67 : ice_reprosum_setopts, &! set runtime options ! LCOV_EXCL_LINE
68 : ice_reprosum_calc, &! calculate distributed sum ! LCOV_EXCL_LINE
69 : ice_reprosum_tolExceeded ! utility function to check relative
70 : ! differences against the tolerance
71 :
72 : !-----------------------------------------------------------------------
73 : ! Public data ----------------------------------------------------------
74 : !-----------------------------------------------------------------------
75 : logical, public :: ice_reprosum_recompute = .false.
76 :
77 : real(r8), public :: ice_reprosum_reldiffmax = -1.0_r8
78 :
79 : !-----------------------------------------------------------------------
80 : ! Private interfaces ---------------------------------------------------
81 : !-----------------------------------------------------------------------
82 : private :: &
83 : ddpdd, &! double-double sum routine ! LCOV_EXCL_LINE
84 : split_indices ! split indices among OMP threads
85 :
86 : !-----------------------------------------------------------------------
87 : ! Private data ---------------------------------------------------------
88 : !-----------------------------------------------------------------------
89 :
90 : logical :: repro_sum_use_ddpdd = .false.
91 : logical :: detailed_timing = .false.
92 : character(len=char_len_long) :: tmpstr
93 :
94 : CONTAINS
95 :
96 : !========================================================================
97 : !-----------------------------------------------------------------------
98 : ! Purpose:
99 : !> Set runtime options
100 : ! Author: P. Worley
101 : !-----------------------------------------------------------------------
102 :
103 0 : subroutine ice_reprosum_setopts(repro_sum_use_ddpdd_in, &
104 : repro_sum_rel_diff_max_in, & ! LCOV_EXCL_LINE
105 : repro_sum_recompute_in, & ! LCOV_EXCL_LINE
106 : repro_sum_master, & ! LCOV_EXCL_LINE
107 : repro_sum_logunit )
108 :
109 : !------------------------------Arguments--------------------------------
110 : logical, intent(in), optional :: repro_sum_use_ddpdd_in
111 : !< Use DDPDD algorithm instead of fixed precision algorithm
112 : real(r8), intent(in), optional :: repro_sum_rel_diff_max_in
113 : !< maximum permissible difference between reproducible and
114 : !< nonreproducible sums
115 : logical, intent(in), optional :: repro_sum_recompute_in
116 : !< recompute using different algorithm when difference between
117 : !< reproducible and nonreproducible sums is too great
118 : logical, intent(in), optional :: repro_sum_master
119 : !< flag indicating whether this process should output
120 : !< log messages
121 : integer, intent(in), optional :: repro_sum_logunit
122 : !< unit number for log messages
123 : !---------------------------Local Workspace-----------------------------
124 : integer llogunit ! unit number for log messages
125 : logical master ! local master?
126 : logical,save :: firstcall = .true. ! first call
127 : character(len=*),parameter :: subname = '(ice_reprosum_setopts)'
128 : !-----------------------------------------------------------------------
129 :
130 0 : if ( present(repro_sum_master) ) then
131 0 : master = repro_sum_master
132 : else
133 0 : master = .false.
134 : endif
135 :
136 0 : if ( present(repro_sum_logunit) ) then
137 0 : llogunit = repro_sum_logunit
138 : else
139 0 : llogunit = nu_diag
140 : endif
141 :
142 0 : if (.not. firstcall) then
143 0 : write(tmpstr,*) subname//' ERROR: can only be called once'
144 0 : call abort_ice(tmpstr,file=__FILE__,line=__LINE__)
145 : endif
146 0 : firstcall = .false.
147 :
148 0 : if ( present(repro_sum_use_ddpdd_in) ) then
149 0 : repro_sum_use_ddpdd = repro_sum_use_ddpdd_in
150 : endif
151 0 : if ( present(repro_sum_rel_diff_max_in) ) then
152 0 : ice_reprosum_reldiffmax = repro_sum_rel_diff_max_in
153 : endif
154 0 : if ( present(repro_sum_recompute_in) ) then
155 0 : ice_reprosum_recompute = repro_sum_recompute_in
156 : endif
157 0 : if (master) then
158 0 : if ( repro_sum_use_ddpdd ) then
159 0 : write(llogunit,*) subname, &
160 : 'Using double-double-based (scalable) usually reproducible ', & ! LCOV_EXCL_LINE
161 0 : 'distributed sum algorithm'
162 : else
163 0 : write(llogunit,*) subname, &
164 : 'Using fixed-point-based (scalable) reproducible ', & ! LCOV_EXCL_LINE
165 0 : 'distributed sum algorithm'
166 : endif
167 :
168 0 : if (ice_reprosum_reldiffmax >= 0._r8) then
169 0 : write(llogunit,*) subname, &
170 : ' with a maximum relative error tolerance of ', & ! LCOV_EXCL_LINE
171 0 : ice_reprosum_reldiffmax
172 0 : if (ice_reprosum_recompute) then
173 0 : write(llogunit,*) subname, &
174 : 'If tolerance exceeded, sum is recomputed using ', & ! LCOV_EXCL_LINE
175 0 : 'a serial algorithm.'
176 : else
177 0 : write(llogunit,*) subname, &
178 : 'If tolerance exceeded, fixed-precision is sum used ', & ! LCOV_EXCL_LINE
179 0 : 'but a warning is output.'
180 : endif
181 : else
182 0 : write(llogunit,*) subname, &
183 0 : 'and not comparing with floating point algorithms.'
184 : endif
185 :
186 : endif
187 0 : end subroutine ice_reprosum_setopts
188 :
189 : !========================================================================
190 :
191 : !
192 : ! Purpose:
193 : !> Compute the global sum of each field in "arr" using the indicated
194 : !> communicator with a reproducible yet scalable implementation based
195 : !> on a fixed point algorithm. An alternative is to use an "almost
196 : !> always reproducible" floating point algorithm.
197 : !
198 : ! The accuracy of the fixed point algorithm is controlled by the
199 : ! number of "levels" of integer expansion. The algorithm will calculate
200 : ! the number of levels that is required for the sum to be essentially
201 : ! exact. The optional parameter arr_max_levels can be used to override
202 : ! the calculated value. The optional parameter arr_max_levels_out can be
203 : ! used to return the values used.
204 : !
205 : ! The algorithm also requires an upper bound on
206 : ! the maximum summand (in absolute value) for each field, and will
207 : ! calculate this internally. However, if the optional parameters
208 : ! arr_max_levels and arr_gbl_max are both set, then the algorithm will
209 : ! use the values in arr_gbl_max for the upper bounds instead. If these
210 : ! are not upper bounds, or if the upper bounds are not tight enough
211 : ! to achieve the requisite accuracy, and if the optional parameter
212 : ! repro_sum_validate is NOT set to .false., the algorithm will repeat the
213 : ! computation with appropriate upper bounds. If only arr_gbl_max is present,
214 : ! then the maxima are computed internally (and the specified values are
215 : ! ignored). The optional parameter arr_gbl_max_out can be
216 : ! used to return the values used.
217 : !
218 : ! Finally, the algorithm requires an upper bound on the number of
219 : ! local summands across all processes. This will be calculated internally,
220 : ! using an MPI collective, but the value in the optional argument
221 : ! gbl_max_nsummands will be used instead if (1) it is present, (2)
222 : ! it is > 0, and (3) the maximum value and required number of levels
223 : ! are also specified. (If the maximum value is calculated, the same
224 : ! MPI collective is used to determine the maximum number of local
225 : ! summands.) The accuracy of the user-specified value is not checked.
226 : ! However, if set to < 1, the value will instead be calculated. If the
227 : ! optional parameter gbl_max_nsummands_out is present, then the value
228 : ! used (gbl_max_nsummands if >= 1; calculated otherwise) will be
229 : ! returned.
230 : !
231 : ! If requested (by setting ice_reprosum_reldiffmax >= 0.0 and passing in
232 : ! the optional rel_diff parameter), results are compared with a
233 : ! nonreproducible floating point algorithm.
234 : !
235 : ! Note that the cost of the algorithm is not strongly correlated with
236 : ! the number of levels, which primarily shows up as a (modest) increase
237 : ! in cost of the MPI_Allreduce as a function of vector length. Rather the
238 : ! cost is more a function of (a) the number of integers required to
239 : ! represent an individual summand and (b) the number of MPI_Allreduce
240 : ! calls. The number of integers required to represent an individual
241 : ! summand is 1 or 2 when using 8-byte integers for 8-byte real summands
242 : ! when the number of local summands is not too large. As the number of
243 : ! local summands increases, the number of integers required increases.
244 : ! The number of MPI_Allreduce calls is either 2 (specifying nothing) or
245 : ! 1 (specifying gbl_max_nsummands, arr_max_levels, and arr_gbl_max
246 : ! correctly). When specifying arr_max_levels and arr_gbl_max
247 : ! incorrectly, 3 or 4 MPI_Allreduce calls will be required.
248 : !
249 : ! The alternative algorithm is a minor modification of a parallel
250 : ! implementation of David Bailey's routine DDPDD by Helen He
251 : ! and Chris Ding. Bailey uses the Knuth trick to implement quadruple
252 : ! precision summation of double precision values with 10 double
253 : ! precision operations. The advantage of this algorithm is that
254 : ! it requires a single MPI_Allreduce and is less expensive per summand
255 : ! than is the fixed precision algorithm. The disadvantage is that it
256 : ! is not guaranteed to be reproducible (though it is reproducible
257 : ! much more often than is the standard algorithm). This alternative
258 : ! is used when the optional parameter ddpdd_sum is set to .true. It is
259 : ! also used if the fixed precision algorithm radix assumption does not
260 : ! hold.
261 :
262 : !----------------------------------------------------------------------
263 0 : subroutine ice_reprosum_calc (arr, arr_gsum, nsummands, dsummands, &
264 : nflds, ddpdd_sum, & ! LCOV_EXCL_LINE
265 : arr_gbl_max, arr_gbl_max_out, & ! LCOV_EXCL_LINE
266 : arr_max_levels, arr_max_levels_out, & ! LCOV_EXCL_LINE
267 : gbl_max_nsummands, gbl_max_nsummands_out,& ! LCOV_EXCL_LINE
268 : gbl_count, repro_sum_validate, & ! LCOV_EXCL_LINE
269 0 : repro_sum_stats, rel_diff, commid )
270 : !----------------------------------------------------------------------
271 :
272 : ! Arguments
273 :
274 : integer, intent(in) :: nsummands !< number of local summands
275 : integer, intent(in) :: dsummands !< declared first dimension
276 : integer, intent(in) :: nflds !< number of fields
277 : real(r8), intent(in) :: arr(dsummands,nflds)
278 : !< input array
279 :
280 : real(r8), intent(out):: arr_gsum(nflds)
281 : !< global means
282 :
283 : logical, intent(in), optional :: ddpdd_sum
284 : !< use ddpdd algorithm instead
285 : !< of fixed precision algorithm
286 :
287 : real(r8), intent(in), optional :: arr_gbl_max(nflds)
288 : !< upper bound on max(abs(arr))
289 :
290 : real(r8), intent(out), optional :: arr_gbl_max_out(nflds)
291 : !< calculated upper bound on
292 : !< max(abs(arr))
293 :
294 : integer, intent(in), optional :: arr_max_levels(nflds)
295 : !< maximum number of levels of
296 : !< integer expansion to use
297 :
298 : integer, intent(out), optional :: arr_max_levels_out(nflds)
299 : !< output of number of levels of
300 : !< integer expansion to used
301 :
302 : integer, intent(in), optional :: gbl_max_nsummands
303 : !< maximum of nsummand over all
304 : !< processes
305 :
306 : integer, intent(out), optional :: gbl_max_nsummands_out
307 : !< calculated maximum nsummands
308 : !< over all processes
309 :
310 : integer, intent(in), optional :: gbl_count
311 : !< was total number of summands;
312 : !< now is ignored; use
313 : !< gbl_max_nsummands instead
314 :
315 : logical, intent(in), optional :: repro_sum_validate
316 : !< flag enabling/disabling testing that gmax and max_levels are
317 : !< accurate/sufficient. Default is enabled.
318 :
319 : integer, intent(inout), optional :: repro_sum_stats(5)
320 : !< increment running totals for
321 : !< (1) one-reduction repro_sum
322 : !< (2) two-reduction repro_sum
323 : !< (3) both types in one call
324 : !< (4) nonrepro_sum
325 : !< (5) global max nsummands reduction
326 :
327 : real(r8), intent(out), optional :: rel_diff(2,nflds)
328 : !< relative and absolute
329 : !< differences between fixed
330 : !< and floating point sums
331 :
332 : integer, intent(in), optional :: commid
333 : !< MPI communicator
334 :
335 : ! Local workspace
336 :
337 : logical :: use_ddpdd_sum ! flag indicating whether to
338 : ! use ice_reprosum_ddpdd or not
339 : logical :: recompute ! flag indicating need to
340 : ! determine gmax/gmin before
341 : ! computing sum
342 : logical :: validate ! flag indicating need to
343 : ! verify gmax and max_levels
344 : ! are accurate/sufficient
345 : integer :: omp_nthreads ! number of OpenMP threads
346 : integer :: mpi_comm ! MPI subcommunicator
347 : integer :: tasks ! number of MPI processes
348 : integer :: mype ! MPI task rank
349 : integer :: ierr ! MPI error return
350 : integer :: ifld, isum, ithread ! loop variables
351 : integer :: max_nsummands ! max nsummands over all processes
352 : ! or threads (used in both ways)
353 :
354 0 : integer, allocatable :: isum_beg(:), isum_end(:)
355 : ! range of summand indices for each
356 : ! OpenMP thread
357 0 : integer, allocatable :: arr_tlmin_exp(:,:)
358 : ! per thread local exponent minima
359 0 : integer, allocatable :: arr_tlmax_exp(:,:)
360 : ! per thread local exponent maxima
361 : integer :: arr_exp, arr_exp_tlmin, arr_exp_tlmax
362 : ! summand exponent and working min/max
363 0 : integer :: arr_lmin_exp(nflds) ! local exponent minima
364 0 : integer :: arr_lmax_exp(nflds) ! local exponent maxima
365 0 : integer :: arr_lextremes(0:nflds,2)! local exponent extrema
366 0 : integer :: arr_gextremes(0:nflds,2)! global exponent extrema
367 :
368 0 : integer :: arr_gmax_exp(nflds) ! global exponents maxima
369 0 : integer :: arr_gmin_exp(nflds) ! global exponents minima
370 : integer :: arr_max_shift ! maximum safe exponent for
371 : ! value < 1 (so that sum does
372 : ! not overflow)
373 0 : integer :: max_levels(nflds) ! maximum number of levels of
374 : ! integer expansion to use
375 : integer :: max_level ! maximum value in max_levels
376 : integer :: gbl_max_red ! global max local sum reduction? (0/1)
377 : integer :: repro_sum_fast ! 1 reduction repro_sum? (0/1)
378 : integer :: repro_sum_slow ! 2 reduction repro_sum? (0/1)
379 : integer :: repro_sum_both ! both fast and slow? (0/1)
380 : integer :: nonrepro_sum ! nonrepro_sum? (0/1)
381 :
382 : real(r8) :: xmax_nsummands ! dble of max_nsummands
383 0 : real(r8) :: arr_lsum(nflds) ! local sums
384 0 : real(r8) :: arr_gsum_fast(nflds) ! global sum calculated using
385 : ! fast, nonreproducible,
386 : ! floating point alg.
387 : real(r8) :: abs_diff ! absolute difference between
388 : ! fixed and floating point
389 : ! sums
390 : #ifdef _OPENMP
391 : integer omp_get_max_threads
392 : external omp_get_max_threads
393 : #endif
394 : character(len=*),parameter :: subname = '(ice_reprosum_calc)'
395 :
396 : !-----------------------------------------------------------------------
397 :
398 : ! check whether should use ice_reprosum_ddpdd algorithm
399 0 : use_ddpdd_sum = repro_sum_use_ddpdd
400 0 : if ( present(ddpdd_sum) ) then
401 0 : use_ddpdd_sum = ddpdd_sum
402 : endif
403 :
404 : ! check whether intrinsic-based algorithm will work on this system
405 : ! (requires floating point and integer bases to be the same)
406 : ! If not, always use ddpdd.
407 0 : use_ddpdd_sum = use_ddpdd_sum .or. (radix(0._r8) /= radix(0_i8))
408 :
409 : ! initialize local statistics variables
410 0 : gbl_max_red = 0
411 0 : repro_sum_fast = 0
412 0 : repro_sum_slow = 0
413 0 : repro_sum_both = 0
414 0 : nonrepro_sum = 0
415 :
416 : ! set MPI communicator
417 0 : if ( present(commid) ) then
418 0 : mpi_comm = commid
419 : else
420 : #ifdef SERIAL_REMOVE_MPI
421 0 : mpi_comm = 0
422 : #else
423 : mpi_comm = MPI_COMM_WORLD
424 : #endif
425 : endif
426 :
427 : ! if (detailed_timing) then
428 : ! call xicex_timer_start('xicex_reprosum_prebarrier')
429 : ! call xicex_mpi_barrier(mpi_comm,subname)
430 : ! call xicex_timer_stop ('xicex_reprosum_prebarrier')
431 : ! endif
432 :
433 0 : if ( use_ddpdd_sum ) then
434 :
435 : ! if (detailed_timing) call xicex_timer_start('ice_reprosum_ddpdd')
436 :
437 : call ice_reprosum_ddpdd(arr, arr_gsum, nsummands, dsummands, &
438 0 : nflds, mpi_comm)
439 0 : repro_sum_fast = 1
440 :
441 : ! if (detailed_timing) call xicex_timer_stop('ice_reprosum_ddpdd')
442 :
443 : else
444 :
445 : ! if (detailed_timing) call xicex_timer_start('ice_reprosum_int')
446 :
447 : ! get number of MPI tasks
448 : #ifdef SERIAL_REMOVE_MPI
449 0 : tasks = 1
450 0 : mype = 0
451 : #else
452 : call mpi_comm_size(mpi_comm, tasks, ierr)
453 : call mpi_comm_rank(mpi_comm, mype, ierr)
454 : #endif
455 :
456 : ! get number of OpenMP threads
457 : #ifdef _OPENMP
458 : omp_nthreads = omp_get_max_threads()
459 : #else
460 0 : omp_nthreads = 1
461 : #endif
462 :
463 : ! see if have sufficient information to not require max/min allreduce
464 0 : recompute = .true.
465 0 : validate = .false.
466 0 : if ( present(arr_gbl_max) .and. present(arr_max_levels) ) then
467 0 : recompute = .false.
468 :
469 : ! setting lower bound on max_level*nflds to be 64 to improve OpenMP
470 : ! performance for loopb in ice_reprosum_int
471 0 : max_level = (64/nflds) + 1
472 0 : do ifld=1,nflds
473 0 : if ((arr_gbl_max(ifld) .ge. 0.0_r8) .and. &
474 0 : (arr_max_levels(ifld) > 0)) then
475 :
476 0 : arr_gmax_exp(ifld) = exponent(arr_gbl_max(ifld))
477 0 : if (max_level < arr_max_levels(ifld)) &
478 0 : max_level = arr_max_levels(ifld)
479 :
480 : else
481 0 : recompute = .true.
482 : endif
483 : enddo
484 :
485 0 : if (.not. recompute) then
486 :
487 : ! determine maximum number of summands in local phases of the
488 : ! algorithm
489 : ! if (detailed_timing) call xicex_timer_start("repro_sum_allr_max")
490 0 : if ( present(gbl_max_nsummands) ) then
491 0 : if (gbl_max_nsummands < 1) then
492 : #ifdef SERIAL_REMOVE_MPI
493 0 : max_nsummands = nsummands
494 : #else
495 : call mpi_allreduce (nsummands, max_nsummands, 1, &
496 : MPI_INTEGER, MPI_MAX, mpi_comm, ierr)
497 : #endif
498 0 : gbl_max_red = 1
499 : else
500 0 : max_nsummands = gbl_max_nsummands
501 : endif
502 : else
503 : #ifdef SERIAL_REMOVE_MPI
504 0 : max_nsummands = nsummands
505 : #else
506 : call mpi_allreduce (nsummands, max_nsummands, 1, &
507 : MPI_INTEGER, MPI_MAX, mpi_comm, ierr)
508 : #endif
509 0 : gbl_max_red = 1
510 : endif
511 : ! if (detailed_timing) call xicex_timer_stop("repro_sum_allr_max")
512 :
513 : ! determine maximum shift. Shift needs to be small enough that summation
514 : ! does not exceed maximum number of digits in i8.
515 :
516 : ! if requested, return max_nsummands before it is redefined
517 0 : if ( present( gbl_max_nsummands_out) ) then
518 0 : gbl_max_nsummands_out = max_nsummands
519 : endif
520 :
521 : ! summing within each thread first
522 0 : max_nsummands = (max_nsummands/omp_nthreads) + 1
523 : ! then over threads and tasks
524 0 : max_nsummands = max(max_nsummands, tasks*omp_nthreads)
525 :
526 0 : xmax_nsummands = dble(max_nsummands)
527 0 : arr_max_shift = digits(0_i8) - (exponent(xmax_nsummands) + 1)
528 0 : if (arr_max_shift < 2) then
529 0 : write(tmpstr,*) subname//' ERROR: number of summands too large for fixed precision algorithm'
530 0 : call abort_ice(tmpstr,file=__FILE__,line=__LINE__)
531 : endif
532 :
533 : ! calculate sum
534 0 : if (present(repro_sum_validate)) then
535 0 : validate = repro_sum_validate
536 : else
537 0 : validate = .true.
538 : endif
539 : call ice_reprosum_int(arr, arr_gsum, nsummands, dsummands, &
540 : nflds, arr_max_shift, arr_gmax_exp, & ! LCOV_EXCL_LINE
541 : arr_max_levels, max_level, validate, & ! LCOV_EXCL_LINE
542 0 : recompute, omp_nthreads, mpi_comm)
543 :
544 : ! record statistics, etc.
545 0 : repro_sum_fast = 1
546 0 : if (recompute) then
547 0 : repro_sum_both = 1
548 : else
549 : ! if requested, return specified levels and upper bounds on maxima
550 0 : if ( present(arr_max_levels_out) ) then
551 0 : do ifld=1,nflds
552 0 : arr_max_levels_out(ifld) = arr_max_levels(ifld)
553 : enddo
554 : endif
555 0 : if ( present(arr_gbl_max_out) ) then
556 0 : do ifld=1,nflds
557 0 : arr_gbl_max_out(ifld) = arr_gbl_max(ifld)
558 : enddo
559 : endif
560 : endif
561 : endif
562 : endif
563 :
564 : ! do not have sufficient information; calculate global max/min and
565 : ! use to compute required number of levels
566 0 : if (recompute) then
567 :
568 : ! record statistic
569 0 : repro_sum_slow = 1
570 :
571 : ! determine maximum and minimum (non-zero) summand values and
572 : ! maximum number of local summands
573 :
574 : ! allocate thread-specific work space
575 0 : allocate(arr_tlmax_exp(nflds,omp_nthreads))
576 0 : allocate(arr_tlmin_exp(nflds,omp_nthreads))
577 0 : allocate(isum_beg(omp_nthreads))
578 0 : allocate(isum_end(omp_nthreads))
579 :
580 : ! split summand index range over OpenMP threads
581 0 : call split_indices(nsummands, omp_nthreads, isum_beg, isum_end)
582 :
583 : !$omp parallel do &
584 : !$omp default(shared) & ! LCOV_EXCL_LINE
585 : !$omp private(ithread, ifld, isum, arr_exp, arr_exp_tlmin, arr_exp_tlmax)
586 0 : do ithread=1,omp_nthreads
587 : ! if (detailed_timing) call xicex_timer_start('repro_sum_loopa')
588 0 : do ifld=1,nflds
589 0 : arr_exp_tlmin = MAXEXPONENT(1._r8)
590 0 : arr_exp_tlmax = MINEXPONENT(1._r8)
591 0 : do isum=isum_beg(ithread),isum_end(ithread)
592 0 : if (arr(isum,ifld) .ne. 0.0_r8) then
593 0 : arr_exp = exponent(arr(isum,ifld))
594 0 : arr_exp_tlmin = min(arr_exp,arr_exp_tlmin)
595 0 : arr_exp_tlmax = max(arr_exp,arr_exp_tlmax)
596 : endif
597 : end do
598 0 : arr_tlmin_exp(ifld,ithread) = arr_exp_tlmin
599 0 : arr_tlmax_exp(ifld,ithread) = arr_exp_tlmax
600 : end do
601 : ! if (detailed_timing) call xicex_timer_stop('repro_sum_loopa')
602 : end do
603 :
604 0 : do ifld=1,nflds
605 0 : arr_lmax_exp(ifld) = maxval(arr_tlmax_exp(ifld,:))
606 0 : arr_lmin_exp(ifld) = minval(arr_tlmin_exp(ifld,:))
607 : end do
608 0 : deallocate(arr_tlmin_exp,arr_tlmax_exp,isum_beg,isum_end)
609 :
610 0 : arr_lextremes(0,:) = -nsummands
611 0 : arr_lextremes(1:nflds,1) = -arr_lmax_exp(:)
612 0 : arr_lextremes(1:nflds,2) = arr_lmin_exp(:)
613 : ! if (detailed_timing) call xicex_timer_start("repro_sum_allr_minmax")
614 : #ifdef SERIAL_REMOVE_MPI
615 0 : arr_gextremes = arr_lextremes
616 : #else
617 : call mpi_allreduce (arr_lextremes, arr_gextremes, 2*(nflds+1), &
618 : MPI_INTEGER, MPI_MIN, mpi_comm, ierr)
619 : #endif
620 : ! if (detailed_timing) call xicex_timer_stop("repro_sum_allr_minmax")
621 0 : max_nsummands = -arr_gextremes(0,1)
622 0 : arr_gmax_exp(:) = -arr_gextremes(1:nflds,1)
623 0 : arr_gmin_exp(:) = arr_gextremes(1:nflds,2)
624 :
625 : ! if a field is identically zero, arr_gmin_exp still equals MAXEXPONENT
626 : ! and arr_gmax_exp still equals MINEXPONENT. In this case, set
627 : ! arr_gmin_exp = arr_gmax_exp = MINEXPONENT
628 0 : do ifld=1,nflds
629 0 : arr_gmin_exp(ifld) = min(arr_gmax_exp(ifld),arr_gmin_exp(ifld))
630 : enddo
631 :
632 : ! if requested, return upper bounds on observed maxima
633 0 : if ( present(arr_gbl_max_out) ) then
634 0 : do ifld=1,nflds
635 0 : arr_gbl_max_out(ifld) = scale(1.0_r8,arr_gmax_exp(ifld))
636 : enddo
637 : endif
638 :
639 : ! if requested, return max_nsummands before it is redefined
640 0 : if ( present( gbl_max_nsummands_out) ) then
641 0 : gbl_max_nsummands_out = max_nsummands
642 : endif
643 :
644 : ! determine maximum shift (same as in previous branch, but with calculated
645 : ! max_nsummands). Shift needs to be small enough that summation does not
646 : ! exceed maximum number of digits in i8.
647 :
648 : ! summing within each thread first
649 0 : max_nsummands = (max_nsummands/omp_nthreads) + 1
650 : ! then over threads and tasks
651 0 : max_nsummands = max(max_nsummands, tasks*omp_nthreads)
652 :
653 0 : xmax_nsummands = dble(max_nsummands)
654 0 : arr_max_shift = digits(0_i8) - (exponent(xmax_nsummands) + 1)
655 0 : if (arr_max_shift < 2) then
656 0 : write(tmpstr,*) subname//' ERROR: number of summands too large for fixed precision algorithm'
657 0 : call abort_ice(tmpstr,file=__FILE__,line=__LINE__)
658 : endif
659 :
660 : ! determine maximum number of levels required for each field
661 : ! ((digits(0_i8) + (arr_gmax_exp(ifld)-arr_gmin_exp(ifld))) / arr_max_shift)
662 : ! + 1 because first truncation probably does not involve a maximal shift
663 : ! + 1 to guarantee that the integer division rounds up (not down)
664 : ! (setting lower bound on max_level*nflds to be 64 to improve OpenMP
665 : ! performance for loopb in ice_reprosum_int)
666 0 : max_level = (64/nflds) + 1
667 0 : do ifld=1,nflds
668 : max_levels(ifld) = 2 + &
669 : ((digits(0_i8) + (arr_gmax_exp(ifld)-arr_gmin_exp(ifld))) & ! LCOV_EXCL_LINE
670 0 : / arr_max_shift)
671 0 : if ( present(arr_max_levels) .and. (.not. validate) ) then
672 : ! if validate true, then computation with arr_max_levels failed
673 : ! previously
674 0 : if ( arr_max_levels(ifld) > 0 ) then
675 : max_levels(ifld) = &
676 0 : min(arr_max_levels(ifld),max_levels(ifld))
677 : endif
678 : endif
679 0 : if (max_level < max_levels(ifld)) &
680 0 : max_level = max_levels(ifld)
681 : enddo
682 :
683 : ! if requested, return calculated levels
684 0 : if ( present(arr_max_levels_out) ) then
685 0 : do ifld=1,nflds
686 0 : arr_max_levels_out(ifld) = max_levels(ifld)
687 : enddo
688 : endif
689 :
690 : ! calculate sum
691 0 : validate = .false.
692 : call ice_reprosum_int(arr, arr_gsum, nsummands, dsummands, nflds, &
693 : arr_max_shift, arr_gmax_exp, max_levels, & ! LCOV_EXCL_LINE
694 : max_level, validate, recompute, & ! LCOV_EXCL_LINE
695 0 : omp_nthreads, mpi_comm)
696 :
697 : endif
698 :
699 : ! if (detailed_timing) call xicex_timer_stop('ice_reprosum_int')
700 :
701 : endif
702 :
703 : ! compare fixed and floating point results
704 0 : if ( present(rel_diff) ) then
705 0 : if (ice_reprosum_reldiffmax >= 0.0_r8) then
706 :
707 : ! if (detailed_timing) then
708 : ! call xicex_timer_start('xicex_nonreprosum_prebarrier')
709 : ! call xicex_mpi_barrier(mpi_comm,subname)
710 : ! call xicex_timer_stop ('xicex_nonreprosum_prebarrier')
711 : ! endif
712 :
713 : ! if (detailed_timing) call xicex_timer_start('nonrepro_sum')
714 : ! record statistic
715 0 : nonrepro_sum = 1
716 : ! compute nonreproducible sum
717 0 : arr_lsum(:) = 0._r8
718 : !$omp parallel do &
719 : !$omp default(shared) & ! LCOV_EXCL_LINE
720 : !$omp private(ifld, isum)
721 0 : do ifld=1,nflds
722 0 : do isum=1,nsummands
723 0 : arr_lsum(ifld) = arr(isum,ifld) + arr_lsum(ifld)
724 : end do
725 : end do
726 :
727 : #ifdef SERIAL_REMOVE_MPI
728 0 : arr_gsum_fast = arr_lsum
729 : #else
730 : call mpi_allreduce (arr_lsum, arr_gsum_fast, nflds, &
731 : MPI_REAL8, MPI_SUM, mpi_comm, ierr)
732 : #endif
733 :
734 : ! if (detailed_timing) call xicex_timer_stop('nonrepro_sum')
735 :
736 : ! determine differences
737 : !$omp parallel do &
738 : !$omp default(shared) & ! LCOV_EXCL_LINE
739 : !$omp private(ifld, abs_diff)
740 0 : do ifld=1,nflds
741 0 : abs_diff = abs(arr_gsum_fast(ifld)-arr_gsum(ifld))
742 0 : if (abs(arr_gsum(ifld)) > abs_diff) then
743 0 : rel_diff(1,ifld) = abs_diff/abs(arr_gsum(ifld))
744 : else
745 0 : rel_diff(1,ifld) = abs_diff
746 : endif
747 0 : rel_diff(2,ifld) = abs_diff
748 : enddo
749 : else
750 0 : rel_diff(:,:) = 0.0_r8
751 : endif
752 : endif
753 :
754 : ! return statistics
755 0 : if ( present(repro_sum_stats) ) then
756 0 : repro_sum_stats(1) = repro_sum_stats(1) + repro_sum_fast
757 0 : repro_sum_stats(2) = repro_sum_stats(2) + repro_sum_slow
758 0 : repro_sum_stats(3) = repro_sum_stats(3) + repro_sum_both
759 0 : repro_sum_stats(4) = repro_sum_stats(4) + nonrepro_sum
760 0 : repro_sum_stats(5) = repro_sum_stats(5) + gbl_max_red
761 : endif
762 :
763 :
764 0 : end subroutine ice_reprosum_calc
765 :
766 : !========================================================================
767 : !
768 : ! Purpose:
769 : !> Compute the global sum of each field in "arr" using the indicated
770 : !> communicator with a reproducible yet scalable implementation based
771 : !> on a fixed point algorithm. The accuracy of the fixed point algorithm
772 : !> is controlled by the number of "levels" of integer expansion, the
773 : !> maximum value of which is specified by max_level.
774 : !
775 : !----------------------------------------------------------------------
776 :
777 0 : subroutine ice_reprosum_int (arr, arr_gsum, nsummands, dsummands, nflds, &
778 : arr_max_shift, arr_gmax_exp, max_levels, & ! LCOV_EXCL_LINE
779 : max_level, validate, recompute, & ! LCOV_EXCL_LINE
780 : omp_nthreads, mpi_comm )
781 :
782 : !----------------------------------------------------------------------
783 :
784 : ! Arguments
785 :
786 : integer, intent(in) :: nsummands !< number of local summands
787 : integer, intent(in) :: dsummands !< declared first dimension
788 : integer, intent(in) :: nflds !< number of fields
789 : integer, intent(in) :: arr_max_shift !< maximum safe exponent for
790 : !< value < 1 (so that sum
791 : !< does not overflow)
792 : integer, intent(in) :: arr_gmax_exp(nflds)
793 : !< exponents of global maxima
794 : integer, intent(in) :: max_levels(nflds)
795 : !< maximum number of levels
796 : !< of integer expansion
797 : integer, intent(in) :: max_level !< maximum value in
798 : !< max_levels
799 : integer, intent(in) :: omp_nthreads !< number of OpenMP threads
800 : integer, intent(in) :: mpi_comm !< MPI subcommunicator
801 :
802 : real(r8), intent(in) :: arr(dsummands,nflds)
803 : !< input array
804 :
805 : logical, intent(in):: validate
806 : !< flag indicating that accuracy of solution generated from
807 : !< arr_gmax_exp and max_levels should be tested
808 :
809 : logical, intent(out):: recompute
810 : !< flag indicating that either the upper bounds are inaccurate,
811 : !< or max_levels and arr_gmax_exp do not generate accurate
812 : !< enough sums
813 :
814 : real(r8), intent(out):: arr_gsum(nflds) !< global means
815 :
816 : ! Local workspace
817 :
818 : integer, parameter :: max_jlevel = &
819 : 1 + (digits(0_i8)/digits(0.0_r8))
820 :
821 0 : integer(i8) :: i8_arr_tlsum_level(0:max_level,nflds,omp_nthreads)
822 : ! integer vector representing local
823 : ! sum (per thread, per field)
824 0 : integer(i8) :: i8_arr_lsum_level((max_level+3)*nflds)
825 : ! integer vector representing local
826 : ! sum
827 : integer(i8) :: i8_arr_level ! integer part of summand for current
828 : ! expansion level
829 0 : integer(i8) :: i8_arr_gsum_level((max_level+3)*nflds)
830 : ! integer vector representing global
831 : ! sum
832 : integer(i8) :: IX_8 ! integer representation of current
833 : ! jlevels of X_8 ('part' of
834 : ! i8_arr_gsum_level)
835 : integer(i8) :: i8_sign ! sign global sum
836 : integer(i8) :: i8_radix ! radix for i8 variables
837 :
838 0 : integer :: max_error(nflds,omp_nthreads)
839 : ! accurate upper bound on data?
840 0 : integer :: not_exact(nflds,omp_nthreads)
841 : ! max_levels sufficient to
842 : ! capture all digits?
843 0 : integer :: isum_beg(omp_nthreads), isum_end(omp_nthreads)
844 : ! range of summand indices for each
845 : ! OpenMP thread
846 : integer :: ifld, isum, ithread
847 : ! loop variables
848 : integer :: arr_exp ! exponent of summand
849 : integer :: arr_shift ! exponent used to generate integer
850 : ! for current expansion level
851 : integer :: ilevel ! current integer expansion level
852 0 : integer :: offset(nflds) ! beginning location in
853 : ! i8_arr_{g,l}sum_level for integer
854 : ! expansion of current ifld
855 : integer :: voffset ! modification to offset used to
856 : ! include validation metrics
857 : integer :: ioffset ! offset(ifld)
858 : integer :: jlevel ! number of floating point 'pieces'
859 : ! extracted from a given i8 integer
860 : integer :: ierr ! MPI error return
861 : integer :: LX(max_jlevel) ! exponent of X_8 (see below)
862 : integer :: veclth ! total length of i8_arr_lsum_level
863 : integer :: sum_digits ! lower bound on number of significant
864 : ! in integer expansion of sum
865 : integer :: curr_exp ! exponent of partial sum during
866 : ! reconstruction from integer vector
867 : integer :: corr_exp ! exponent of current summand in
868 : ! reconstruction from integer vector
869 :
870 : real(r8) :: arr_frac ! fraction of summand
871 : real(r8) :: arr_remainder ! part of summand remaining after
872 : ! current level of integer expansion
873 : real(r8) :: X_8(max_jlevel) ! r8 vector representation of current
874 : ! i8_arr_gsum_level
875 : real(r8) :: RX_8 ! r8 representation of difference
876 : ! between current i8_arr_gsum_level
877 : ! and current jlevels of X_8
878 : ! (== IX_8). Also used in final
879 : ! scaling step
880 :
881 : logical :: first ! flag used to indicate that just
882 : ! beginning reconstruction of sum
883 : ! from integer vector
884 :
885 : character(len=*),parameter :: subname = '(ice_reprosum_int)'
886 :
887 : !-----------------------------------------------------------------------
888 : ! Save radix of i8 variables in an i8 variable
889 0 : i8_radix = radix(IX_8)
890 :
891 : ! If validating upper bounds, reserve space for validation metrics
892 : ! In both cases, reserve an extra level for overflow from the top level
893 0 : if (validate) then
894 0 : voffset = 3
895 : else
896 0 : voffset = 1
897 : endif
898 :
899 : ! compute offsets for each field
900 0 : offset(1) = voffset
901 0 : do ifld=2,nflds
902 : offset(ifld) = offset(ifld-1) &
903 0 : + (max_levels(ifld-1) + voffset)
904 : enddo
905 0 : veclth = offset(nflds) + max_levels(nflds)
906 :
907 : ! split summand index range over OpenMP threads
908 0 : call split_indices(nsummands, omp_nthreads, isum_beg, isum_end)
909 :
910 : ! convert local summands to vector of integers and sum
911 : ! (Using scale instead of set_exponent because arr_remainder may not be
912 : ! "normal" after level 1 calculation)
913 0 : i8_arr_lsum_level(:) = 0_i8
914 :
915 : !$omp parallel do &
916 : !$omp default(shared) & ! LCOV_EXCL_LINE
917 : !$omp private(ithread, ifld, ioffset, isum, arr_frac, arr_exp, & ! LCOV_EXCL_LINE
918 : !$omp arr_shift, ilevel, i8_arr_level, arr_remainder, RX_8, IX_8)
919 0 : do ithread=1,omp_nthreads
920 : ! if (detailed_timing) call xicex_timer_start('repro_sum_loopb')
921 0 : do ifld=1,nflds
922 0 : ioffset = offset(ifld)
923 :
924 0 : max_error(ifld,ithread) = 0
925 0 : not_exact(ifld,ithread) = 0
926 :
927 0 : i8_arr_tlsum_level(:,ifld,ithread) = 0_i8
928 0 : do isum=isum_beg(ithread),isum_end(ithread)
929 0 : arr_remainder = 0.0_r8
930 :
931 0 : if (arr(isum,ifld) .ne. 0.0_r8) then
932 0 : arr_exp = exponent(arr(isum,ifld))
933 0 : arr_frac = fraction(arr(isum,ifld))
934 :
935 : ! test that global maximum upper bound is an upper bound
936 0 : if (arr_exp > arr_gmax_exp(ifld)) then
937 0 : max_error(ifld,ithread) = 1
938 0 : exit
939 : endif
940 :
941 : ! calculate first shift
942 0 : arr_shift = arr_max_shift - (arr_gmax_exp(ifld)-arr_exp)
943 :
944 : ! determine first (probably) nonzero level (assuming initial fraction is
945 : ! 'normal' - algorithm still works if this is not true)
946 : ! NOTE: this is critical; scale will set to zero if min exponent is too small.
947 0 : if (arr_shift < 1) then
948 0 : ilevel = (1 + (arr_gmax_exp(ifld)-arr_exp))/arr_max_shift
949 0 : arr_shift = ilevel*arr_max_shift - (arr_gmax_exp(ifld)-arr_exp)
950 :
951 0 : do while (arr_shift < 1)
952 0 : arr_shift = arr_shift + arr_max_shift
953 0 : ilevel = ilevel + 1
954 : enddo
955 : else
956 0 : ilevel = 1
957 : endif
958 :
959 0 : if (ilevel .le. max_levels(ifld)) then
960 : ! apply first shift/truncate, add it to the relevant running
961 : ! sum, and calculate the remainder.
962 0 : arr_remainder = scale(arr_frac,arr_shift)
963 0 : i8_arr_level = int(arr_remainder,i8)
964 : i8_arr_tlsum_level(ilevel,ifld,ithread) = &
965 0 : i8_arr_tlsum_level(ilevel,ifld,ithread) + i8_arr_level
966 0 : arr_remainder = arr_remainder - i8_arr_level
967 :
968 : ! while the remainder is non-zero, continue to shift, truncate,
969 : ! sum, and calculate new remainder
970 : do while ((arr_remainder .ne. 0.0_r8) &
971 0 : .and. (ilevel < max_levels(ifld)))
972 0 : ilevel = ilevel + 1
973 0 : arr_remainder = scale(arr_remainder,arr_max_shift)
974 0 : i8_arr_level = int(arr_remainder,i8)
975 : i8_arr_tlsum_level(ilevel,ifld,ithread) = &
976 0 : i8_arr_tlsum_level(ilevel,ifld,ithread) + i8_arr_level
977 0 : arr_remainder = arr_remainder - i8_arr_level
978 : enddo
979 :
980 : endif
981 : endif
982 :
983 0 : if (arr_remainder .ne. 0.0_r8) then
984 0 : not_exact(ifld,ithread) = 1
985 : endif
986 :
987 : enddo
988 :
989 : ! postprocess integer vector to eliminate potential for overlap in the following
990 : ! sums over threads and processes: if value larger than or equal to
991 : ! (radix(IX_8)**arr_max_shift), add this 'overlap' to next larger integer in
992 : ! vector, resulting in nonoverlapping ranges for each component. Note that
993 : ! "ilevel-1==0" corresponds to an extra level used to guarantee that the sums
994 : ! over threads and processes do not overflow for ilevel==1.
995 0 : do ilevel=max_levels(ifld),1,-1
996 0 : RX_8 = i8_arr_tlsum_level(ilevel,ifld,ithread)
997 0 : IX_8 = int(scale(RX_8,-arr_max_shift),i8)
998 0 : if (IX_8 .ne. 0_i8) then
999 : i8_arr_tlsum_level(ilevel-1,ifld,ithread) = &
1000 0 : i8_arr_tlsum_level(ilevel-1,ifld,ithread) + IX_8
1001 0 : IX_8 = IX_8*(i8_radix**arr_max_shift)
1002 : i8_arr_tlsum_level(ilevel,ifld,ithread) = &
1003 0 : i8_arr_tlsum_level(ilevel,ifld,ithread) - IX_8
1004 : endif
1005 : enddo
1006 : enddo
1007 : ! if (detailed_timing) call xicex_timer_stop('repro_sum_loopb')
1008 : enddo
1009 :
1010 : ! sum contributions from different threads
1011 0 : do ifld=1,nflds
1012 0 : ioffset = offset(ifld)
1013 0 : do ithread = 1,omp_nthreads
1014 0 : do ilevel = 0,max_levels(ifld)
1015 : i8_arr_lsum_level(ioffset+ilevel) = &
1016 : i8_arr_lsum_level(ioffset+ilevel) & ! LCOV_EXCL_LINE
1017 0 : + i8_arr_tlsum_level(ilevel,ifld,ithread)
1018 : enddo
1019 : enddo
1020 : enddo
1021 :
1022 : ! record if upper bound was inaccurate or if level expansion stopped
1023 : ! before full accuracy was achieved
1024 0 : if (validate) then
1025 0 : do ifld=1,nflds
1026 0 : ioffset = offset(ifld)
1027 0 : i8_arr_lsum_level(ioffset-voffset+1) = maxval(max_error(ifld,:))
1028 0 : i8_arr_lsum_level(ioffset-voffset+2) = maxval(not_exact(ifld,:))
1029 : enddo
1030 : endif
1031 :
1032 : ! sum integer vector element-wise
1033 : #ifdef SERIAL_REMOVE_MPI
1034 0 : i8_arr_gsum_level = i8_arr_lsum_level
1035 : #else
1036 : #if defined (NO_I8)
1037 : ! Workaround for when i8 is not supported.
1038 : ! if (detailed_timing) call xicex_timer_start("repro_sum_allr_i4")
1039 : call mpi_allreduce (i8_arr_lsum_level, i8_arr_gsum_level, &
1040 : veclth, MPI_INTEGER, MPI_SUM, mpi_comm, ierr)
1041 : ! if (detailed_timing) call xicex_timer_stop("repro_sum_allr_i4")
1042 : #else
1043 : ! if (detailed_timing) call xicex_timer_start("repro_sum_allr_i8")
1044 : call mpi_allreduce (i8_arr_lsum_level, i8_arr_gsum_level, &
1045 : veclth, MPI_INTEGER8, MPI_SUM, mpi_comm, ierr)
1046 : ! if (detailed_timing) call xicex_timer_stop("repro_sum_allr_i8")
1047 : #endif
1048 : #endif
1049 :
1050 : ! Construct global sum from integer vector representation:
1051 : ! 1) arr_max_shift is the shift applied to fraction(arr_gmax) .
1052 : ! When shifting back, need to "add back in" true arr_gmax exponent. This was
1053 : ! removed implicitly by working only with the fraction .
1054 : ! 2) want to add levels into sum in reverse order (smallest to largest). However,
1055 : ! even this can generate floating point rounding errors if signs of integers
1056 : ! alternate. To avoid this, do some arithmetic with integer vectors so that all
1057 : ! components have the same sign. This should keep relative difference between
1058 : ! using different integer sizes (e.g. i8 and i4) to machine epsilon
1059 : ! 3) assignment to X_8 will usually lose accuracy since maximum integer
1060 : ! size is greater than the max number of 'digits' in r8 value (if xmax_nsummands
1061 : ! correction not very large). Calculate remainder and add in first (since
1062 : ! smaller). One correction is sufficient for r8 (53 digits) and i8 (63 digits).
1063 : ! For r4 (24 digits) may need to correct twice. Code is written in a general
1064 : ! fashion, to work no matter how many corrections are necessary (assuming
1065 : ! max_jlevel parameter calculation is correct).
1066 :
1067 0 : recompute = .false.
1068 0 : do ifld=1,nflds
1069 0 : arr_gsum(ifld) = 0.0_r8
1070 0 : ioffset = offset(ifld)
1071 :
1072 : ! if validate is .true., test whether the summand upper bound
1073 : ! was exceeded on any of the processes
1074 0 : if (validate) then
1075 0 : if (i8_arr_gsum_level(ioffset-voffset+1) .ne. 0_i8) then
1076 0 : recompute = .true.
1077 : endif
1078 : endif
1079 :
1080 0 : if (.not. recompute) then
1081 :
1082 : ! preprocess integer vector:
1083 : ! a) if value larger than or equal to (radix(IX_8)**arr_max_shift), add this 'overlap'
1084 : ! to next larger integer in vector, resulting in nonoverlapping ranges for each
1085 : ! component. Note that have "ilevel-1=0" level here as described above.
1086 0 : do ilevel=max_levels(ifld),1,-1
1087 0 : RX_8 = i8_arr_gsum_level(ioffset+ilevel)
1088 0 : IX_8 = int(scale(RX_8,-arr_max_shift),i8)
1089 0 : if (IX_8 .ne. 0_i8) then
1090 : i8_arr_gsum_level(ioffset+ilevel-1) = i8_arr_gsum_level(ioffset+ilevel-1) &
1091 0 : + IX_8
1092 0 : IX_8 = IX_8*(i8_radix**arr_max_shift)
1093 : i8_arr_gsum_level(ioffset+ilevel) = i8_arr_gsum_level(ioffset+ilevel) &
1094 0 : - IX_8
1095 : endif
1096 : enddo
1097 : ! b) subtract +/- 1 from larger and add +/- 1 to smaller when necessary
1098 : ! so that all vector components have the same sign (eliminating loss
1099 : ! of accuracy arising from difference of large values when
1100 : ! reconstructing r8 sum from integer vector)
1101 0 : ilevel = 0
1102 : do while ((i8_arr_gsum_level(ioffset+ilevel) .eq. 0_i8) &
1103 0 : .and. (ilevel < max_levels(ifld)))
1104 0 : ilevel = ilevel + 1
1105 : enddo
1106 : !
1107 0 : if (ilevel < max_levels(ifld)) then
1108 0 : if (i8_arr_gsum_level(ioffset+ilevel) > 0_i8) then
1109 0 : i8_sign = 1_i8
1110 : else
1111 0 : i8_sign = -1_i8
1112 : endif
1113 0 : do jlevel=ilevel,max_levels(ifld)-1
1114 0 : if (sign(1_i8,i8_arr_gsum_level(ioffset+jlevel)) &
1115 0 : .ne. sign(1_i8,i8_arr_gsum_level(ioffset+jlevel+1))) then
1116 : i8_arr_gsum_level(ioffset+jlevel) = i8_arr_gsum_level(ioffset+jlevel) &
1117 0 : - i8_sign
1118 : i8_arr_gsum_level(ioffset+jlevel+1) = i8_arr_gsum_level(ioffset+jlevel+1) &
1119 0 : + i8_sign*(i8_radix**arr_max_shift)
1120 : endif
1121 : enddo
1122 : endif
1123 :
1124 : ! start with maximum shift, and work up to larger values
1125 : arr_shift = arr_gmax_exp(ifld) &
1126 0 : - max_levels(ifld)*arr_max_shift
1127 0 : curr_exp = 0
1128 0 : first = .true.
1129 0 : do ilevel=max_levels(ifld),0,-1
1130 :
1131 0 : if (i8_arr_gsum_level(ioffset+ilevel) .ne. 0_i8) then
1132 0 : jlevel = 1
1133 :
1134 : ! r8 representation of higher order bits in integer
1135 0 : X_8(jlevel) = i8_arr_gsum_level(ioffset+ilevel)
1136 0 : LX(jlevel) = exponent(X_8(jlevel))
1137 :
1138 : ! calculate remainder
1139 0 : IX_8 = int(X_8(jlevel),i8)
1140 0 : RX_8 = (i8_arr_gsum_level(ioffset+ilevel) - IX_8)
1141 :
1142 : ! repeat using remainder
1143 0 : do while (RX_8 .ne. 0.0_r8)
1144 0 : jlevel = jlevel + 1
1145 0 : X_8(jlevel) = RX_8
1146 0 : LX(jlevel) = exponent(RX_8)
1147 0 : IX_8 = IX_8 + int(RX_8,i8)
1148 0 : RX_8 = (i8_arr_gsum_level(ioffset+ilevel) - IX_8)
1149 : enddo
1150 :
1151 : ! add in contributions, smaller to larger, rescaling for each
1152 : ! addition to guarantee that exponent of working summand is always
1153 : ! larger than minexponent
1154 0 : do while (jlevel > 0)
1155 0 : if (first) then
1156 0 : curr_exp = LX(jlevel) + arr_shift
1157 0 : arr_gsum(ifld) = fraction(X_8(jlevel))
1158 0 : first = .false.
1159 : else
1160 0 : corr_exp = curr_exp - (LX(jlevel) + arr_shift)
1161 : arr_gsum(ifld) = fraction(X_8(jlevel)) &
1162 0 : + scale(arr_gsum(ifld),corr_exp)
1163 0 : curr_exp = LX(jlevel) + arr_shift
1164 : endif
1165 0 : jlevel = jlevel - 1
1166 : enddo
1167 :
1168 : endif
1169 :
1170 0 : arr_shift = arr_shift + arr_max_shift
1171 : enddo
1172 :
1173 : ! apply final exponent correction, scaling first if exponent is too small
1174 : ! to apply directly
1175 0 : corr_exp = curr_exp + exponent(arr_gsum(ifld))
1176 0 : if (corr_exp .ge. MINEXPONENT(1._r8)) then
1177 0 : arr_gsum(ifld) = set_exponent(arr_gsum(ifld),corr_exp)
1178 : else
1179 : RX_8 = set_exponent(arr_gsum(ifld), &
1180 0 : corr_exp-MINEXPONENT(1._r8))
1181 0 : arr_gsum(ifld) = scale(RX_8,MINEXPONENT(1._r8))
1182 : endif
1183 :
1184 : ! if validate is .true. and some precision lost, test whether 'too much'
1185 : ! was lost, due to too loose an upper bound, too stringent a limit on number
1186 : ! of levels of expansion, cancellation, .... Calculated by comparing lower
1187 : ! bound on number of sigificant digits with number of digits in 1.0_r8 .
1188 0 : if (validate) then
1189 0 : if (i8_arr_gsum_level(ioffset-voffset+2) .ne. 0_i8) then
1190 :
1191 : ! find first nonzero level and use exponent for this level, then assume all
1192 : ! subsequent levels contribute arr_max_shift digits.
1193 0 : sum_digits = 0
1194 0 : do ilevel=0,max_levels(ifld)
1195 0 : if (sum_digits .eq. 0) then
1196 0 : if (i8_arr_gsum_level(ioffset+ilevel) .ne. 0_i8) then
1197 0 : X_8(1) = i8_arr_gsum_level(ioffset+ilevel)
1198 0 : LX(1) = exponent(X_8(1))
1199 0 : sum_digits = LX(1)
1200 : endif
1201 : else
1202 0 : sum_digits = sum_digits + arr_max_shift
1203 : endif
1204 : enddo
1205 :
1206 0 : if (sum_digits < digits(1.0_r8)) then
1207 0 : recompute = .true.
1208 : endif
1209 : endif
1210 : endif
1211 :
1212 : endif
1213 :
1214 : enddo
1215 :
1216 :
1217 0 : end subroutine ice_reprosum_int
1218 :
1219 : !========================================================================
1220 : !
1221 : ! Purpose:
1222 : !> Test whether distributed sum exceeds tolerance and print out a
1223 : !> warning message.
1224 : !
1225 : !----------------------------------------------------------------------
1226 :
1227 0 : logical function ice_reprosum_tolExceeded (name, nflds, master, &
1228 0 : logunit, rel_diff )
1229 : !----------------------------------------------------------------------
1230 :
1231 : ! Arguments
1232 :
1233 : character(len=*), intent(in) :: name !< distributed sum identifier
1234 : integer, intent(in) :: nflds !< number of fields
1235 : logical, intent(in) :: master !< process that will write
1236 : !< warning messages?
1237 : integer, optional, intent(in) :: logunit!< unit warning messages
1238 : !< written to
1239 : real(r8), intent(in) :: rel_diff(2,nflds)
1240 : !< relative and absolute
1241 : !< differences between fixed
1242 : !< and floating point sums
1243 :
1244 : ! Local workspace
1245 :
1246 : integer :: llogunit ! local log unit
1247 : integer :: ifld ! field index
1248 : integer :: exceeds_limit ! number of fields whose
1249 : ! sum exceeds tolerance
1250 : real(r8) :: max_rel_diff ! maximum relative difference
1251 : integer :: max_rel_diff_idx ! field index for max. rel. diff.
1252 : real(r8) :: max_abs_diff ! maximum absolute difference
1253 : integer :: max_abs_diff_idx ! field index for max. abs. diff.
1254 : character(len=*),parameter :: subname = '(ice_reprosum_tolExceeded)'
1255 :
1256 : !-----------------------------------------------------------------------
1257 :
1258 0 : ice_reprosum_tolExceeded = .false.
1259 0 : if (ice_reprosum_reldiffmax < 0.0_r8) return
1260 :
1261 0 : if ( present(logunit) ) then
1262 0 : llogunit = logunit
1263 : else
1264 0 : llogunit = nu_diag
1265 : endif
1266 :
1267 : ! check that "fast" reproducible sum is accurate enough.
1268 0 : exceeds_limit = 0
1269 0 : max_rel_diff = 0.0_r8
1270 0 : max_abs_diff = 0.0_r8
1271 0 : do ifld=1,nflds
1272 0 : if (rel_diff(1,ifld) > ice_reprosum_reldiffmax) then
1273 0 : exceeds_limit = exceeds_limit + 1
1274 0 : if (rel_diff(1,ifld) > max_rel_diff) then
1275 0 : max_rel_diff = rel_diff(1,ifld)
1276 0 : max_rel_diff_idx = ifld
1277 : endif
1278 0 : if (rel_diff(2,ifld) > max_abs_diff) then
1279 0 : max_abs_diff = rel_diff(2,ifld)
1280 0 : max_abs_diff_idx = ifld
1281 : endif
1282 : endif
1283 : enddo
1284 :
1285 0 : if (exceeds_limit > 0) then
1286 0 : if (master) then
1287 0 : write(llogunit,*) subname,trim(name), &
1288 : ': difference in fixed and floating point sums ', & ! LCOV_EXCL_LINE
1289 : ' exceeds tolerance in ', exceeds_limit, & ! LCOV_EXCL_LINE
1290 0 : ' fields.'
1291 0 : write(llogunit,*) subname,' Maximum relative diff: (rel)', &
1292 : rel_diff(1,max_rel_diff_idx), ' (abs) ', & ! LCOV_EXCL_LINE
1293 0 : rel_diff(2,max_rel_diff_idx)
1294 0 : write(llogunit,*) subname,' Maximum absolute diff: (rel)', &
1295 : rel_diff(1,max_abs_diff_idx), ' (abs) ', & ! LCOV_EXCL_LINE
1296 0 : rel_diff(2,max_abs_diff_idx)
1297 : endif
1298 0 : ice_reprosum_tolExceeded = .true.
1299 : endif
1300 :
1301 :
1302 0 : end function ice_reprosum_tolExceeded
1303 :
1304 : !========================================================================
1305 : !
1306 : ! Purpose:
1307 : !> Compute the global sum of each field in "arr" using the indicated
1308 : !> communicator with a reproducible yet scalable implementation based
1309 : !> on He and Ding's implementation of the double-double algorithm.
1310 : !
1311 : !----------------------------------------------------------------------
1312 :
1313 0 : subroutine ice_reprosum_ddpdd (arr, arr_gsum, nsummands, dsummands, &
1314 : nflds, mpi_comm )
1315 : !----------------------------------------------------------------------
1316 :
1317 : ! Arguments
1318 :
1319 : integer, intent(in) :: nsummands !< number of local summands
1320 : integer, intent(in) :: dsummands !< declared first dimension
1321 : integer, intent(in) :: nflds !< number of fields
1322 : real(r8), intent(in) :: arr(dsummands,nflds)
1323 : !< input array
1324 : integer, intent(in) :: mpi_comm !< MPI subcommunicator
1325 :
1326 : real(r8), intent(out):: arr_gsum(nflds)
1327 : !< global sums
1328 :
1329 :
1330 : ! Local workspace
1331 :
1332 : integer :: old_cw ! for x86 processors, save
1333 : ! current arithmetic mode
1334 : integer :: ifld, isum ! loop variables
1335 : integer :: ierr ! MPI error return
1336 :
1337 : real(r8) :: e, t1, t2 ! temporaries
1338 0 : complex(r8) :: arr_lsum_dd(nflds) ! local sums (in double-double
1339 : ! format)
1340 0 : complex(r8) :: arr_gsum_dd(nflds) ! global sums (in double-double
1341 : ! format)
1342 :
1343 : integer, save :: mpi_sumdd
1344 : logical, save :: first_time = .true.
1345 : character(len=*),parameter :: subname = '(ice_reprosum_ddpdd)'
1346 :
1347 : !-----------------------------------------------------------------------
1348 :
1349 0 : call ice_shr_reprosumx86_fix_start (old_cw)
1350 :
1351 0 : if (first_time) then
1352 : #ifdef SERIAL_REMOVE_MPI
1353 0 : mpi_sumdd = 0
1354 : #else
1355 : call mpi_op_create(ddpdd, .true., mpi_sumdd, ierr)
1356 : #endif
1357 0 : first_time = .false.
1358 : endif
1359 :
1360 0 : do ifld=1,nflds
1361 0 : arr_lsum_dd(ifld) = (0.0_r8,0.0_r8)
1362 :
1363 0 : do isum=1,nsummands
1364 :
1365 : ! Compute arr(isum,ifld) + arr_lsum_dd(ifld) using Knuth''s
1366 : ! trick.
1367 0 : t1 = arr(isum,ifld) + real(arr_lsum_dd(ifld))
1368 0 : e = t1 - arr(isum,ifld)
1369 : t2 = ((real(arr_lsum_dd(ifld)) - e) &
1370 : + (arr(isum,ifld) - (t1 - e))) & ! LCOV_EXCL_LINE
1371 0 : + aimag(arr_lsum_dd(ifld))
1372 :
1373 : ! The result is t1 + t2, after normalization.
1374 0 : arr_lsum_dd(ifld) = cmplx ( t1 + t2, t2 - ((t1 + t2) - t1), r8 )
1375 : enddo
1376 :
1377 : enddo
1378 :
1379 : #ifdef SERIAL_REMOVE_MPI
1380 0 : arr_gsum_dd = arr_lsum_dd
1381 : #else
1382 : call mpi_allreduce (arr_lsum_dd, arr_gsum_dd, nflds, &
1383 : MPI_COMPLEX16, mpi_sumdd, mpi_comm, ierr)
1384 : #endif
1385 0 : do ifld=1,nflds
1386 0 : arr_gsum(ifld) = real(arr_gsum_dd(ifld))
1387 : enddo
1388 :
1389 0 : call ice_shr_reprosumx86_fix_end (old_cw)
1390 :
1391 0 : end subroutine ice_reprosum_ddpdd
1392 :
1393 : !-----------------------------------------------------------------------
1394 :
1395 0 : subroutine DDPDD (dda, ddb, len, itype)
1396 : !----------------------------------------------------------------------
1397 : !
1398 : ! Purpose:
1399 : ! Modification of original codes written by David H. Bailey
1400 : ! This subroutine computes ddb(i) = dda(i)+ddb(i)
1401 : !
1402 : !----------------------------------------------------------------------
1403 :
1404 : ! Arguments
1405 :
1406 : integer, intent(in) :: len ! array length
1407 : complex(r8), intent(in) :: dda(len) ! input
1408 : complex(r8), intent(inout) :: ddb(len) ! result
1409 : integer, intent(in) :: itype ! unused
1410 :
1411 : ! Local workspace
1412 :
1413 : real(r8) e, t1, t2
1414 : integer i
1415 : character(len=*),parameter :: subname = '(ice_reprosum_mod:DDPDD)'
1416 :
1417 : !-----------------------------------------------------------------------
1418 :
1419 0 : do i = 1, len
1420 : ! Compute dda + ddb using Knuth's trick.
1421 0 : t1 = real(dda(i)) + real(ddb(i))
1422 0 : e = t1 - real(dda(i))
1423 : t2 = ((real(ddb(i)) - e) + (real(dda(i)) - (t1 - e))) &
1424 0 : + aimag(dda(i)) + aimag(ddb(i))
1425 :
1426 : ! The result is t1 + t2, after normalization.
1427 0 : ddb(i) = cmplx ( t1 + t2, t2 - ((t1 + t2) - t1), r8 )
1428 : enddo
1429 :
1430 :
1431 0 : end subroutine DDPDD
1432 :
1433 : !-----------------------------------------------------------------------
1434 :
1435 0 : subroutine split_indices(total,num_pieces,ibeg,iend)
1436 : !----------------------------------------------------------------------
1437 : !
1438 : ! Purpose:
1439 : ! Split range into 'num_pieces'
1440 : !
1441 : !----------------------------------------------------------------------
1442 :
1443 : ! Arguments
1444 :
1445 : integer, intent(in) :: total
1446 : integer, intent(in) :: num_pieces
1447 : integer, intent(out) :: ibeg(num_pieces), iend(num_pieces)
1448 :
1449 : ! Local workspace
1450 :
1451 : integer :: itmp1, itmp2, ioffset, i
1452 : character(len=*),parameter :: subname = '(ice_reprosum_mod:split_indices)'
1453 :
1454 : !-----------------------------------------------------------------------
1455 :
1456 0 : itmp1 = total/num_pieces
1457 0 : itmp2 = mod(total,num_pieces)
1458 0 : ioffset = 0
1459 0 : do i=1,itmp2
1460 0 : ibeg(i) = ioffset + 1
1461 0 : iend(i) = ioffset + (itmp1+1)
1462 0 : ioffset = iend(i)
1463 : enddo
1464 0 : do i=itmp2+1,num_pieces
1465 0 : ibeg(i) = ioffset + 1
1466 0 : if (ibeg(i) > total) then
1467 0 : iend(i) = ibeg(i) - 1
1468 : else
1469 0 : iend(i) = ioffset + itmp1
1470 0 : ioffset = iend(i)
1471 : endif
1472 : enddo
1473 :
1474 0 : end subroutine split_indices
1475 :
1476 : !========================================================================
1477 :
1478 : end module ice_reprosum
|