Differences
This shows you the differences between two versions of the page.
Both sides previous revision Previous revision Next revision | Previous revision | ||
technical:whitepaper:darwin_ucx_openmpi [2021-02-12 16:47] – frey | technical:whitepaper:darwin_ucx_openmpi [2021-02-12 17:15] (current) – frey | ||
---|---|---|---|
Line 72: | Line 72: | ||
The UCX changes implemented in Open MPI 4.1.0 have not been backported to any of the older releases of the software, so this issue does persist. | The UCX changes implemented in Open MPI 4.1.0 have not been backported to any of the older releases of the software, so this issue does persist. | ||
- | < | + | < |
- | Allocate(Rmat(2, | + | Allocate(Rmat(2, |
- | : | + | : |
- | Call MPI_Bcast(Rmat, | + | Call MPI_Bcast(Rmat, |
</ | </ | ||
- | The array in question is 1154079760 bytes in size. | + | The array in question is 1154079760 bytes in size — larger than the 1073741824-byte (1 GiB) limit cited in the error. | ||
+ | |||
+ | The OpenFabrics Alliance produces a unified messaging library similar to UCX called OFI or libfabric. | ||
+ | |||
+ | Unfortunately, | ||
+ | < | ||
+ | Message size 1154079760 bigger than supported by selected transport. Max = 1073741824 | ||
+ | [r2x00: | ||
+ | </ | ||
+ | The size cited is exactly the size of the ''Rmat'' array being broadcast. | ||
+ | |||
+ | So the implication is that Open MPI releases prior to 4.1.0 cannot internally handle an ''MPI_Bcast'' of a message larger than 1 GiB. | ||
+ | |||
+ | ===== Chunked Broadcast ===== | ||
+ | |||
+ | Open MPI MTL does not yet implement " | ||
+ | |||
+ | <file fortran mpi_utils.f90> | ||
+ | Module mpi_utils | ||
+ | ! | ||
+ | ! This module implements MPI broadcast functions that break a buffer | ||
+ | ! that may be too large into multiple chunks. | ||
+ | ! to each of the subroutines dictates how many 8-byte elements should | ||
+ | ! be passed to each underlying MPI_Bcast(). | ||
+ | ! | ||
+ | ! If the < | ||
+ | ! is checked for an integer value. | ||
+ | ! exceeds the maximum threshold, the maximum threshold is used. The | ||
+ | ! maximum equates to 1 GiB worth of elements. | ||
+ | ! | ||
+ | Implicit None | ||
+ | |||
+ | Private | ||
+ | |||
+ | Public :: BroadcastI, BroadcastR, BroadcastD | ||
+ | |||
+ | Integer, Parameter :: MaxStride8Byte = 134217728 | ||
+ | |||
+ | Contains | ||
+ | |||
+ | Subroutine BroadcastI(buffer, | ||
+ | Use mpi_f08 | ||
+ | Implicit None | ||
+ | |||
+ | Type(MPI_Comm), | ||
+ | Integer, Intent(In) | ||
+ | Integer, Dimension(*), | ||
+ | Integer, Intent(InOut) | ||
+ | Character(Len=255) | ||
+ | Integer | ||
+ | |||
+ | If (stride .le. 0) Then | ||
+ | Call GetEnv(' | ||
+ | Read(envvar, | ||
+ | If (use_stride .le. 0) use_stride = MaxStride8Byte * 2 | ||
+ | Else | ||
+ | use_stride = stride | ||
+ | End If | ||
+ | If (stride .gt. MaxStride8Byte * 2) use_stride = MaxStride8Byte * 2 | ||
+ | count_remain = count | ||
+ | i = 1 | ||
+ | mpierr = 0 | ||
+ | Do While (count_remain .gt. use_stride) | ||
+ | Call MPI_Bcast(buffer(i: | ||
+ | If (mpierr .ne. 0 ) Then | ||
+ | Write(*,*) ' | ||
+ | Return | ||
+ | End If | ||
+ | count_remain = count_remain - use_stride | ||
+ | i = i + use_stride | ||
+ | End Do | ||
+ | If (count_remain .gt. 0) Then | ||
+ | Call MPI_Bcast(buffer(i: | ||
+ | If (mpierr .ne. 0 ) Then | ||
+ | Write(*,*) ' | ||
+ | Return | ||
+ | End If | ||
+ | End If | ||
+ | End Subroutine BroadcastI | ||
+ | |||
+ | Subroutine BroadcastR(buffer, | ||
+ | Use mpi_f08 | ||
+ | Implicit None | ||
+ | |||
+ | Type(MPI_Comm), | ||
+ | Integer, Intent(In) | ||
+ | Real, Dimension(*), | ||
+ | Integer, Intent(InOut) | ||
+ | Character(Len=255) | ||
+ | Integer | ||
+ | |||
+ | If (stride .le. 0) Then | ||
+ | Call GetEnv(' | ||
+ | Read(envvar, | ||
+ | If (use_stride .le. 0) use_stride = MaxStride8Byte * 2 | ||
+ | Else | ||
+ | use_stride = stride | ||
+ | End If | ||
+ | If (stride .gt. MaxStride8Byte * 2) use_stride = MaxStride8Byte * 2 | ||
+ | count_remain = count | ||
+ | i = 1 | ||
+ | mpierr = 0 | ||
+ | Do While (count_remain .gt. use_stride) | ||
+ | Call MPI_Bcast(buffer(i: | ||
+ | If (mpierr .ne. 0 ) Then | ||
+ | Write(*,*) ' | ||
+ | Return | ||
+ | End If | ||
+ | count_remain = count_remain - use_stride | ||
+ | i = i + use_stride | ||
+ | End Do | ||
+ | If (count_remain .gt. 0) Then | ||
+ | Call MPI_Bcast(buffer(i: | ||
+ | If (mpierr .ne. 0 ) Then | ||
+ | Write(*,*) ' | ||
+ | Return | ||
+ | End If | ||
+ | End If | ||
+ | End Subroutine BroadcastR | ||
+ | |||
+ | Subroutine BroadcastD(buffer, | ||
+ | Use mpi_f08 | ||
+ | Use, intrinsic :: iso_fortran_env | ||
+ | Implicit None | ||
+ | |||
+ | Type(MPI_Comm), | ||
+ | Integer, Intent(In) | ||
+ | Real(real64), | ||
+ | Integer, Intent(InOut) | ||
+ | Character(Len=255) | ||
+ | Integer | ||
+ | |||
+ | If (stride .le. 0) Then | ||
+ | Call GetEnv(' | ||
+ | Read(envvar, | ||
+ | If (use_stride .le. 0) use_stride = MaxStride8Byte | ||
+ | Else | ||
+ | use_stride = stride | ||
+ | End If | ||
+ | If (stride .gt. MaxStride8Byte) use_stride = MaxStride8Byte | ||
+ | count_remain = count | ||
+ | i = 1 | ||
+ | mpierr = 0 | ||
+ | Do While (count_remain .gt. use_stride) | ||
+ | Call MPI_Bcast(buffer(i: | ||
+ | If (mpierr .ne. 0 ) Then | ||
+ | Write(*,*) ' | ||
+ | Return | ||
+ | End If | ||
+ | count_remain = count_remain - use_stride | ||
+ | i = i + use_stride | ||
+ | End Do | ||
+ | If (count_remain .gt. 0) Then | ||
+ | Call MPI_Bcast(buffer(i: | ||
+ | If (mpierr .ne. 0 ) Then | ||
+ | Write(*,*) ' | ||
+ | Return | ||
+ | End If | ||
+ | End If | ||
+ | End Subroutine BroadcastD | ||
+ | |||
+ | End Module | ||
+ | </ | ||
+ | The '' | ||
+ | |||
+ | Using this module, the previous code is transformed to: | ||
+ | <code fortran> | ||
+ | Use mpi_utils | ||
+ | : | ||
+ | Allocate(Rmat(2, | ||
+ | : | ||
+ | Call BroadcastR(Rmat, | ||
+ | </ | ||
+ | The data is now broadcast as a 268435456-element chunk followed by a 20084484-element chunk, both of which are well below the 1.0 GiB limit associated with the OFI MTL. The efficiency of the MTL should be well in excess of the overhead involved in the "chunked" broadcast scheme. | ||