Differences
This shows you the differences between two versions of the page.
Next revision | Previous revision | ||
technical:whitepaper:darwin_ucx_openmpi [2021-02-12 16:18] – created frey | technical:whitepaper:darwin_ucx_openmpi [2021-02-12 17:15] (current) – frey | ||
---|---|---|---|
Line 48: | Line 48: | ||
</ | </ | ||
The '' | The '' | ||
+ | |||
+ | ===== Ongoing Errors ===== | ||
+ | |||
+ | With the MCA defaults outlined above, some applications were still seeing the issues that were accompanied by CREATE_MKEY kernel messages: | ||
+ | < | ||
+ | [Sat Feb 6 16:51:55 2021] infiniband mlx5_0: create_mkey_callback: | ||
+ | [Sat Feb 6 16:51:55 2021] mlx5_core 0000: | ||
+ | </ | ||
+ | The error number (-12) corresponds to '' | ||
+ | |||
+ | The Mellanox ConnectX-6 network interface breaks any //message// into a series of memory mappings of fixed size. The larger the memory region being communicated via RDMA, the more memory-mapping records are necessary. | ||
+ | - A large data transmission via RDMA is requested | ||
+ | - The Mellanox network interface' | ||
+ | - Data begins transmitting, | ||
+ | - Kernel messages are produced with each failed MTT addition request | ||
+ | - Once all MTT records have been added, the network interface processes the remainder while no kernel messages are produced | ||
+ | |||
+ | On occasion, the job did eventually crash or the node itself would encounter a kernel panic and go offline, so the issue was not merely a trivial annoyance. | ||
+ | |||
+ | Google searches eventually turned up a known bug in all Open MPI releases prior to 4.1.0 with respect to newer releases of UCX. The UCX library began promoting a newer API for creating memory keys (recall '' | ||
+ | |||
+ | ===== Older Open MPI Releases ===== | ||
+ | |||
+ | The UCX changes implemented in Open MPI 4.1.0 have not been backported to any of the older releases of the software, so this issue persists in those releases. | ||
+ | <code fortran> | ||
+ | Allocate(Rmat(2, | ||
+ | : | ||
+ | Call MPI_Bcast(Rmat, | ||
+ | </ | ||
+ | The array in question is 1154079760 bytes (slightly over 1.0 GiB). | ||
+ | |||
+ | The OpenFabrics Alliance produces a unified messaging library similar to UCX called OFI or libfabric. | ||
+ | |||
+ | Unfortunately, | ||
+ | < | ||
+ | Message size 1154079760 bigger than supported by selected transport. Max = 1073741824 | ||
+ | [r2x00: | ||
+ | </ | ||
+ | The size cited is exactly the size of the '' | ||
+ | |||
+ | So the implication is that Open MPI releases prior to 4.1.0 cannot internally handle an '' | ||
+ | |||
+ | ===== Chunked Broadcast ===== | ||
+ | |||
+ | Open MPI MTL does not yet implement " | ||
+ | |||
+ | <file fortran mpi_utils.f90> | ||
+ | Module mpi_utils | ||
+ | ! | ||
+ | ! This module implements MPI broadcast functions that break a buffer | ||
+ | ! that may be too large into multiple chunks. | ||
+ | ! to each of the subroutines dictates how many 8-byte elements should | ||
+ | ! be passed to each underlying MPI_Bcast(). | ||
+ | ! | ||
+ | ! If the < | ||
+ | ! is checked for an integer value. | ||
+ | ! exceeds the maximum threshold, the maximum threshold is used. The | ||
+ | ! maximum equates to 1 GiB worth of elements. | ||
+ | ! | ||
+ | Implicit None | ||
+ | | ||
+ | Private | ||
+ | | ||
+ | Public :: BroadcastI, BroadcastR, BroadcastD | ||
+ | |||
+ | Integer, Parameter :: MaxStride8Byte = 134217728 | ||
+ | | ||
+ | Contains | ||
+ | |||
+ | Subroutine BroadcastI(buffer, | ||
+ | Use mpi_f08 | ||
+ | Implicit None | ||
+ | |||
+ | Type(MPI_Comm), | ||
+ | Integer, Intent(In) | ||
+ | Integer, Dimension(*), | ||
+ | Integer, Intent(InOut) | ||
+ | Character(Len=255) | ||
+ | Integer | ||
+ | |||
+ | If (stride .le. 0) Then | ||
+ | Call GetEnv(' | ||
+ | Read(envvar, | ||
+ | If (use_stride .le. 0) use_stride = MaxStride8Byte * 2 | ||
+ | Else | ||
+ | use_stride = stride | ||
+ | End If | ||
+ | If (stride .gt. MaxStride8Byte * 2) use_stride = MaxStride8Byte * 2 | ||
+ | count_remain = count | ||
+ | i = 1 | ||
+ | mpierr = 0 | ||
+ | Do While (count_remain .gt. use_stride) | ||
+ | Call MPI_Bcast(buffer(i: | ||
+ | If (mpierr .ne. 0 ) Then | ||
+ | Write(*,*) ' | ||
+ | Return | ||
+ | End If | ||
+ | count_remain = count_remain - use_stride | ||
+ | i = i + use_stride | ||
+ | End Do | ||
+ | If (count_remain .gt. 0) Then | ||
+ | Call MPI_Bcast(buffer(i: | ||
+ | If (mpierr .ne. 0 ) Then | ||
+ | Write(*,*) ' | ||
+ | Return | ||
+ | End If | ||
+ | End If | ||
+ | End Subroutine BroadcastI | ||
+ | |||
+ | Subroutine BroadcastR(buffer, | ||
+ | Use mpi_f08 | ||
+ | Implicit None | ||
+ | |||
+ | Type(MPI_Comm), | ||
+ | Integer, Intent(In) | ||
+ | Real, Dimension(*), | ||
+ | Integer, Intent(InOut) | ||
+ | Character(Len=255) | ||
+ | Integer | ||
+ | |||
+ | If (stride .le. 0) Then | ||
+ | Call GetEnv(' | ||
+ | Read(envvar, | ||
+ | If (use_stride .le. 0) use_stride = MaxStride8Byte * 2 | ||
+ | Else | ||
+ | use_stride = stride | ||
+ | End If | ||
+ | If (stride .gt. MaxStride8Byte * 2) use_stride = MaxStride8Byte * 2 | ||
+ | count_remain = count | ||
+ | i = 1 | ||
+ | mpierr = 0 | ||
+ | Do While (count_remain .gt. use_stride) | ||
+ | Call MPI_Bcast(buffer(i: | ||
+ | If (mpierr .ne. 0 ) Then | ||
+ | Write(*,*) ' | ||
+ | Return | ||
+ | End If | ||
+ | count_remain = count_remain - use_stride | ||
+ | i = i + use_stride | ||
+ | End Do | ||
+ | If (count_remain .gt. 0) Then | ||
+ | Call MPI_Bcast(buffer(i: | ||
+ | If (mpierr .ne. 0 ) Then | ||
+ | Write(*,*) ' | ||
+ | Return | ||
+ | End If | ||
+ | End If | ||
+ | End Subroutine BroadcastR | ||
+ | |||
+ | Subroutine BroadcastD(buffer, | ||
+ | Use mpi_f08 | ||
+ | Use, intrinsic :: iso_fortran_env | ||
+ | Implicit None | ||
+ | |||
+ | Type(MPI_Comm), | ||
+ | Integer, Intent(In) | ||
+ | Real(real64), | ||
+ | Integer, Intent(InOut) | ||
+ | Character(Len=255) | ||
+ | Integer | ||
+ | |||
+ | If (stride .le. 0) Then | ||
+ | Call GetEnv(' | ||
+ | Read(envvar, | ||
+ | If (use_stride .le. 0) use_stride = MaxStride8Byte | ||
+ | Else | ||
+ | use_stride = stride | ||
+ | End If | ||
+ | If (stride .gt. MaxStride8Byte) use_stride = MaxStride8Byte | ||
+ | count_remain = count | ||
+ | i = 1 | ||
+ | mpierr = 0 | ||
+ | Do While (count_remain .gt. use_stride) | ||
+ | Call MPI_Bcast(buffer(i: | ||
+ | If (mpierr .ne. 0 ) Then | ||
+ | Write(*,*) ' | ||
+ | Return | ||
+ | End If | ||
+ | count_remain = count_remain - use_stride | ||
+ | i = i + use_stride | ||
+ | End Do | ||
+ | If (count_remain .gt. 0) Then | ||
+ | Call MPI_Bcast(buffer(i: | ||
+ | If (mpierr .ne. 0 ) Then | ||
+ | Write(*,*) ' | ||
+ | Return | ||
+ | End If | ||
+ | End If | ||
+ | End Subroutine BroadcastD | ||
+ | | ||
+ | End Module | ||
+ | </ | ||
+ | The '' | ||
+ | |||
+ | Using this module, the previous code is transformed to: | ||
+ | <code fortran> | ||
+ | Use mpi_utils | ||
+ | : | ||
+ | Allocate(Rmat(2, | ||
+ | : | ||
+ | Call BroadcastR(Rmat, | ||
+ | </ | ||
+ | The data is now broadcast as a 268435456-element chunk followed by a 20084484-element chunk, both of which are well below the 1.0 GiB limit associated with the OFI MTL. The efficiency of MTL should be well in excess of the overhead involved in the " |