% Conference paper (ICPPW 2014 workshop proceedings), MUST runtime error detection tool.
% Acronyms in title/booktitle are brace-protected so sentence-casing styles keep them.
% Author names use the unambiguous "Last, First" form; the compound surname stays braced.
% The percent sign in the abstract is escaped so styles that print abstracts do not
% treat the rest of the line as a LaTeX comment.
% NOTE(review): year/month/day say 2015-05-07, while the note field gives a 2014
% copyright and conference dates 09-09-2014 through 12-09-2014 -- confirm which year
% citations should show before changing it.
@inproceedings{6266b722a7744b7fb349d8a008dc98f7,
  author    = {Protze, Joachim and Hilbrich, Tobias and Schulz, Martin and {De Supinski}, Bronis R. and Nagel, Wolfgang E. and M{\"u}ller, Matthias S.},
  title     = {{MPI} Runtime Error Detection with {MUST}: A Scalable and Crash-Safe Approach},
  booktitle = {Proceedings - 43rd International Conference on Parallel Processing Workshops, {ICPPW} 2014},
  series    = {Proceedings of the International Conference on Parallel Processing Workshops},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  pages     = {206--215},
  year      = {2015},
  month     = may,
  day       = {7},
  doi       = {10.1109/ICPPW.2014.37},
  language  = {English},
  keywords  = {MPI, crash safe, debugging, detection},
  abstract  = {The Message Passing Interface (MPI) is a widely used paradigm for distributed memory programming. Implementations of this interface are designed for good performance rather than on usability extensions that enforce their correct use. Runtime MPI usage error detection tools aid application developers in the correct use of this interface. Since usage errors can cause failures that lead to an application crash, it is crucial that runtime error detection tools employ techniques that allow them to finish all of their correctness checks. This includes situations in which the application is interrupted by the MPI library, due to an incorrect function call, and operating system signals after fatal errors like division by zero or faulty memory accesses. We present an approach that uses an alternative tool communication means along with signal and error handling capabilities. A study of the assumptions that enable this approach details its applicability for different use cases and compares it to less efficient schemes that rely on synchronous processing and/or communication. Additionally, we enable bandwidth efficient communication with a scalable propagation technique that raises the awareness of an application crash within the tool. An application study with the SPEC MPI2007 benchmark suite demonstrates the applicability of our approach for up to 2,048 processes. Overhead measurements underline that our application crash handling increases the runtime of our runtime error detection tool by only 4\% in average.},
  note      = {Publisher Copyright: {\textcopyright} 2014 IEEE.; 43rd International Conference on Parallel Processing Workshops, ICPPW 2014 ; Conference date: 09-09-2014 Through 12-09-2014},
}