Implementation Techniques for SPMD Kernels on CPUs
Meyer, J., Alpay, A., Hack, S., Fröning, H. and Heuveline, V.
Proceedings of the 2023 International Workshop on OpenCL, Association for Computing Machinery, 2023.
[doi]
[url]
[pdf]
[bib]
@inproceedings{Meyer:IWOCL23:10.1145/3585341.3585342,
  author    = {Meyer, Joachim and Alpay, Aksel and Hack, Sebastian and Fr{\"o}ning, Holger and Heuveline, Vincent},
  title     = {Implementation Techniques for {SPMD} Kernels on {CPUs}},
  year      = {2023},
  isbn      = {9798400707452},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  url       = {https://doi.org/10.1145/3585341.3585342},
  webpdf    = {https://dl.acm.org/doi/pdf/10.1145/3585341.3585342},
  doi       = {10.1145/3585341.3585342},
  abstract  = {More and more frameworks and simulations are developed using heterogeneous programming models such as OpenCL, SYCL, CUDA, or HIP. A significant hurdle to mapping these models to CPUs in a performance-portable manner is that implementing work-group barriers for such kernels requires providing forward-progress guarantees so that all work-items can reach the barrier. This work provides guidance for implementations of single-program multiple-data (SPMD) programming models, such as OpenCL, SYCL, CUDA, or HIP, on non-SPMD devices, such as CPUs. We discuss the trade-offs of multiple approaches to handling work-group-level barriers. We present our experience with the integration of two known compiler-based approaches for low-overhead work-group synchronization on CPUs. Thereby we discuss a general design flaw in deep loop fission approaches, as used in the popular Portable Computing Language (PoCL) project, that makes them miscompile certain kernels. For our evaluation, we integrate PoCL’s “loopvec” kernel compiler into hipSYCL and implement continuation-based synchronization (CBS) in the same. We compare both against hipSYCL’s library-only fiber implementation using diverse hardware: we use recent AMD Rome and Intel Icelake server CPUs but also two Arm server CPUs, namely Fujitsu’s A64FX and Marvell’s ThunderX2. We show that compiler-based approaches outperform library-only implementations by up to multiple orders of magnitude. Further, we adapt our CBS implementation into PoCL and compare it against its loopvec approach in both, PoCL and hipSYCL. We find that our implementation of CBS, while being more general than PoCL’s approach, gives comparable performance in PoCL and even surpasses it in hipSYCL. Therefore we recommend its use in general.},
  booktitle = {Proceedings of the 2023 International Workshop on {OpenCL}},
  articleno = {1},
  numpages  = {12},
  keywords  = {CPU, performance portability, barriers, OpenCL, compilation, synchronization, heterogeneous computing, SYCL},
  location  = {Cambridge, United Kingdom},
  series    = {IWOCL '23}
}
Evaluation of Modern GPGPU Technologies for Image Processing
Meyer, J.
Proceedings of the International Workshop on OpenCL, Association for Computing Machinery, 2020.
[doi]
[url]
[bib]
@inproceedings{Meyer:IWOCL20:10.1145/3388333.3388645,
  author    = {Meyer, Joachim},
  title     = {Evaluation of Modern {GPGPU} Technologies for Image Processing},
  year      = {2020},
  isbn      = {9781450375313},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  url       = {https://doi.org/10.1145/3388333.3388645},
  doi       = {10.1145/3388333.3388645},
  booktitle = {Proceedings of the International Workshop on {OpenCL}},
  articleno = {2},
  numpages  = {2},
  keywords  = {CUDA, Vulkan, SYCL, OpenCL, Image Processing, GPGPU, Evaluation},
  location  = {Munich, Germany},
  series    = {IWOCL '20}
}