
% Journal article introducing the MulticoreBSP for C library (IJPP 42(4), 2014).
% Issue number is stored in the standard `number' field so styles print it.
@article{yzelman14b,
  author    = {Yzelman, A. N. and Bisseling, R. H. and Roose, D. and Meerbergen, K.},
  title     = {{MulticoreBSP for C}: a high-performance library for shared-memory parallel programming},
  journal   = {International Journal of Parallel Programming},
  volume    = {42},
  number    = {4},
  pages     = {619--642},
  year      = {2014},
  publisher = {Springer},
  address   = {Berlin, Germany},
  issn      = {0885-7458},
  doi       = {10.1007/s10766-013-0262-9},
  keywords  = {High-performance computing; Bulk synchronous parallel; Shared-memory parallel programming; Software library; Fast Fourier transform; Sparse matrix--vector multiplication},
  abstract  = {The bulk synchronous parallel ({BSP}) model, as well as parallel programming interfaces based on {BSP}, classically target distributed-memory parallel architectures. In earlier work, Yzelman and Bisseling designed a {MulticoreBSP} for Java library specifically for shared-memory architectures. In the present article, we further investigate this concept and introduce the new high-performance {MulticoreBSP for C} library. Among other features, this library supports nested {BSP} runs. We show that existing {BSP} software performs well regardless whether it runs on distributed-memory or shared-memory architectures, and show that applications in {MulticoreBSP} can attain high-performance results. The paper details implementing the {Fast Fourier Transform} and the sparse matrix--vector multiplication in {BSP}, both of which outperform state-of-the-art implementations written in other shared-memory parallel programming interfaces. We furthermore study the applicability of {BSP} when working on highly non-uniform memory access architectures.},
}

