
@inproceedings{boehnlein24a,
author = {B\"{o}hnlein, Toni and Papp, P\'{a}l Andr\'{a}s and Yzelman, A. N.},
title = {Brief Announcement: Red-Blue Pebbling with Multiple Processors: Time, Communication and Memory Trade-offs},
year = {2024},
isbn = {9798400704161},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3626183.3660269},
doi = {10.1145/3626183.3660269},
abstract = {The well-studied red-blue pebble game models the execution of an arbitrary computational DAG by a single processor over a two-level memory hierarchy. We present a natural generalization to a multiprocessor setting where each processor has its own limited fast memory, and all processors share unlimited slow memory. To our knowledge, this is the first thorough study that combines pebbling and DAG scheduling problems, capturing the computation of general workloads on multiple processors with memory constraints and communication costs. Our pebbling model enables us to analyze trade-offs between workload balancing, communication and memory limitations, and it captures real-world factors such as superlinear speedups due to parallelization. Our results include upper and lower bounds on the pebbling cost, an analysis of a greedy pebbling strategy, and an extension of NP-hardness results for specific DAG classes from simpler models. For our main technical contribution, we show two inapproximability results that already hold for the long-standing problem of standard red-blue pebbling: (i) the optimal I/O cost cannot be approximated to any finite factor, and (ii) the optimal total cost (I/O+computation) can only be approximated to a limited constant factor, i.e., it does not allow for a polynomial-time approximation scheme. These results also carry over naturally to our multiprocessor pebbling model.},
booktitle = {Proceedings of the 36th ACM Symposium on Parallelism in Algorithms and Architectures},
pages = {285–287},
numpages = {3},
keywords = {approximation, communication costs, limited memory, parallel computing, red-blue pebble game, scheduling},
location = {<conf-loc>, <city>Nantes</city>, <country>France</country>, </conf-loc>},
series = {SPAA '24}
}


@misc{pasadakis23a,
 author="Pasadakis, Dimosthenis and Schenk, Olaf and Vla\v{c}i\'{c}, Verner and Yzelman, A. N.",
 title="Nonlinear spectral clustering with {C++} {GraphBLAS}",
 note="Pre-print",
 year={2023},
 pages=2
}


@misc{suijlen19,
 author = {Suijlen, Wijnand and Yzelman, A. N.},
 title = {{L}ightweight {P}arallel {F}oundations: a model-compliant communication layer},
 year = {2019},
 eprint = {1906.03196},
 eprinttype = {arXiv},
 archivePrefix = {arXiv},
 primaryClass = {cs.DC},
 url={https://arxiv.org/abs/1906.03196}
}


@phdthesis{yzelman11b,
 author = "A. N. Yzelman",
 title = "Fast sparse matrix-vector multiplication by partitioning and reordering",
 school = "{U}trecht {U}niversity",
 address = "{U}trecht, the {N}etherlands",
 month = "October",
 year = 2011
}


@misc{yzelman20,
 author = "Yzelman, A. N. and Di Nardo, D. and Nash, J. M. and Suijlen, W. J.",
 title = "A {C++} {GraphBLAS}: specification, implementation, parallelisation, and evaluation",
 year = "2020",
 keywords = "graph algorithms",
 keywords = "linear algebra",
 keywords = "C++",
 keywords = "data-centric",
 keywords = "auto-parallelisation",
 keywords = "auto-vectorisation",
 keywords = "high-performance computing",
 keywords = "shared-memory",
 keywords = "distributed-memory",
 keywords = "cost modelling",
 abstract = "The {GraphBLAS} is a programming model that expresses graph algorithms in linear algebraic terms. It takes an easy-to-use, data-centric view where algebraic operations execute over sparse or dense vectors and sparse matrices, parametrised in the algebra the computation should proceed with. We present a {C++ GraphBLAS} interface designed with both shared- and distributed-memory systems in mind. Our specification employs {STL}-compatible generic programming principles. Templates and type traits allow for enhanced compile-time error handling and optimisation, even for user-defined types and operators. Our specification guides algorithm designers towards writing performant code by attaching performance semantics to each operation.

We detail a reference implementation that auto-vectorises and auto-parallelises user programs for sequential as well as shared- and distributed-memory parallel machines. We demonstrate the use of our {API} via two canonical graph algorithms, the k-nearest neighbourhood and {PageRank}. Weak and strong scaling experiments using real-world datasets with up to 42 billion edges show that the defined performance semantics are achievable in practice and capture key performance characteristics of the two selected algorithms. Microbenchmarks and throughput estimates from the PageRank algorithm on real-world graphs furthermore show that our reference implementation can extract near peak performance on different architectures.",
 url = "\url{http://albert-jan.yzelman.net/PDFs/yzelman20.pdf}"
}


@inproceedings{yzelman15a,
 author = "Yzelman, A. N.",
 title = "Generalised vectorisation for sparse matrix--vector multiplication",
 year = "2015",
 isbn = {9781450340014},
 publisher = {Association for Computing Machinery},
 address = {New York, NY, USA},
 doi = {10.1145/2833179.2833185},
 articleno = {6},
 numpages = {8},
 booktitle = {Proceedings of the 5th Workshop on Irregular Applications: Architectures and Algorithms},
 series = {IA3 '15},
 keywords = {cache-oblivious algorithms, segmented reduce, CRS, shared-memory parallel, CSR},
 keywords = "Sparse matrix--vector multiplication",
 keywords = "shared-memory parallelism",
 keywords = "cache-oblivious",
 keywords = "vectorisation",
 keywords = "{SIMD}",
 keywords = "high-performance computing",
 keywords = "{Intel Xeon Phi}",
 keywords ="Compressed Row Storage",
 keywords = "{ELLPACK}",
 keywords = "Compressed Sparse Rows",
 keywords = "Blocked {CRS}",
 keywords = "segmented scan",
 abstract = "This work generalises the various ways in which a sparse matrix--vector (SpMV) multiplication can be vectorised. It arrives at a novel data structure that generalises three earlier well-known data structures for sparse computations: the Blocked CRS format, the (sliced) ELLPACK format, and segmented scan based formats.
The new data structure is relevant for sparse computations on modern architectures, since efficient use of new hardware requires the use of increasingly wide vector registers. Normally, the use of vectorisation for sparse computations is limited due to bandwidth constraints. In cases where computations are limited by memory latencies instead of memory bandwidth, however, vectorisation can still help performance. The Intel Xeon Phi, appearing as a component in several top-500 supercomputers, displays exactly this behaviour for SpMV multiplication.
On this architecture, the use of the new generalised vectorisation scheme increases performance up to 178 percent."
}


@article{karypis_et_al:DagRep.13.8.1,
  author =	{Karypis, George and Schulz, Christian and Strash, Darren and Ajwani, Deepak and Bisseling, Rob H. and Casel, Katrin and \c{C}ataly\"{u}rek, \"{U}mit V. and Chevalier, C\'{e}dric and Chudigiewitsch, Florian and Faraj, Marcelo Fonseca and Fellows, Michael and Gottesb\"{u}ren, Lars and Heuer, Tobias and Kaya, Kamer and Lacki, Jakub and Langguth, Johannes and Li, Xiaoye Sherry and Mayer, Ruben and Meintrup, Johannes and Mizutani, Yosuke and Pellegrini, Fran\c{c}ois and Petrini, Fabrizio and Rosamond, Frances and Safro, Ilya and Schlag, Sebastian and Sharma, Roohani and Sullivan, Blair D. and U\c{c}ar, Bora and Yzelman, Albert-Jan N.},
  title =	{{Recent Trends in Graph Decomposition (Dagstuhl Seminar 23331)}},
  pages =	{1--45},
  journal =	{Dagstuhl Reports},
  ISSN =	{2192-5283},
  year =	{2024},
  volume =	{13},
  number =	{8},
  editor =	{Karypis, George and Schulz, Christian and Strash, Darren},
  publisher =	{Schloss Dagstuhl -- Leibniz-Zentrum f{\"u}r Informatik},
  address =	{Dagstuhl, Germany},
  URL =		{https://drops.dagstuhl.de/entities/document/10.4230/DagRep.13.8.1},
  URN =		{urn:nbn:de:0030-drops-198114},
  doi =		{10.4230/DagRep.13.8.1},
  annote =	{Keywords: combinatorial optimization, experimental algorithmics, parallel algorithms}
}


@incollection{yzelman12b,
 author = "Yzelman, A. N. and Bisseling, R. H.",
 title = "A Cache-Oblivious Sparse Matrix--Vector Multiplication Scheme Based on the {H}ilbert Curve",
 booktitle = "Progress in Industrial Mathematics at ECMI 2010",
 series = "Mathematics in Industry",
 editor = {G\"unther, Michael and Bartel, Andreas and Brunk, Markus and Sch\"ops, Sebastian and Striebel, Michael},
 pages = "627-633",
 isbn = "978-3-642-25099-6",
 url = "http://dx.doi.org/10.1007/978-3-642-25100-9_73",
 doi = "10.1007/978-3-642-25100-9_73",
 publisher = "Springer",
 address = {Berlin, Germany},
 language = "English",
 year = "2012",
 abstract = "The sparse matrix–vector (SpMV) multiplication is an important kernel in many applications. When the sparse matrix used is unstructured, however, standard SpMV multiplication implementations typically are inefficient in terms of cache usage, sometimes working at only a fraction of peak performance. Cache-aware algorithms take information on specifics of the cache architecture as a parameter to derive an efficient SpMV multiply. In contrast, cache-oblivious algorithms strive to obtain efficiency regardless of cache specifics. In earlier work in this latter area, Haase et al. (2007) use the Hilbert curve to order nonzeroes in the sparse matrix. They obtain speedup mainly when multiplying against multiple (up to eight) right-hand sides simultaneously. We improve on this by introducing a new datastructure, called Bi-directional Incremental Compressed Row Storage (BICRS). Using this datastructure to store the nonzeroes in Hilbert order, speedups of up to a factor two are attained for the SpMV multiplication y = Ax on sufficiently large, unstructured matrices."
}


@misc{yzelman15b,
 author = "Yzelman, A. N.",
 title = "High performance sparse computations applied to a parallel conjugate gradient solver",
 year = "2015",
 keywords = "Sparse matrix--vector multiplication",
 keywords = "shared-memory parallelism",
 keywords = "cache-oblivious",
 keywords = "sparse matrix partitioning",
 keywords = "matrix reordering",
 keywords = "Conjugate Gradient",
 keywords = "high-performance computing",
 keywords = "{NUMA} architectures",
 abstract = "Many advanced sparse matrix storage schemes have appeared in literature, demonstrating large gains in efficiency for basic operations such as the sparse matrix--vector ({SpMV}) multiplication. One class of methods, based on sparse matrix partitioning and reordering, sparse blocking, compression, and cache-oblivious nonzero traversals, gained two to three factors of increased performance on single {SpMV} multiplications, but had yet to be used in practice. This work explores its use within a {Conjugate Gradients (CG)} iterative solver for linear systems, and analyses the resulting algorithm using the {Bulk Synchronous Parallel (BSP)} paradigm. It is implemented using {C++11} and the high-performance {MulticoreBSP for C} programming framework, and tested on real-world problems on a shared-memory dual-socket architecture. Its performance is compared against {Blaze}, a {C++} library for high-performance numerical linear algebra. The {CG} solver developed using the {BSP} paradigm and advanced sparse computation techniques results in a total speedup of almost 10x, close to a 3x performance increase over {Blaze}.",
 url = "\url{http://albert-jan.yzelman.net/PDFs/yzelman15b-pp.pdf}"
}


@article{yzelman14b,
 author = "Yzelman, A. N. and Bisseling, R. H. and Roose, D. and Meerbergen, K.",
 title  = "{MulticoreBSP for C}: a high-performance library for shared-memory parallel programming",
 journal = "International Journal on Parallel Programming",
 volume = 42,
 issue = 4,
 year = "2014",
 pages = {619--642},
 publisher = "Springer",
 address = {Berlin, Germany},
 issn = {0885-7458},
 doi = {10.1007/s10766-013-0262-9},
 keywords = {High-performance computing; Bulk synchronous parallel; Shared-memory parallel programming; Software library; Fast Fourier transform; Sparse matrix–vector multiplication},
 abstract = "The bulk synchronous parallel ({BSP}) model, as well as parallel programming interfaces based on {BSP}, classically target distributed-memory parallel architectures. In earlier work, Yzelman and Bisseling designed a {MulticoreBSP} for Java library specifically for shared-memory architectures. In the present article, we further investigate this concept and introduce the new high-performance {MulticoreBSP for C} library. Among other features, this library supports nested {BSP} runs. We show that existing {BSP} software performs well regardless whether it runs on distributed-memory or shared-memory architectures, and show that applications in {MulticoreBSP} can attain high-performance results. The paper details implementing the {Fast Fourier Transform} and the sparse matrix--vector multiplication in {BSP}, both of which outperform state-of-the-art implementations written in other shared-memory parallel programming interfaces. We furthermore study the applicability of {BSP} when working on highly non-uniform memory access architectures."
}


@inproceedings{mastoras23a,
author = {Mastoras, Aristeidis and Yzelman, Albert-Jan N.},
title = {Studying the Expressiveness and Performance of Parallelization Abstractions for Linear Pipelines},
year = {2023},
isbn = {9798400701153},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3582514.3582522},
doi = {10.1145/3582514.3582522},
abstract = {Semi-automatic parallelization provides abstractions that simplify the programming effort and allow the user to make decisions that cannot be made by tools. However, abstractions for general-purpose systems usually do not carry sufficient knowledge about the structure of the program, and thus parallelization with them may lead to poor performance.In this paper, we present a popular class of programs, called linear pipelines, that cannot be easily and efficiently parallelized with general-purpose abstractions. We discuss the difficulties and inefficiencies of parallelizing linear pipelines with general-purpose abstractions, and we explain how pattern-specific abstractions overcome these problems. We present the properties of linear pipelines that should be described with pattern-specific abstractions and how these properties are exploited by the state of the art. In addition, we discuss the importance of exposing the performance parameters and how they are combined by pattern-specific knowledge. We claim that designing pattern-specific abstractions for general-purpose programming models is one way to simplify parallel programming and improve performance without sacrificing any expressive power. Consequently, we propose possible pattern-specific extensions to general-purpose parallel programming models, e.g., {OpenMP}, to support easy and efficient parallelization of linear pipelines.},
booktitle = {Proceedings of the 14th International Workshop on Programming Models and Applications for Multicores and Manycores},
pages = {29–38},
numpages = {10},
keywords = {linear pipelines, parallel programming models, performance parameters, parallelization abstractions, pattern-specific knowledge},
location = {Montreal, QC, Canada},
series = {PMAM'23}
}


@inproceedings{pawlowski20a,
 author = "Paw\l{l}owski, F. and U\c{c}ar, B. and Yzelman, A. N.",
 title = "High Performance Tensor--Vector Multiplication on Shared-Memory Systems",
 year = "2020",
 editor = "Wyrzykowski, Roman and Deelman, Ewa and Dongarra, Jack and Karczewski, Konrad",
 booktitle = "Parallel Processing and Applied Mathematics ({PPAM}) 2019",
 series = "Lecture Notes in Computer Science",
 volume = "12043",
 publisher = {Springer International Publishing},
 address = {New York, NY, USA},
 pages="38--48",
 isbn="978-3-030-43229-4",
 abstract="Tensor--vector multiplication is one of the core components in tensor computations. We have recently investigated high performance, single core implementation of this bandwidth-bound operation. Here, we investigate its efficient, shared-memory implementations. Upon carefully analyzing the design space, we implement a number of alternatives using OpenMP and compare them experimentally. Experimental results on up to 8 socket systems show near peak performance for the proposed algorithms."
}


@article{mastoras22a,
author = {Mastoras, Aristeidis and Anagnostidis, Sotiris and Yzelman, A. N.},
title = {Design and Implementation for Nonblocking Execution in {GraphBLAS}: Tradeoffs and Performance},
year = {2023},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
volume = 20,
number = 1,
articleno = 6,
numpages = 23,
issn = {1544-3566},
url = {https://doi.org/10.1145/3561652},
doi = {10.1145/3561652},
abstract = {{GraphBLAS} is a recent standard that allows the expression of graph algorithms in the language of linear algebra and enables automatic code parallelization and optimization. GraphBLAS operations are memory bound and may benefit from data locality optimizations enabled by nonblocking execution. However, nonblocking execution remains under-evaluated. In this article, we present a novel design and implementation that investigates nonblocking execution in GraphBLAS. Lazy evaluation enables runtime optimizations that improve data locality, and dynamic data dependence analysis identifies operations that may reuse data in cache. The nonblocking execution of an arbitrary number of operations results in dynamic parallelism, and the performance of the nonblocking execution depends on two parameters, which are automatically determined, at run-time, based on a proposed analytic model. The evaluation confirms the importance of nonblocking execution for various matrices of three algorithms, by showing up to 4.11 \texttimes{} speedup over blocking execution as a result of better cache utilization. The proposed analytic model makes the nonblocking execution reach up to 5.13 \texttimes{} speedup over the blocking execution. The fully automatic performance is very close to that obtained by using the best manual configuration for both small and large matrices. Finally, the evaluation includes a comparison with other state-of-the-art frameworks for numerical linear algebra programming that employ parallel execution and similar optimizations to those discussed in this work, and the presented nonblocking execution reaches up to 16.1 \texttimes{} speedup over the state-of-the-art.},
journal = {ACM Trans. Archit. Code Optim.},
month = {March},
keywords = {data locality, nonblocking execution, loop fusion, loop tiling, dynamic data dependence analysis, dynamic parallelism, analytic performance model, lazy evaluation, GraphBLAS}
}


@article{yzelman11a,
 author = "Yzelman, A. N. and Bisseling, Rob H.",
 title = "Two-dimensional cache-oblivious sparse matrix--vector multiplication",
 journal = "Parallel Computing",
 volume = "37",
 number = "12",
 pages = "806 - 819",
 issn = "0167-8191",
 YEAR = 2011,
 publisher = {Elsevier},
 address = {Amsterdam, The Netherlands},
 url = "http://www.sciencedirect.com/science/article/pii/S0167819111001062",
 doi = "10.1016/j.parco.2011.08.004",
 keywords = "Matrix--vector multiplication",
 keywords = "Sparse matrix",
 keywords = "Parallel computing",
 keywords = "Recursive bipartitioning",
 keywords = "Fine-grain",
 keywords = "Cache-oblivious",
 abstract = "In earlier work, we presented a one-dimensional cache-oblivious sparse matrix–vector (SpMV) multiplication scheme which has its roots in one-dimensional sparse matrix partitioning. Partitioning is often used in distributed-memory parallel computing for the SpMV multiplication, an important kernel in many applications. A logical extension is to move towards using a two-dimensional partitioning. In this paper, we present our research in this direction, extending the one-dimensional method for cache-oblivious SpMV multiplication to two dimensions, while still allowing only row and column permutations on the sparse input matrix. This extension requires a generalisation of the compressed row storage data structure to a block-based data structure, for which several variants are investigated. Experiments performed on three different architectures show further improvements of the two-dimensional method compared to the one-dimensional method, especially in those cases where the one-dimensional method already provided significant gains. The largest gain obtained by our new reordering is over a factor of 3 in SpMV speed, compared to the natural matrix ordering."
}


@article{yzelman14a,
 author = "Yzelman, A. N. and Roose, D.",
 title = "High-level strategies for parallel shared-memory sparse matrix--vector multiplication",
 journal = "{IEEE} Transactions on Parallel and Distributed Systems",
 year = "2014",
 publisher = "{IEEE} Computer Society",
 address = "Los Alamitos, {CA}, {USA}",
 doi = "10.1109/TPDS.2013.31",
 volume = "25",
 number = "1",
 pages = "116-125",
 issn = "1045-9219",
 keywords = "Sparse matrix--vector multiplication",
 keywords = "shared-memory parallelism",
 keywords = "cache-oblivious",
 keywords = "sparse matrix partitioning",
 keywords = "matrix reordering",
 keywords = "Hilbert space-filling curve",
 keywords = "high-performance computing",
 keywords = "{NUMA} architectures",
 abstract = "The sparse matrix--vector multiplication is an important kernel, but is hard to efficiently execute even in the sequential case. The problems-- namely low arithmetic intensity, inefficient cache use, and limited memory bandwidth-- are magnified as the core count on shared-memory parallel architectures increases. Existing techniques are discussed in detail, and categorised chiefly based on their distribution types. Based on this new parallelisation techniques are proposed. The theoretical scalability and memory usage of the various strategies are analysed, and experiments on multiple NUMA architectures confirm the validity of the results. One of the newly proposed methods attains the best average result in experiments, in one of the experiments obtaining a parallel efficiency of 90 percent."
}


@inproceedings{papp24a,
author = {Papp, P\'{a}l Andr\'{a}s and Anegg, Georg and Karanasiou, Aikaterini and Yzelman, A. N.},
title = {Efficient Multi-Processor Scheduling in Increasingly Realistic Models},
year = {2024},
isbn = {9798400704161},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3626183.3659972},
doi = {10.1145/3626183.3659972},
abstract = {We study the problem of efficiently scheduling a computational DAG on multiple processors. The majority of previous works have developed and compared algorithms for this problem in relatively simple models; in contrast to this, we analyze this problem in a more realistic model that captures many real-world aspects, such as communication costs, synchronization costs, and the hierarchical structure of modern processing architectures. For this we extend the well-established BSP model of parallel computing with non-uniform memory access (NUMA) effects. We then develop a range of new scheduling algorithms to minimize the scheduling cost in this more complex setting: several initialization heuristics, a hill-climbing local search method, and several approaches that formulate (and solve) the scheduling problem as an Integer Linear Program (ILP). We combine these algorithms into a single framework, and conduct experiments on a diverse set of real-world computational DAGs to show that the resulting scheduler significantly outperforms both academic and practical baselines. In particular, even without NUMA effects, our scheduler finds solutions of 24\%-44\% smaller cost on average than the baselines, and in case of NUMA effects, it achieves up to a factor 2.5\texttimes{} improvement compared to the baselines. Finally, we also develop a multilevel scheduling algorithm, which provides up to almost a factor 5\texttimes{} improvement in the special case when the problem is dominated by very high communication costs.},
booktitle = {Proceedings of the 36th ACM Symposium on Parallelism in Algorithms and Architectures},
pages = {463–474},
numpages = {12},
keywords = {bsp model, dag scheduling, integer linear programming, multilevel scheduling},
location = {<conf-loc>, <city>Nantes</city>, <country>France</country>, </conf-loc>},
series = {SPAA '24}
}


@incollection{biss12,
 AUTHOR = "Bisseling, R. H. and Fagginger Auer, B. O. and Yzelman, A. N. and van Leeuwen, T. and {\c{C}}ataly{\"{u"}}rek, {\"{U}}. V.",
 TITLE = "Two-dimensional approaches to sparse matrix partitioning",
 BOOKTITLE = "Combinatorial Scientific Computing",
 EDITOR = "Naumann, U. and Schenk, O.",
 CHAPTER = "4",
 PAGES = "95--127",
 PUBLISHER = {CRC Press},
 ADDRESS = "{B}oca {R}aton, {FL}, {USA}",
 ISBN = "978-1-439-82735-2",
 URL = "http://www.crcpress.com/product/isbn/9781439827352",
 YEAR = 2012
}


@mastersthesis{yzelman07b,
 author = "Yzelman, A. N.",
 title = "{R}-Trees: an efficient structure for spatial data management",
 school = "Utrecht University",
 address = "Utrecht, the Netherlands",
 month = "August",
 year = "2007"
}


@misc{papp23a,
  doi = {10.48550/ARXIV.2303.05989},
  url = {https://arxiv.org/abs/2303.05989},
  author = {Papp, P\'{a}l Andr\'{a}s and Anegg, Georg and Yzelman, A. N.},
  keywords = {Computational Complexity (cs.CC), FOS: Computer and information sciences, FOS: Computer and information sciences, G.2.2; F.2.2, 68Q17, 68Q85, 05C20},
  title = {{DAG} Scheduling in the {BSP} Model},
  publisher = {arXiv},
  year = {2023},
  copyright = {arXiv.org perpetual, non-exclusive license}
}


@misc{boehnlein24b,
      title={Red-Blue Pebbling with Multiple Processors: Time, Communication and Memory Trade-offs}, 
      author={Toni B\"{o}hnlein and P\'{a}l Andr\'{a}s Papp and A. N. Yzelman},
      year={2024},
      eprint={2409.03898},
      archivePrefix={arXiv},
      primaryClass={cs.DC},
      url={https://arxiv.org/abs/2409.03898}
}



@ARTICLE{yzelman09,
 AUTHOR = "Yzelman, A. N. and Bisseling, Rob H.",
 TITLE = "Cache-oblivious sparse matrix--vector multiplication by using sparse matrix partitioning methods",
 JOURNAL = "SIAM Journal on Scientific Computing",
 VOLUME = 31,
 NUMBER = 4,
 PAGES = "3128-3154",
 YEAR = 2009,
 PUBLISHER = {SIAM},
 ADDRESS = {Philadelphia, PA, USA},
 URL = "http://www.math.uu.nl/people/yzelman/publications/yzelman09.pdf",
 DOI = "10.1137/080733243",
 ABSTRACT = "In this article, we introduce a cache-oblivious method for sparse matrix--vector multiplication. Our method attempts to permute the rows and columns of the input matrix using a recursive hypergraph-based sparse matrix partitioning scheme so that the resulting matrix induces cache-friendly behavior during sparse matrix--vector multiplication. Matrices are assumed to be stored in row-major format, by means of the compressed row storage (CRS) or its variants incremental CRS and zig-zag CRS. The zig-zag CRS data structure is shown to fit well with the hypergraph metric used in partitioning sparse matrices for the purpose of parallel computation. The separated block-diagonal (SBD) form is shown to be the appropriate matrix structure for cache enhancement. We have implemented a run-time cache simulation library enabling us to analyze cache behavior for arbitrary matrices and arbitrary cache properties during matrix--vector multiplication within a k-way set-associative idealized cache model. The results of these simulations are then verified by actual experiments run on various cache architectures. In all these experiments, we use the Mondriaan sparse matrix partitioner in one-dimensional mode. The savings in computation time achieved by our matrix reorderings reach up to 50 percent, in the case of a large link matrix."
}


@incollection{yzelman14c,
	author = "Yzelman, A. N. and Roose, D.",
	title = "Sparse matrix computations on multi-core systems",
	booktitle = "{I}ntel European Exascale Labs report 2013",
	publisher = "{I}ntel",
	year = "2014",
	pages = "24-29",
	keywords = "Sparse matrix--vector multiplication",
	keywords = "shared-memory parallelism",
	keywords = "cache-oblivious",
	keywords = "Hilbert space-filling curve",
	keywords = "high-performance computing",
	keywords = "Intel Xeon Phi"
}


@misc{martinez22,
  author={Martinez-Ferrer, Pedro J. and Yzelman, A. N. and Beltran, Vicen\c{c}},
  journal={{IEEE} Transactions on Parallel and Distributed Systems}, 
  title={A Native Tensor--Vector Multiplication Algorithm for High Performance Computing}, 
  year={2022},
  volume=33,
  number=12,
  pages={3363-3374},
  doi={10.1109/TPDS.2022.3153113}
}


@inproceedings{papp22a,
 author = {Papp, P\'{a}l Andr\'{a}s and Anegg, Georg and Yzelman, A. N.},
 title = {Partitioning Hypergraphs is Hard: Models, Inapproximability, and Applications},
 year = {2023},
 isbn = {9781450395458},
 publisher = {Association for Computing Machinery},
 address = {New York, NY, USA},
 url = {https://doi.org/10.1145/3558481.3591087},
 doi = {10.1145/3558481.3591087},
 abstract = {We study the balanced k-way hypergraph partitioning problem, with a special focus on its practical applications to manycore scheduling. Given a hypergraph on n nodes, our goal is to partition the node set into k parts of size at most (1 + ∈)· n over k each, while minimizing the cost of the partitioning, defined as the number of cut hyperedges, possibly also weighted by the number of partitions they intersect. We show that this problem cannot be approximated to within a n1 / poly log log n factor of the optimal solution in polynomial time if the Exponential Time Hypothesis holds, even for hypergraphs of maximal degree 2. We also study the hardness of the partitioning problem from a parameterized complexity perspective, and in the more general case when we have multiple balance constraints.Furthermore, we consider two extensions of the partitioning problem that are motivated from practical considerations. Firstly, we introduce the concept of hyperDAGs to model precedence-constrained computations as hypergraphs, and we analyze the adaptation of the balanced partitioning problem to this case. Secondly, we study the hierarchical partitioning problem to model hierarchical NUMA (non-uniform memory access) effects in modern computer architectures, and we show that ignoring this hierarchical aspect of the communication cost can yield significantly weaker solutions.},
 booktitle = {Proceedings of the 35th ACM Symposium on Parallelism in Algorithms and Architectures},
 pages = {415--425},
 numpages = {11},
 keywords = {balanced partitioning, parallel computing, hierarchical numa, approximation, hyperDAG, hypergraph},
 location = {Orlando, FL, USA},
 series = {SPAA '23}
}


@article{yzelman12a,
 author = "Yzelman, A. N. and Bisseling, Rob H.",
 title = "An object-oriented bulk synchronous parallel library for multicore programming",
 journal = {Concurrency and Computation: Practice and Experience},
 volume = {24},
 number = {5},
 publisher = {John Wiley \& Sons, Ltd},
 address = {Hoboken, NJ, USA},
 issn = {1532-0634},
 year = {2012},
 url = "http://dx.doi.org/10.1002/cpe.1843",
 doi = {10.1002/cpe.1843},
 pages={533-553},
 keywords = {bulk synchronous parallel, BSP, parallel computing, sparse matrix–vector multiplication, multicore, shared memory, fast Fourier transform, dense LU decomposition},
 abstract = "We show that the bulk synchronous parallel (BSP) model, originally designed for distributed-memory systems, is also applicable for shared-memory multicore systems and, furthermore, that BSP libraries are useful in scientific computing on these systems. A proof-of-concept MulticoreBSP library has been implemented in Java, and is used to show that BSP algorithms can attain proper speedups on multicore architectures. This library is based on the BSPlib implementation, adapted to an object-oriented setting. In comparison, the number of function primitives is reduced, while the overall design simplicity is improved. We detail applying the BSP model and library on the sparse matrix–vector (SpMV) multiplication problem, and show by performing numerical experiments that the resulting BSP SpMV algorithm attains speedups, in one case reaching a speedup of 3.5 for 4 threads. Whereas not described in detail in this paper, algorithms for the fast Fourier transform and the dense LU decomposition are also investigated; in one case, attaining superlinear speedups of 5 for 4 threads. The predictability of BSP algorithms in the case of the SpMV is also investigated."
}


@incollection{yzelman14d,
	author = "Yzelman, A. N. and Roose, D. and Meerbergen, K.",
	title = "Sparse matrix--vector multiplication: parallelization and vectorization",
	booktitle = "High Performance Parallelism Pearls: Multicore and Many-core Programming Approaches",
	publisher = "Elsevier",
	year = "2014",
	pages = "20",
	isbn = "978-01-280-2118-7",
	chapter = "27",
	editor = "Reinders, J. and Jeffers, J.",
	keywords = "Sparse matrix--vector multiplication",
	keywords = "shared-memory parallelism",
	keywords = "cache-oblivious",
	keywords = "Hilbert space-filling curve",
	keywords = "high-performance computing",
	keywords = "Intel Xeon Phi"
}


@article{pawlowski19,
title = "A multi-dimensional {M}orton-ordered block storage for mode-oblivious tensor computations",
journal = "{J}ournal of {C}omputational {S}cience",
volume = "33",
pages = "34 - 44",
year = "2019",
issn = "1877-7503",
doi = "https://doi.org/10.1016/j.jocs.2019.02.007",
url = "http://www.sciencedirect.com/science/article/pii/S187775031831130X",
author = "Paw\l{l}owski, Filip and U\c{c}ar, Bora and Yzelman, A.~N.",
keywords = "Tensor computations, Data structure, Morton order, Tensor--vector multiplication",
abstract = "Computation on tensors, treated as multidimensional arrays, revolve around generalized basic linear algebra subroutines (BLAS). We propose a novel data structure in which tensors are blocked and blocks are stored in an order determined by Morton order. This is not only proposed for efficiency reasons, but also to induce efficient performance regardless of which mode a generalized BLAS call is invoked for; we coin the term mode-oblivious to describe data structures and algorithms that induce such behavior. Experiments on one of the most bandwidth-bound generalized BLAS kernel, the tensorvector multiplication, not only demonstrate superior performance over two state-of-the-art variants by up to 18%, but additionally show that the proposed data structure induces a 71% less sample standard deviation for tensorvector multiplication across d modes, where d varies from 2 to 10. Finally, we show our data structure naturally expands to other tensor kernels and demonstrate up to 38% higher performance for the higher-order power method."
}


@inproceedings{spampinato23a,
 author = {Spampinato, Daniele G. and Jelovina, Denis and Zhuang, Jiawei and Yzelman, A. N.},
 title = {Towards Structured Algebraic Programming},
 year = {2023},
 isbn = {9798400701696},
 publisher = {Association for Computing Machinery},
 address = {New York, NY, USA},
 url = {https://doi.org/10.1145/3589246.3595373},
 doi = {10.1145/3589246.3595373},
 abstract = {Structured matrices and tensors exhibiting properties such as symmetry and fixed non-zero patterns are known for making algorithms and data storage more efficient. Due to emerging power and efficiency constraints required by the scale of modern scientific, machine learning, and edge computing applications, algorithm experts are revisiting traditional linear algebra algorithms with the goal of making new structures appear. Such structures often result from new numerical approximations that would greatly benefit from a more flexible linear algebra interface than standard BLAS and LAPACK, allowing for mixed precision and data types to appear in place of traditional floating-point operations. Algebraic programming interfaces, like GraphBLAS, while naturally abstracting the algebras of their operations, do not explicitly capture structured, densely stored arrays. In this paper, we present a preliminary design of a new algebraic programming interface for structured containers with template-generic, non-zero patterns. This interface offers to backend implementations the possibility of integrating more compile-time pattern information in the loop-based implementation of primitive operations as well as in the formulation of array accesses. We demonstrate its ability to specify important dense matrix decomposition algorithms and argue its ability to expose high-performance backends.},
 booktitle = {Proceedings of the 9th ACM SIGPLAN International Workshop on Libraries, Languages and Compilers for Array Programming},
 pages = {50--61},
 numpages = {12},
 keywords = {algebraic programming, dense linear algebra, structured matrices, mathematical software},
 location = {Orlando, FL, USA},
 series = {ARRAY 2023}
}


@misc{yzelman07a,
 AUTHOR = "Yzelman, A. N.",
 TITLE = "Parallel Radiosity",
 NOTE = "{B}achelor's thesis",
 SCHOOL = "Utrecht University",
 ADDRESS = "Utrecht, the Netherlands",
 MONTH = "March",
 YEAR = "2007"
}


@inproceedings{evers12,
  title={Node counting in wireless ad-hoc networks},
  author={Evers, Joep and Kiss, Demeter and Kowalczyk, Wojtek and Navilarekallu, Tejaswi and Renger, Michiel and Sella, Lorenzo and Timperio, Vincent and Viorel, Adrian and van Wijk, Sandra},
  booktitle="Proceedings of the 79th European Study Group Mathematics with Industry",
  editor="Planqu\'e, Bob and Bhulai, Sandjai and Hulshof, Joost and Kager, Wouter and Rot, Thomas",
  organization="{SWI}",
  publisher="{VU} {U}niversity {A}msterdam",
  pages="49--73",
  year={2012}
}


@inproceedings{pawlowski20b,
 author = "Paw\l{}owski, F. and Bisseling, R. H. and U\c{c}ar, B. and Yzelman, A. N.",
 booktitle = {2020 IEEE High Performance Extreme Computing Conference (HPEC)},
 title="Combinatorial Tiling for Sparse Neural Networks",
 year="2020",
 pages={1-7},
 address="Waltham, MA, USA",
 doi={10.1109/HPEC43674.2020.9286154}
}


@article{yzelman22,
  author = {Yzelman, A.~N.},
  title = {Humble Heroes},
  journal = {Communications of Huawei Research},
  volume = 6,
  month = {June},
  year = {2024},
  pages = {146--170}
}


@inproceedings{mastoras22,
  author={Mastoras, Aristeidis and Anagnostidis, Sotiris and Yzelman, A. N.},
  booktitle={2022 IEEE International Parallel and Distributed Processing Symposium Workshops (IPDPSW)},
  title={Nonblocking execution in {GraphBLAS}},
  year={2022},
  pages={230-233},
  doi={10.1109/IPDPSW55747.2022.00051}
}


@inproceedings{papp24a-brief,
  author={Papp, P\'{a}l Andr\'{a}s and Anegg, Georg and Karanasiou, Aikaterini and Yzelman, A. N.},
  booktitle={2024 IEEE International Parallel and Distributed Processing Symposium Workshops (IPDPSW)}, 
  title={Efficient Multi-Processor Scheduling in Increasingly Realistic Models (Brief Summary)}, 
  year={2024},
  volume={},
  number={},
  pages={1169-1171},
  keywords={Distributed processing;Costs;Program processors;Scheduling algorithms;Computational modeling;Heuristic algorithms;Conferences;DAG scheduling;BSP model;Integer Linear Programming;Multilevel scheduling},
  doi={10.1109/IPDPSW63119.2024.00195}
}


@inproceedings{scolari23a,
  author={Scolari, Alberto and Yzelman, A. N.},
  booktitle={2023 IEEE International Parallel and Distributed Processing Symposium Workshops (IPDPSW)}, 
  title={Effective implementation of the High Performance Conjugate Gradient benchmark on GraphBLAS}, 
  year={2023},
  volume={},
  number={},
  pages={216-225},
  keywords={Distributed processing;High performance computing;Semantics;Benchmark testing;Programming;Data structures;Hardware;HPCG;multigrid;red-black Gauss-Seidel;GraphBLAS},
  doi={10.1109/IPDPSW59300.2023.00044}
}

