
% PPAM 2019 conference paper (LNCS volume 12043) on shared-memory
% tensor--vector multiplication.
% Accented letters are written as brace-wrapped special characters
% ({\l}, {\c{c}}) so that BibTeX treats each one as a single letter when
% sorting and building labels.
% NOTE(review): the address is kept as found; Springer International
% Publishing is usually listed with Cham as its location -- confirm.
@inproceedings{pawlowski20a,
 author    = {Paw{\l}owski, F. and U{\c{c}}ar, B. and Yzelman, A. N.},
 title     = {High Performance Tensor--Vector Multiplication on Shared-Memory Systems},
 year      = {2020},
 editor    = {Wyrzykowski, Roman and Deelman, Ewa and Dongarra, Jack and Karczewski, Konrad},
 booktitle = {Parallel Processing and Applied Mathematics ({PPAM}) 2019},
 series    = {Lecture Notes in Computer Science},
 volume    = {12043},
 publisher = {Springer International Publishing},
 address   = {New York, NY, USA},
 pages     = {38--48},
 isbn      = {978-3-030-43229-4},
 abstract  = {Tensor--vector multiplication is one of the core components in tensor computations. We have recently investigated high performance, single core implementation of this bandwidth-bound operation. Here, we investigate its efficient, shared-memory implementations. Upon carefully analyzing the design space, we implement a number of alternatives using OpenMP and compare them experimentally. Experimental results on up to 8 socket systems show near peak performance for the proposed algorithms.}
}

