@inproceedings{36c305818dc94f3eb9c1ca020487878d,
title = "Communication-Optimal Parallel Reservoir Sampling",
abstract = "When evaluating complex analytical queries on high-velocity data streams, many systems cannot run those queries on all elements of a stream. Sampling is a widely used method to reduce the system load by replacing the input with a representative yet manageable subset. For unbounded data, reservoir sampling generates a fixed-size uniform sample independent of the input cardinality. However, the collection of reservoir samples itself can already be a bottleneck for high-velocity data. In this paper, we introduce a technique that allows fully parallelizing reservoir sampling for many-core architectures. Our approach relies on the efficient combination of thread-local samples taken over chunks of the input without necessitating communication during the sampling phase and with minimal communication when merging. We show how our efficient merge guarantees uniform random samples while allowing data to be distributed over worker threads arbitrarily. Our analysis of this approach within the Umbra database system demonstrates linear scaling along the available threads and the ability to sustain high-velocity workloads.",
keywords = "Parallel Sampling, Reservoir Sampling, Stream Processing",
author = "Christian Winter and Moritz Sichert and Altan Birler and Thomas Neumann and Alfons Kemper",
note = "Publisher Copyright: {\textcopyright} 2023 Gesellschaft fur Informatik (GI). All rights reserved.; 2023 Datenbanksysteme fur Business, Technologie und Web, BTW 2023 - 2023 Database Systems for Business, Technology and Web, BTW 2023 ; Conference date: 06-03-2023 Through 10-03-2023",
year = "2023",
doi = "10.18420/BTW2023-27",
language = "English",
series = "Lecture Notes in Informatics (LNI), Proceedings - Series of the Gesellschaft fur Informatik (GI)",
publisher = "Gesellschaft fur Informatik (GI)",
pages = "567--578",
editor = "Birgitta Konig-Ries and Stefanie Scherzinger and Wolfgang Lehner and Gottfried Vossen",
booktitle = "Datenbanksysteme fur Business, Technologie und Web, BTW 2023",
}