Scalable KNN Graph Construction for Heterogeneous Architectures. Ruys, W., Ghafouri, A., Chen, C., & Biros, G. ACM Transactions on Parallel Computing, 12(3):1–35, September, 2025.
Paper doi abstract bibtex Constructing k-nearest neighbor (kNN) graphs is a fundamental component in many machine learning and scientific computing applications. Despite its prevalence, efficiently building all-nearest-neighbor graphs at scale on distributed heterogeneous HPC systems remains challenging, especially for large sparse non-integer datasets. We introduce optimizations for algorithms based on forests of random projection trees. Our novel GPU kernels for batched, within leaf, exact searches achieve 1.18× speedup over sparse reference kernels with less peak memory, and up to 19× speedup over CPU for memory-intensive problems. Our library, PyRKNN, implements distributed randomized projection forests for approximate kNN search. Optimizations to reduce and hide communication overhead allow us to achieve 5× speedup, in per iteration performance, relative to GOFMM (another projection tree, MPI-based kNN library), for a 64M 128d dataset on 1,024 processes. On a single-node we achieve speedup over FAISS-GPU for dense datasets and up to 10× speedup over CPU-only libraries. PyRKNN uniquely supports distributed memory kNN graph construction for both dense and sparse coordinates on CPU and GPU accelerators.
@article{ruys_scalable_2025,
  title    = {Scalable {KNN} Graph Construction for Heterogeneous Architectures},
  volume   = {12},
  issn     = {2329-4949, 2329-4957},
  url      = {https://dl.acm.org/doi/10.1145/3733610},
  doi      = {10.1145/3733610},
  abstract = {Constructing k-nearest neighbor (kNN) graphs is a fundamental component in many machine learning and scientific computing applications. Despite its prevalence, efficiently building all-nearest-neighbor graphs at scale on distributed heterogeneous HPC systems remains challenging, especially for large sparse non-integer datasets. We introduce optimizations for algorithms based on forests of random projection trees. Our novel GPU kernels for batched, within leaf, exact searches achieve 1.18× speedup over sparse reference kernels with less peak memory, and up to 19× speedup over CPU for memory-intensive problems. Our library, PyRKNN, implements distributed randomized projection forests for approximate kNN search. Optimizations to reduce and hide communication overhead allow us to achieve 5× speedup, in per iteration performance, relative to GOFMM (another projection tree, MPI-based kNN library), for a 64M 128d dataset on 1,024 processes. On a single-node we achieve speedup over FAISS-GPU for dense datasets and up to 10× speedup over CPU-only libraries. PyRKNN uniquely supports distributed memory kNN graph construction for both dense and sparse coordinates on CPU and GPU accelerators.},
  language = {en},
  number   = {3},
  urldate  = {2025-08-28},
  journal  = {ACM Transactions on Parallel Computing},
  author   = {Ruys, William and Ghafouri, Ali and Chen, Chao and Biros, George},
  month    = sep,
  year     = {2025},
  keywords = {Accelerated},
  pages    = {1--35},
}
Downloads: 0
{"_id":"KxnMbk6YwyqYQSqab","bibbaseid":"ruys-ghafouri-chen-biros-scalableknngraphconstructionforheterogeneousarchitectures-2025","author_short":["Ruys, W.","Ghafouri, A.","Chen, C.","Biros, G."],"bibdata":{"bibtype":"article","type":"article","title":"Scalable KNN Graph Construction for Heterogeneous Architectures","volume":"12","issn":"2329-4949, 2329-4957","url":"https://dl.acm.org/doi/10.1145/3733610","doi":"10.1145/3733610","abstract":"Constructing k-nearest neighbor (kNN) graphs is a fundamental component in many machine learning and scientific computing applications. Despite its prevalence, efficiently building all-nearest-neighbor graphs at scale on distributed heterogeneous HPC systems remains challenging, especially for large sparse non-integer datasets. We introduce optimizations for algorithms based on forests of random projection trees. Our novel GPU kernels for batched, within leaf, exact searches achieve 1.18× speedup over sparse reference kernels with less peak memory, and up to 19× speedup over CPU for memory-intensive problems. Our library, PyRKNN , implements distributed randomized projection forests for approximate kNN search. Optimizations to reduce and hide communication overhead allow us to achieve 5× speedup, in per iteration performance, relative to GOFMM (another projection tree, MPI-based kNN library), for a 64M 128d dataset on 1,024 processes. On a single-node we achieve speedup over FAISS-GPU for dense datasets and up to 10× speedup over CPU-only libraries. 
PyRKNN uniquely supports distributed memory kNN graph construction for both dense and sparse coordinates on CPU and GPU accelerators.","language":"en","number":"3","urldate":"2025-08-28","journal":"ACM Transactions on Parallel Computing","author":[{"propositions":[],"lastnames":["Ruys"],"firstnames":["William"],"suffixes":[]},{"propositions":[],"lastnames":["Ghafouri"],"firstnames":["Ali"],"suffixes":[]},{"propositions":[],"lastnames":["Chen"],"firstnames":["Chao"],"suffixes":[]},{"propositions":[],"lastnames":["Biros"],"firstnames":["George"],"suffixes":[]}],"month":"September","year":"2025","keywords":"Accelerated","pages":"1–35","bibtex":"@article{ruys_scalable_2025,\n\ttitle = {Scalable {KNN} {Graph} {Construction} for {Heterogeneous} {Architectures}},\n\tvolume = {12},\n\tissn = {2329-4949, 2329-4957},\n\turl = {https://dl.acm.org/doi/10.1145/3733610},\n\tdoi = {10.1145/3733610},\n\tabstract = {Constructing k-nearest neighbor (kNN) graphs is a fundamental component in many machine learning and scientific computing applications. Despite its prevalence, efficiently building all-nearest-neighbor graphs at scale on distributed heterogeneous HPC systems remains challenging, especially for large sparse non-integer datasets. We introduce optimizations for algorithms based on forests of random projection trees. Our novel GPU kernels for batched, within leaf, exact searches achieve 1.18× speedup over sparse reference kernels with less peak memory, and up to 19× speedup over CPU for memory-intensive problems. Our library,\n PyRKNN\n , implements distributed randomized projection forests for approximate kNN search. Optimizations to reduce and hide communication overhead allow us to achieve 5× speedup, in per iteration performance, relative to GOFMM (another projection tree, MPI-based kNN library), for a 64M 128d dataset on 1,024 processes. 
On a single-node we achieve speedup over FAISS-GPU for dense datasets and up to 10× speedup over CPU-only libraries.\n PyRKNN\n uniquely supports distributed memory kNN graph construction for both dense and sparse coordinates on CPU and GPU accelerators.},\n\tlanguage = {en},\n\tnumber = {3},\n\turldate = {2025-08-28},\n\tjournal = {ACM Transactions on Parallel Computing},\n\tauthor = {Ruys, William and Ghafouri, Ali and Chen, Chao and Biros, George},\n\tmonth = sep,\n\tyear = {2025},\n\tkeywords = {Accelerated},\n\tpages = {1--35},\n}\n\n\n\n","author_short":["Ruys, W.","Ghafouri, A.","Chen, C.","Biros, G."],"key":"ruys_scalable_2025","id":"ruys_scalable_2025","bibbaseid":"ruys-ghafouri-chen-biros-scalableknngraphconstructionforheterogeneousarchitectures-2025","role":"author","urls":{"Paper":"https://dl.acm.org/doi/10.1145/3733610"},"keyword":["Accelerated"],"metadata":{"authorlinks":{}}},"bibtype":"article","biburl":"https://bibbase.org/zotero-group/pratikmhatre/5933976","dataSources":["yJr5AAtJ5Sz3Q4WT4"],"keywords":["accelerated"],"search_terms":["scalable","knn","graph","construction","heterogeneous","architectures","ruys","ghafouri","chen","biros"],"title":"Scalable KNN Graph Construction for Heterogeneous Architectures","year":2025}