% FASTEN paper, published in the proceedings of the 38th ACM International
% Conference on Supercomputing. It is a conference paper, so the entry uses
% inproceedings/booktitle rather than article/journal. Acronyms in the title
% and booktitle are brace-protected so sentence-casing styles keep them.
@inproceedings{zhou_subramanian_lin_fey_yin_li_2024,
  author    = {Zhou, Keren and Subramanian, Karthik Ganapathi and Lin, Po-Hsun and Fey, Matthias and Yin, Binqian and Li, Jiajia},
  title     = {{FASTEN}: Fast {GPU}-accelerated Segmented Matrix Multiplication for Heterogeneous Graph Neural Networks},
  booktitle = {Proceedings of the 38th {ACM} International Conference on Supercomputing, {ACM} {ICS} 2024},
  year      = {2024},
  pages     = {511--524},
  doi       = {10.1145/3650200.3656593},
  abstract  = {This paper introduces FASTEN, a cutting-edge library developed to address the computational challenges inherent in Heterogeneous Graph Neural Networks (HGNNs). The key focus of FASTEN is the optimization of segmented matrix multiplication, a critical operator where existing GNN frameworks and linear algebra libraries often fall short. FASTEN offers an array of solutions to these challenges, including a routing table designed for efficient workload scheduling, adaptive algorithms tailored for handling segments of different shapes and segmented dimensions, and a performance model-guided autotuner to select the best configurations. Furthermore, FASTEN implements interfaces to integrate with widely-used frameworks like PyG, ensuring straightforward adoption in existing HGNN models with minimal adjustments. We have performed comprehensive benchmarks on advanced GPU architectures, including NVIDIA H100, A100, and RTX4090, to demonstrate that FASTEN significantly improves both operator-wise and end-to-end performance across various datasets and HGNNs.},
}