@article{neff_minutoli_tumeo_becchi_2023, title={High-Level Synthesis of Irregular Applications: A Case Study on Influence Maximization}, DOI={10.1145/3587135.3592196}, abstractNote={FPGAs are promising platforms for accelerating irregular applications due to their ability to implement highly specialized hardware designs for each kernel. However, the design and implementation of FPGA-accelerated kernels can take several months using hardware design languages. High-Level Synthesis (HLS) tools provide fast, high-quality results for regular applications, but lack the support to effectively accelerate more irregular, complex workloads. This work analyzes the challenges and benefits of using a commercial state-of-the-art HLS tool and its available optimizations to accelerate graph sampling. We evaluate the resulting designs and their effectiveness when deployed in a state-of-the-art heterogeneous framework that implements the Influence Maximization with Martingales (IMM) algorithm, a complex graph analytics algorithm. We discuss future opportunities for improvement in hardware, HLS tools, and hardware/software co-design methodology to better support complex irregular applications such as IMM.}, journal={Proceedings of the 20th ACM International Conference on Computing Frontiers (CF 2023)}, author={Neff, Reece and Minutoli, Marco and Tumeo, Antonino and Becchi, Michela}, year={2023}, pages={12–22}}

@article{shah_neff_wu_minutoli_tumeo_becchi_2022, title={Accelerating Random Forest Classification on GPU and FPGA}, ISSN={0190-3918}, DOI={10.1145/3545008.3545067}, abstractNote={Random Forests (RFs) are a commonly used machine learning method for classification and regression tasks spanning a variety of application domains, including bioinformatics, business analytics, and software optimization. While prior work has focused primarily on improving the performance of RF training, many applications, such as malware identification, cancer prediction, and banking fraud detection, require fast RF classification. In this work, we accelerate RF classification on GPU and FPGA. In order to provide efficient support for large datasets, we propose a hierarchical memory layout suitable to the GPU/FPGA memory hierarchy. We design three RF classification code variants based on that layout, and we investigate GPU- and FPGA-specific considerations for these kernels. Our experimental evaluation, performed on an Nvidia Xp GPU and on a Xilinx Alveo U250 FPGA accelerator card using publicly available datasets on the scale of millions of samples and tens of features, covers various aspects. First, we evaluate the performance benefits of our hierarchical data structure over the standard compressed sparse row (CSR) format. Second, we compare our GPU implementation with cuML, a machine learning library targeting Nvidia GPUs. Third, we explore the performance/accuracy tradeoff resulting from the use of different tree depths in the RF. Finally, we perform a comparative performance analysis of our GPU and FPGA implementations. Our evaluation shows that, while reporting the best performance on GPU, our code variants outperform the CSR baseline both on GPU and FPGA. For high accuracy targets, our GPU implementation yields a 5-9× speedup over CSR, and up to a 2× speedup over Nvidia's cuML library.}, journal={51st International Conference on Parallel Processing (ICPP 2022)}, author={Shah, Milan and Neff, Reece and Wu, Hancheng and Minutoli, Marco and Tumeo, Antonino and Becchi, Michela}, year={2022}}

@article{zarch_neff_becchi_2021, title={Exploring Thread Coarsening on FPGA}, ISSN={1094-7256}, DOI={10.1109/HiPC53243.2021.00062}, abstractNote={Over the past few years, there has been an increased interest in including FPGAs in data centers and high-performance computing clusters along with GPUs and other accelerators. As a result, it has become increasingly important to have a unified, high-level programming interface for CPUs, GPUs, and FPGAs. This has led to the development of compiler toolchains to deploy OpenCL code on FPGA. However, the fundamental architectural differences between GPUs and FPGAs have led to performance portability issues: it has been shown that OpenCL code optimized for GPU does not necessarily map well to FPGA, often requiring manual optimizations to improve performance. In this paper, we explore the use of thread coarsening - a compiler technique that consolidates the work of multiple threads into a single thread - on OpenCL code running on FPGA. While this optimization has been explored on CPU and GPU, the architectural features of FPGAs and the nature of the parallelism they offer lead to different performance considerations, making an analysis of thread coarsening on FPGA worthwhile. Our evaluation, performed on our microbenchmarks and on a set of applications from open-source benchmark suites, shows that thread coarsening can yield performance benefits (up to 3-4× speedups) to OpenCL code running on FPGA at a limited resource utilization cost.}, journal={2021 IEEE 28th International Conference on High Performance Computing, Data, and Analytics (HiPC 2021)}, author={Zarch, Mostafa Eghbali and Neff, Reece and Becchi, Michela}, year={2021}, pages={436–441}}
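As a hedged illustration of the thread coarsening transformation described in the Zarch, Neff, and Becchi (2021) abstract above (a generic sketch, not code from that paper), the listing below shows a simple OpenCL C vector-add kernel and a version coarsened by a factor of 4, so that each work-item performs the work of four original work-items. The kernel names and the COARSEN constant are hypothetical.

/* Baseline: one element per work-item. */
__kernel void vadd(__global const float *a,
                   __global const float *b,
                   __global float *c,
                   const int n) {
    int i = get_global_id(0);
    if (i < n)
        c[i] = a[i] + b[i];
}

/* Coarsened by a factor of 4: each work-item handles four consecutive
 * elements, so the kernel is launched with roughly n/4 work-items. */
#define COARSEN 4
__kernel void vadd_coarsened(__global const float *a,
                             __global const float *b,
                             __global float *c,
                             const int n) {
    int base = get_global_id(0) * COARSEN;
    for (int k = 0; k < COARSEN; k++) {
        int i = base + k;
        if (i < n)
            c[i] = a[i] + b[i];
    }
}

Whether the coarsened form helps on FPGA depends on how the toolchain maps the per-work-item loop to hardware, which is the trade-off the paper evaluates.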
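The Shah et al. (2022) entry above compares a hierarchical memory layout against the standard compressed sparse row (CSR) format. As a hedged reminder of what that baseline looks like (a generic CSR sketch for a small sparse matrix, not the paper's forest encoding), CSR stores the nonzeros in three arrays: per-row offsets, column indices, and values.

#include <stdio.h>

/* Generic CSR layout for the 3x4 sparse matrix
 *   [5 0 0 2]
 *   [0 3 0 0]
 *   [0 0 7 1]
 * row_ptr[i]..row_ptr[i+1] delimit the nonzeros of row i. */
int   row_ptr[] = {0, 2, 3, 5};
int   col_idx[] = {0, 3, 1, 2, 3};
float values[]  = {5, 2, 3, 7, 1};

int main(void) {
    for (int row = 0; row < 3; row++)
        for (int k = row_ptr[row]; k < row_ptr[row + 1]; k++)
            printf("A[%d][%d] = %g\n", row, col_idx[k], values[k]);
    return 0;
}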