@article{shah_neff_wu_minutoli_tumeo_becchi_2022,
  title={Accelerating Random Forest Classification on GPU and FPGA},
  ISSN={0190-3918},
  DOI={10.1145/3545008.3545067},
  abstractNote={Random Forests (RFs) are a commonly used machine learning method for classification and regression tasks spanning a variety of application domains, including bioinformatics, business analytics, and software optimization. While prior work has focused primarily on improving the performance of RF training, many applications, such as malware identification, cancer prediction, and banking fraud detection, require fast RF classification. In this work, we accelerate RF classification on GPU and FPGA. In order to provide efficient support for large datasets, we propose a hierarchical memory layout suitable to the GPU/FPGA memory hierarchy. We design three RF classification code variants based on that layout, and we investigate GPU- and FPGA-specific considerations for these kernels. Our experimental evaluation, performed on an Nvidia Xp GPU and on a Xilinx Alveo U250 FPGA accelerator card using publicly available datasets on the scale of millions of samples and tens of features, covers various aspects. First, we evaluate the performance benefits of our hierarchical data structure over the standard compressed sparse row (CSR) format. Second, we compare our GPU implementation with cuML, a machine learning library targeting Nvidia GPUs. Third, we explore the performance/accuracy tradeoff resulting from the use of different tree depths in the RF. Finally, we perform a comparative performance analysis of our GPU and FPGA implementations. Our evaluation shows that, while reporting the best performance on GPU, our code variants outperform the CSR baseline on both GPU and FPGA. For high accuracy targets, our GPU implementation yields a 5-9× speedup over CSR, and up to a 2× speedup over Nvidia's cuML library.},
  journal={51ST INTERNATIONAL CONFERENCE ON PARALLEL PROCESSING, ICPP 2022},
  author={Shah, Milan and Neff, Reece and Wu, Hancheng and Minutoli, Marco and Tumeo, Antonino and Becchi, Michela},
  year={2022}
}

@article{wu_becchi_2020,
  title={Evaluating Thread Coarsening and Low-cost Synchronization on Intel Xeon Phi},
  ISSN={1530-2075},
  DOI={10.1109/IPDPS47924.2020.00108},
  abstractNote={Manycore processors such as GPUs and Intel Xeon Phis have become popular due to their massive parallelism and high power efficiency. To achieve optimal performance, it is necessary to optimize the use of the compute cores and of the memory system available on these devices. Previous work has proposed techniques to improve the use of GPU resources. While Intel Xeon Phis can provide massive parallelism through their x86 cores and vector units, optimization techniques for these platforms have received less consideration. In this work, we study the benefits of thread coarsening and low-cost synchronization on applications running on Intel Xeon Phi processors and encoded in SIMT fashion. Specifically, we explore thread coarsening as a way to remap the work to the available cores and vector lanes. In addition, we propose low-overhead synchronization primitives, such as atomic operations and barriers, which transparently apply to threads mapped to the same or different VPUs and x86 cores. Finally, we consider the combined use of thread coarsening and our proposed synchronization primitives. We evaluate the effect of these techniques on the performance of two kinds of kernels: collaborative and non-collaborative ones, the former using scratchpad memory to explicitly control data sharing among threads. Our evaluation leads to the following results. First, while not always beneficial for non-collaborative kernels, thread coarsening consistently improves the performance of collaborative kernels by reducing the synchronization overhead. Second, our synchronization primitives outperform standard pthread APIs by a factor of up to 8× in real-world benchmarks. Last, the combined use of the proposed techniques leads to performance improvements, especially for collaborative kernels.},
  journal={2020 IEEE 34TH INTERNATIONAL PARALLEL AND DISTRIBUTED PROCESSING SYMPOSIUM IPDPS 2020},
  author={Wu, Hancheng and Becchi, Michela},
  year={2020},
  pages={1018–1029}
}

@article{nourian_wu_becchi_2018,
  title={A Compiler Framework for Fixed-topology Non-deterministic Finite Automata on SIMD Platforms},
  ISSN={1521-9097},
  DOI={10.1109/ICPADS.2018.00073},
  journal={2018 IEEE 24TH INTERNATIONAL CONFERENCE ON PARALLEL AND DISTRIBUTED SYSTEMS (ICPADS 2018)},
  author={Nourian, Marziyeh and Wu, Hancheng and Becchi, Michela},
  year={2018},
  pages={507–516}
}

@article{wu_ravi_becchi_2018,
  title={Compiling SIMT Programs on Multi- and Many-core Processors with Wide Vector Units: A Case Study with CUDA},
  ISSN={1094-7256},
  DOI={10.1109/HiPC.2018.00022},
  abstractNote={Manycore processors and coprocessors with wide vector extensions, such as Intel Phi and Skylake devices, have become popular due to their high throughput capability. Performance optimization on these devices requires using both their x86-compatible cores and their vector units. While the x86-compatible cores can be programmed using traditional programming interfaces following the MIMD model, such as POSIX threads, MPI, and OpenMP, the SIMD vector units are harder to program. The Intel software stack provides two approaches to code vectorization: automatic vectorization through the Intel compiler and manual vectorization through vector intrinsics. While the Intel compiler often fails to vectorize code with complex control flows and function calls, the manual approach is error-prone and leads to less portable code. Hence, there has been an increasing interest in SIMT programming tools allowing the simultaneous use of x86 cores and vector units while providing programmability and code portability. However, the effective implementation of the SIMT model on these hybrid architectures is not well understood. In this work, we target this problem. First, we propose a set of compiler techniques to transform programs written using a SIMT programming model (a subset of CUDA C) into code that leverages both the x86 cores and the vector units of a hybrid MIMD/SIMD architecture, thus providing programmability, high system utilization, and performance. Second, we evaluate the proposed techniques on Xeon Phi and Skylake processors using micro-benchmarks and real-world applications. Third, we compare the resulting performance with that achieved by the same code on GPUs. Based on this analysis, we point out the main challenges in supporting the SIMT model on hybrid MIMD/SIMD architectures while providing performance comparable to that of SIMT systems (e.g., GPUs).},
  journal={2018 IEEE 25TH INTERNATIONAL CONFERENCE ON HIGH PERFORMANCE COMPUTING (HIPC)},
  author={Wu, Hancheng and Ravi, John and Becchi, Michela},
  year={2018},
  pages={123–132}
}