@article{zarch_becchi_2023, title={A Code Transformation to Improve the Efficiency of OpenCL Code on FPGA through Pipes}, DOI={10.1145/3587135.3592210}, abstractNote={Over the past few years, there has been an increased interest in using FPGAs alongside CPUs and GPUs in high-performance computing systems and data centers. This trend has led to a push toward the use of high-level programming models and libraries, such as OpenCL, both to lower the barriers to the adoption of FPGAs by programmers unfamiliar with hardware description languages, and to allow a single code to be deployed seamlessly on different devices. Today, both Intel and Xilinx offer toolchains to compile OpenCL code onto FPGA. However, using OpenCL on FPGAs is complicated by performance portability issues, since different devices have fundamental differences in architecture and in the nature of the hardware parallelism they offer. Hence, platform-specific optimizations are crucial to achieving good performance across devices. In this paper, we propose a code transformation to improve the performance of OpenCL codes running on FPGA. The proposed method uses pipes to separate the memory accesses and core computation within OpenCL kernels. We analyze the benefits of the approach as well as the restrictions to its applicability. Using OpenCL applications from popular benchmark suites, we show that this code transformation can result in higher utilization of the global memory bandwidth available and increased instruction concurrency, thus improving the overall throughput of OpenCL kernels at the cost of a modest resource utilization overhead. Further concurrency can be achieved by using multiple memory and compute kernels.}, journal={PROCEEDINGS OF THE 20TH ACM INTERNATIONAL CONFERENCE ON COMPUTING FRONTIERS 2023, CF 2023}, author={Zarch, Mostafa Eghbali and Becchi, Michela}, year={2023}, pages={101–111} } @article{ravi_byna_koziol_tang_becchi_2023, title={Evaluating Asynchronous Parallel I/O on HPC Systems}, ISSN={["1530-2075"]}, DOI={10.1109/IPDPS54959.2023.00030}, abstractNote={Parallel I/O is an effective method to optimize data movement between memory and storage for many scientific applications. Poor performance of traditional disk-based file systems has led to the design of I/O libraries which take advantage of faster memory layers, such as on-node memory, present in high-performance computing (HPC) systems. By allowing caching and prefetching of data for applications alternating computation and I/O phases, a faster memory layer also provides opportunities for hiding the latency of I/O phases by overlapping them with computation phases, a technique called asynchronous I/O. Since asynchronous parallel I/O in HPC systems is still in the initial stages of development, there hasn't been a systematic study of the factors affecting its performance. In this paper, we perform a systematic study of various factors affecting the performance and efficacy of asynchronous I/O, we develop a performance model to estimate the aggregate I/O bandwidth achievable by iterative applications using synchronous and asynchronous I/O based on past observations, and we evaluate the performance of the recently developed asynchronous I/O feature of a parallel I/O library (HDF5) using benchmarks and real-world science applications.
Our study covers parallel file systems on two large-scale HPC systems: Summit and Cori, the former with a GPFS file system and the latter with a Lustre parallel file system.}, journal={2023 IEEE INTERNATIONAL PARALLEL AND DISTRIBUTED PROCESSING SYMPOSIUM, IPDPS}, author={Ravi, John and Byna, Suren and Koziol, Quincey and Tang, Houjun and Becchi, Michela}, year={2023}, pages={211–221} } @article{shah_yu_di_lykov_alexeev_becchi_cappello_2023, title={GPU-Accelerated Error-Bounded Compression Framework for Quantum Circuit Simulations}, ISSN={["1530-2075"]}, DOI={10.1109/IPDPS54959.2023.00081}, abstractNote={Quantum circuit simulations enable researchers to develop quantum algorithms without the need for a physical quantum computer. Quantum computing simulators, however, all suffer from significant memory footprint requirements, which prevents large circuits from being simulated on classical supercomputers. In this paper, we explore different lossy compression strategies to substantially shrink quantum circuit tensors in the QTensor package (a state-of-the-art tensor network quantum circuit simulator) while ensuring the reconstructed data satisfy the user-needed fidelity. Our contribution is fourfold. (1) We propose a series of optimized pre- and post-processing steps to boost the compression ratio of tensors with a very limited performance overhead. (2) We characterize the impact of lossy decompressed data on quantum circuit simulation results, and leverage the analysis to ensure the fidelity of reconstructed data. (3) We propose a configurable compression framework for GPU based on cuSZ and cuSZx, two state-of-the-art GPU-accelerated lossy compressors, to address different use-cases: either prioritizing compression ratios or prioritizing compression speed. (4) We perform a comprehensive evaluation by running 9 state-of-the-art compressors on an NVIDIA A100 GPU based on QTensor-generated tensors of varying sizes. When prioritizing compression ratio, our results show that our strategies can increase the compression ratio nearly 10 times compared to using only cuSZ. When prioritizing throughput, we can perform compression at a speed comparable to cuSZx while achieving 3-4× higher compression ratios. Decompressed tensors can be used in QTensor circuit simulation to yield a final energy result within 1-5% of the true energy value.}, journal={2023 IEEE INTERNATIONAL PARALLEL AND DISTRIBUTED PROCESSING SYMPOSIUM, IPDPS}, author={Shah, Milan and Yu, Xiaodong and Di, Sheng and Lykov, Danylo and Alexeev, Yuri and Becchi, Michela and Cappello, Franck}, year={2023}, pages={757–767} } @article{neff_minutoli_tumeo_becchi_2023, title={High-Level Synthesis of Irregular Applications: A Case Study on Influence Maximization}, DOI={10.1145/3587135.3592196}, abstractNote={FPGAs are promising platforms for accelerating irregular applications due to their ability to implement highly specialized hardware designs for each kernel. However, the design and implementation of FPGA-accelerated kernels can take several months using hardware design languages. High Level Synthesis (HLS) tools provide fast, high quality results for regular applications, but lack the support to effectively accelerate more irregular, complex workloads. This work analyzes the challenges and benefits of using a commercial state-of-the-art HLS tool and its available optimizations to accelerate graph sampling.
We evaluate the resulting designs and their effectiveness when deployed in a state-of-the-art heterogeneous framework that implements the Influence Maximization with Martingales (IMM) algorithm, a complex graph analytics algorithm. We discuss future opportunities for improvement in hardware, HLS tools, and hardware/software co-design methodology to better support complex irregular applications such as IMM.}, journal={PROCEEDINGS OF THE 20TH ACM INTERNATIONAL CONFERENCE ON COMPUTING FRONTIERS 2023, CF 2023}, author={Neff, Reece and Minutoli, Marco and Tumeo, Antonino and Becchi, Michela}, year={2023}, pages={12–22} } @article{shah_yu_di_becchi_cappello_2023, title={Lightweight Huffman Coding for Efficient GPU Compression}, DOI={10.1145/3577193.3593736}, abstractNote={Lossy compression is often deployed in scientific applications to reduce data footprint and improve data transfers and I/O performance. Especially for applications requiring on-the-fly compression, it is essential to minimize compression's runtime. In this paper, we design a scheme to improve the performance of cuSZ, a GPU-based lossy compressor. We observe that Huffman coding - used by cuSZ to compress metadata generated during compression - incurs a performance overhead that can be significant, especially for smaller datasets. Our work seeks to reduce the Huffman coding runtime with minimal-to-no impact on cuSZ's compression efficiency. Our contributions are as follows. First, we examine a variety of probability distributions to determine which distributions closely model the input to cuSZ's Huffman coding stage. From these distributions, we create a dictionary of pre-computed codebooks such that during compression, a codebook is selected from the dictionary instead of computing a custom codebook. Second, we explore three codebook selection criteria to be applied at runtime. Finally, we evaluate our scheme on real-world datasets and in the context of two important application use cases, HDF5 and MPI, using an NVIDIA A100 GPU. Our evaluation shows that our method can reduce the Huffman coding penalty by a factor of 78--92×, translating to a total speedup of up to 5× over baseline cuSZ. Smaller HDF5 chunk sizes enjoy over an 8× speedup in compression and MPI messages on the scale of tens of MB have a 1.4--30.5× speedup in communication time.}, journal={PROCEEDINGS OF THE 37TH INTERNATIONAL CONFERENCE ON SUPERCOMPUTING, ACM ICS 2023}, author={Shah, Milan and Yu, Xiaodong and Di, Sheng and Becchi, Michela and Cappello, Franck}, year={2023}, pages={99–110} } @article{ravi_byna_becchi_2023, title={Runway: In-transit Data Compression on Heterogeneous HPC Systems}, DOI={10.1109/CCGRID57682.2023.00030}, abstractNote={To alleviate bottlenecks in storing and accessing data on high-performance computing (HPC) systems, I/O libraries are enabling computation while data is in-transit, such as HDF5 filters. For scientific applications that commonly use floating-point data, error-bounded lossy compression methods are a critical technique to significantly reduce the storage and bandwidth requirements. Thus far, deciding when and where to schedule in-transit data transformations, such as compression, has been outside the scope of I/O libraries. In this paper, we introduce Runway, a runtime framework that enables computation on in-transit data with an object storage abstraction. Runway is designed to be extensible to execute user-defined functions at runtime.
In this effort, we focus on studying methods to offload data compression operations to available processing units based on latency and throughput. We compare the performance of running compression on multi-core CPUs, as well as offloading it to a GPU and a Data Processing Unit (DPU). We implement a state-of-the-art error-bounded lossy compression algorithm, SZ3, as a Runway function with a variant optimized for DPUs. We propose dynamic modeling to guide scheduling decisions for in-transit data compression. We evaluate Runway using four scientific datasets from the SDRBench benchmark suite on the Perlmutter supercomputer at NERSC.}, journal={2023 IEEE/ACM 23RD INTERNATIONAL SYMPOSIUM ON CLUSTER, CLOUD AND INTERNET COMPUTING, CCGRID}, author={Ravi, John and Byna, Suren and Becchi, Michela}, year={2023}, pages={229–239} } @article{ravi_byna_becchi_2023, title={Runway: In-transit Data Compression on Heterogeneous HPC Systems}, DOI={10.1109/CCGridW59191.2023.00078}, abstractNote={To alleviate bottlenecks in storing and accessing data on high-performance computing (HPC) systems, I/O libraries are enabling computation while data is in-transit, such as HDF5 filters. For scientific applications that commonly use floating-point data, error-bounded lossy compression methods are a critical technique to significantly reduce the storage and bandwidth requirements. Thus far, deciding when and where to schedule in-transit data transformations, such as compression, has been outside the scope of I/O libraries. In this paper, we introduce Runway, a runtime framework that enables computation on in-transit data with an object storage abstraction. Runway is designed to be extensible to execute user-defined functions at runtime. In this effort, we focus on studying methods to offload data compression operations to available processing units based on latency and throughput. We compare the performance of running compression on multi-core CPUs, as well as offloading it to a GPU and a Data Processing Unit (DPU). We implement a state-of-the-art error-bounded lossy compression algorithm, SZ3, as a Runway function with a variant optimized for DPUs. We propose dynamic modeling to guide scheduling decisions for in-transit data compression.}, journal={2023 IEEE/ACM 23RD INTERNATIONAL SYMPOSIUM ON CLUSTER, CLOUD AND INTERNET COMPUTING WORKSHOPS, CCGRIDW}, author={Ravi, John and Byna, Suren and Becchi, Michela}, year={2023}, pages={340–342} } @article{nguyen_becchi_2022, title={A GPU-accelerated Data Transformation Framework Rooted in Pushdown Transducers}, ISSN={["1094-7256"]}, DOI={10.1109/HiPC56025.2022.00038}, abstractNote={With the rise of machine learning and data analytics, the ability to process large and diverse sets of data efficiently has become crucial. Research has shown that data transformation is a key performance bottleneck for applications across a variety of domains, from data analytics to scientific computing. Custom hardware accelerators and GPU implementations targeting specific data transformation tasks can alleviate the problem, but suffer from narrow applicability and lack of generality. To tackle this problem, we propose a GPU-accelerated data transformation engine grounded on pushdown transducers. We define an extended pushdown transducer abstraction (effPDT) that allows expressing a wide range of data transformations in a memory-efficient fashion, and is thus amenable for GPU deployment.
The effPDT execution engine utilizes a data streaming model that reduces the application’s memory requirements significantly, facilitating deployment on high- and low-end systems. We showcase our GPU-accelerated engine on a diverse set of transformation tasks covering data encoding/decoding, parsing and querying of structured data, and matrix transformation, and we evaluate it against publicly available CPU and GPU library implementations of the considered data transformation tasks. To understand the benefits of the effPDT abstraction, we extend our data transformation engine to also support finite state transducers (FSTs), we map the considered data transformation tasks on FSTs, and we compare the performance and resource requirements of the FST-based and the effPDT-based implementations.}, journal={2022 IEEE 29TH INTERNATIONAL CONFERENCE ON HIGH PERFORMANCE COMPUTING, DATA, AND ANALYTICS, HIPC}, author={Nguyen, Tri and Becchi, Michela}, year={2022}, pages={215–225} } @article{shah_neff_wu_minutoli_tumeo_becchi_2022, title={Accelerating Random Forest Classification on GPU and FPGA}, ISSN={["0190-3918"]}, DOI={10.1145/3545008.3545067}, abstractNote={Random Forests (RFs) are a commonly used machine learning method for classification and regression tasks spanning a variety of application domains, including bioinformatics, business analytics, and software optimization. While prior work has focused primarily on improving performance of the training of RFs, many applications, such as malware identification, cancer prediction, and banking fraud detection, require fast RF classification. In this work, we accelerate RF classification on GPU and FPGA. In order to provide efficient support for large datasets, we propose a hierarchical memory layout suitable to the GPU/FPGA memory hierarchy. We design three RF classification code variants based on that layout, and we investigate GPU- and FPGA-specific considerations for these kernels. Our experimental evaluation, performed on an Nvidia Xp GPU and on a Xilinx Alveo U250 FPGA accelerator card using publicly available datasets on the scale of millions of samples and tens of features, covers various aspects. First, we evaluate the performance benefits of our hierarchical data structure over the standard compressed sparse row (CSR) format. Second, we compare our GPU implementation with cuML, a machine learning library targeting Nvidia GPUs. Third, we explore the performance/accuracy tradeoff resulting from the use of different tree depths in the RF. Finally, we perform a comparative performance analysis of our GPU and FPGA implementations. Our evaluation shows that, while reporting the best performance on GPU, our code variants outperform the CSR baseline both on GPU and FPGA. For high accuracy targets, our GPU implementation yields a 5-9 × speedup over CSR, and up to a 2 × speedup over Nvidia’s cuML library.}, journal={51ST INTERNATIONAL CONFERENCE ON PARALLEL PROCESSING, ICPP 2022}, author={Shah, Milan and Neff, Reece and Wu, Hancheng and Minutoli, Marco and Tumeo, Antonino and Becchi, Michela}, year={2022} } @article{zarch_neff_becchi_2021, title={Exploring Thread Coarsening on FPGA}, ISSN={["1094-7256"]}, DOI={10.1109/HiPC53243.2021.00062}, abstractNote={Over the past few years, there has been an increased interest in including FPGAs in data centers and high-performance computing clusters along with GPUs and other accelerators. 
As a result, it has become increasingly important to have a unified, high-level programming interface for CPUs, GPUs and FPGAs. This has led to the development of compiler toolchains to deploy OpenCL code on FPGA. However, the fundamental architectural differences between GPUs and FPGAs have led to performance portability issues: it has been shown that OpenCL code optimized for GPU does not necessarily map well to FPGA, often requiring manual optimizations to improve performance. In this paper, we explore the use of thread coarsening - a compiler technique that consolidates the work of multiple threads into a single thread - on OpenCL code running on FPGA. While this optimization has been explored on CPU and GPU, the architectural features of FPGAs and the nature of the parallelism they offer lead to different performance considerations, making an analysis of thread coarsening on FPGA worthwhile. Our evaluation, performed on our microbenchmarks and on a set of applications from open-source benchmark suites, shows that thread coarsening can yield performance benefits (up to 3-4x speedups) to OpenCL code running on FPGA at a limited resource utilization cost.}, journal={2021 IEEE 28TH INTERNATIONAL CONFERENCE ON HIGH PERFORMANCE COMPUTING, DATA, AND ANALYTICS (HIPC 2021)}, author={Zarch, Mostafa Eghbali and Neff, Reece and Becchi, Michela}, year={2021}, pages={436–441} } @article{ravi_nguyen_zhou_becchi_2021, title={PILOT: a Runtime System to Manage Multi-tenant GPU Unified Memory Footprint}, ISSN={["1094-7256"]}, DOI={10.1109/HiPC53243.2021.00063}, abstractNote={Concurrent kernel execution on GPU has proven an effective technique to improve system throughput by maximizing the resource utilization. In order to increase programmability and meet the increasing memory requirements of data-intensive applications, current GPUs support Unified Virtual Memory (UVM), which provides a virtual memory abstraction with demand paging. By allowing applications to oversubscribe GPU memory, UVM provides increased opportunities to share GPU resources across applications. However, in the presence of applications with competing memory requirements, GPU sharing can lead to performance degradation due to thrashing. NVIDIA's Multiple Process Service (MPS) offers the capability to space share bare metal GPUs, thereby enabling cluster workload managers, such as Slurm, to share a single GPU across MPI ranks with limited control over resource partitioning. However, it is not possible to preempt, schedule, or throttle a running GPU process through MPS. These features would enable new OS-managed scheduling policies to be implemented for GPU kernels to dynamically handle resource contention and offer consistent performance. The contribution of this paper is two-fold. We first show how memory oversubscription can impact the performance of concurrent GPU applications. Then, we propose three methods to transparently mitigate memory interference through kernel preemption and scheduling policies. To implement our policies, we develop our own runtime system (PILOT) to serve as an alternative to NVIDIA's MPS. 
In the presence of memory oversubscription, we noticed a dramatic improvement in the overall throughput when using our scheduling policies and runtime hints.}, journal={2021 IEEE 28TH INTERNATIONAL CONFERENCE ON HIGH PERFORMANCE COMPUTING, DATA, AND ANALYTICS (HIPC 2021)}, author={Ravi, John and Nguyen, Tri and Zhou, Huiyang and Becchi, Michela}, year={2021}, pages={442–447} } @article{gu_beata_becchi_2020, title={A Loop-aware Autotuner for High-Precision Floating-point Applications}, DOI={10.1109/ISPASS48437.2020.00048}, abstractNote={Many scientific applications (e.g., molecular dynamics, climate modeling and astrophysical simulations) rely on floating-point arithmetic. Due to its approximate nature, the use of floating-point arithmetic can lead to inaccuracy and reproducibility issues, which can be particularly significant for long running applications. Indeed, previous work has shown that 64-bit IEEE floating-point arithmetic can be insufficient for many algorithms and applications, such as ill-conditioned linear systems, large summations, long-time or large-scale physical simulations, and experimental mathematics applications. To overcome these issues, existing work has proposed high-precision floating-point libraries (e.g., the GNU multiple precision arithmetic library), but these libraries come at the cost of significant execution time. In this work, we propose an auto-tuner for applications requiring high-precision floating-point arithmetic to deliver a prescribed level of accuracy. Our auto-tuner uses compiler analysis to discriminate operations and variables that require high-precision from those that can be handled using standard IEEE 64-bit floating-point arithmetic, and it generates a mixed precision program that trades off performance and accuracy by selectively using different precisions for different variables and operations. In particular, our auto-tuner leverages loop and data dependences analysis to quickly identify precision-sensitive variables and operations and provide results that are robust to different input datasets. We test our auto-tuner on a mix of applications with different computational patterns.}, journal={2020 IEEE INTERNATIONAL SYMPOSIUM ON PERFORMANCE ANALYSIS OF SYSTEMS AND SOFTWARE (ISPASS)}, author={Gu, Ruidong and Beata, Paul and Becchi, Michela}, year={2020}, pages={285–295} } @article{wu_becchi_2020, title={Evaluating Thread Coarsening and Low-cost Synchronization on Intel Xeon Phi}, ISSN={["1530-2075"]}, DOI={10.1109/IPDPS47924.2020.00108}, abstractNote={Manycore processors such as GPUs and Intel Xeon Phis have become popular due to their massive parallelism and high power-efficiency. To achieve optimal performance, it is necessary to optimize the use of the compute cores and of the memory system available on these devices. Previous work has proposed techniques to improve the use of the GPU resources. While Intel Phi can provide massive parallelism through their x86 cores and vector units, optimization techniques for these platforms have received less consideration. In this work, we study the benefits of thread coarsening and low-cost synchronization on applications running on Intel Xeon Phi processors and encoded in SIMT fashion. Specifically, we explore thread coarsening as a way to remap the work to the available cores and vector lanes. In addition, we propose low-overhead synchronization primitives, such as atomic operations and barriers, which transparently apply to threads mapped to the same or different VPUs and x86 cores.
Finally, we consider the combined use of thread coarsening and our proposed synchronization primitives. We evaluate the effect of these techniques on the performance of two kinds of kernels: collaborative and non-collaborative ones, the former using scratchpad memory to explicitly control data sharing among threads. Our evaluation leads to the following results. First, while not always beneficial for non-collaborative kernels, thread coarsening improves the performance of collaborative kernels consistently by reducing the synchronization overhead. Second, our synchronization primitives outperform standard pthread APIs by a factor up to 8x in real-world benchmarks. Last, the combined use of the proposed techniques leads to performance improvements, especially for collaborative kernels.}, journal={2020 IEEE 34TH INTERNATIONAL PARALLEL AND DISTRIBUTED PROCESSING SYMPOSIUM IPDPS 2020}, author={Wu, Hancheng and Becchi, Michela}, year={2020}, pages={1018–1029} } @article{yu_wei_ou_becchi_bicer_yao_2020, title={GPU-Based Static Data-Flow Analysis for Fast and Scalable Android App Vetting}, ISSN={["1530-2075"]}, DOI={10.1109/IPDPS47924.2020.00037}, abstractNote={Many popular vetting tools for Android applications use static code analysis techniques. In particular, Interprocedural Data-Flow Graph (IDFG) construction is the computation at the core of Android static data-flow analysis and consumes most of the analysis time. Many analysis tools use a worklist algorithm, an iterative fixed-point approach, to construct the IDFG. In this paper, we observe that a straightforward GPU parallelization of the worklist algorithm leads to significant underutilization of the GPU resources. We identify four performance bottlenecks, namely, frequent dynamic memory allocations, high branch divergence, workload imbalance, and irregular memory access patterns. Accordingly, we propose GDroid, a GPU-based worklist algorithm implementation with multiple fine-grained optimizations tailored to common characteristics of Android applications. The optimizations considered are: matrix-based data structure, memory access-based node grouping, and worklist merging. Our experimental evaluation, performed on 1000 Android applications, shows that the proposed optimizations are beneficial to performance, and GDroid can achieve up to 128X speedups against a plain GPU implementation.}, journal={2020 IEEE 34TH INTERNATIONAL PARALLEL AND DISTRIBUTED PROCESSING SYMPOSIUM IPDPS 2020}, author={Yu, Xiaodong and Wei, Fengguo and Ou, Xinming and Becchi, Michela and Bicer, Tekin and Yao, Danfeng}, year={2020}, pages={274–284} } @article{gu_becchi_2020, title={GPU-FPtuner: Mixed-precision Auto-tuning for Floating-point Applications on GPU}, ISSN={["1094-7256"]}, DOI={10.1109/HiPC50609.2020.00043}, abstractNote={GPUs have been extensively used to accelerate scientific applications from a variety of domains: computational fluid dynamics, astronomy and astrophysics, climate modeling, numerical analysis, to name a few. Many of these applications rely on floating-point arithmetic, which is approximate in nature. High-precision libraries have been proposed to mitigate accuracy issues due to the use of floating-point arithmetic. However, these libraries offer increased accuracy at a significant performance cost. Previous work, primarily focusing on CPU code and on standard IEEE floating-point data types, has explored mixed precision as a compromise between performance and accuracy. 
In this work, we propose a mixed precision autotuner for GPU applications that rely on floating-point arithmetic. Our tool supports standard 32- and 64-bit floating-point arithmetic, as well as high precision through the QD library. Our autotuner relies on compiler analysis to reduce the size of the tuning space. In particular, our tuning strategy takes into account code patterns prone to error propagation and GPU-specific considerations to generate a tuning plan that balances performance and accuracy. Our autotuner pipeline, implemented using the ROSE compiler and Python scripts, is fully automated and the code is available in open source. Our experimental results collected on benchmark applications with various code complexities show performance-accuracy tradeoffs for these applications and the effectiveness of our tool in identifying representative tuning points.}, journal={2020 IEEE 27TH INTERNATIONAL CONFERENCE ON HIGH PERFORMANCE COMPUTING, DATA, AND ANALYTICS (HIPC 2020)}, author={Gu, Ruidong and Becchi, Michela}, year={2020}, pages={294–304} } @article{nourian_zarch_becchi_2020, title={Optimizing Complex OpenCL Code for FPGA: A Case Study on Finite Automata Traversal}, ISSN={["1521-9097"]}, DOI={10.1109/ICPADS51040.2020.00073}, abstractNote={While FPGAs have been traditionally considered hard to program, recently there have been efforts aimed to allow the use of high-level programming models and libraries intended for multi-core CPUs and GPUs to program FPGAs. For example, both Intel and Xilinx are now providing toolchains to deploy OpenCL code onto FPGA. However, because the nature of the parallelism offered by GPU and FPGA devices is fundamentally different, OpenCL code optimized for GPU can prove very inefficient on FPGA, in terms of both performance and hardware resource utilization. This paper explores this problem on finite automata traversal. In particular, we consider an OpenCL NFA traversal kernel optimized for GPU but exhibiting FPGA-friendly characteristics, namely: limited memory requirements, lack of synchronization, and SIMD execution. We explore a set of structural code changes, custom and best-practice optimizations to retarget this code to FPGA. We showcase the effect of these optimizations on an Intel Stratix V FPGA board using various NFA topologies from different application domains. Our evaluation shows that, while the resource requirements of the original code exceed the capacity of the FPGA in use, our optimizations lead to significant resource savings and allow the transformed code to fit the FPGA for all considered NFA topologies. In addition, our optimizations lead to speedups up to 4x over an already optimized code-variant aimed to fit the NFA traversal kernel on FPGA. Some of the proposed optimizations can be generalized for other applications and introduced in OpenCL-to-FPGA compiler.}, journal={2020 IEEE 26TH INTERNATIONAL CONFERENCE ON PARALLEL AND DISTRIBUTED SYSTEMS (ICPADS)}, author={Nourian, Marziyeh and Zarch, Mostafa Eghbali and Becchi, Michela}, year={2020}, pages={518–527} } @article{gu_becchi_2019, title={A Comparative Study of Parallel Programming Frameworks for Distributed GPU Applications}, DOI={10.1145/3310273.3323071}, abstractNote={Parallel programming frameworks such as MPI, OpenSHMEM, Charm++ and Legion have been widely used in many scientific domains (from bioinformatics, to computational physics, chemistry, among others) to implement distributed applications. 
While they have the same purpose, these frameworks differ in terms of programmability, performance, and scalability under different applications and cluster types. Hence, it is important for programmers to select the programming framework that is best suited to the characteristics of their application types (i.e., their computation and communication patterns) and the hardware setup of the target high-performance computing cluster. In this work, we consider several popular parallel programming frameworks for distributed applications. We first analyze their memory model, execution model, synchronization model and GPU support. We then compare their programmability, performance, scalability, and load-balancing capability on a homogeneous computing cluster equipped with GPUs.}, journal={CF '19 - PROCEEDINGS OF THE 16TH ACM INTERNATIONAL CONFERENCE ON COMPUTING FRONTIERS}, author={Gu, Ruidong and Becchi, Michela}, year={2019}, pages={268–273} } @article{palumbo_becchi_2019, title={Editorial: Special Issue on Computing Frontiers}, volume={91}, ISSN={["1939-8115"]}, DOI={10.1007/s11265-019-1439-2}, number={3-4}, journal={JOURNAL OF SIGNAL PROCESSING SYSTEMS FOR SIGNAL IMAGE AND VIDEO TECHNOLOGY}, author={Palumbo, Francesca and Becchi, Michela}, year={2019}, month={Mar}, pages={273–273} } @article{roy_srivastava_grimm_nourian_becchi_aluru_2019, title={Evaluating High Performance Pattern Matching on the Automata Processor}, volume={68}, ISSN={["1557-9956"]}, DOI={10.1109/TC.2019.2901466}, abstractNote={In this paper, we study the acceleration of applications that identify all the occurrences of thousands of string-patterns in an input data-stream using the Automata Processor (AP). For this evaluation, we use two applications from two fields, namely, cybersecurity and bioinformatics. The first application, called Fast-SNAP, scans network data for 4312 signatures of intrusion derived from the popular open-source Snort database. Using the resources of a single AP-board, Fast-SNAP can scan for all these signatures at 1 Gbps. The second application, called PROTOMATA, looks for all the occurrences of 1,309 motifs from the PROSITE database in protein sequences. PROTOMATA is up to 68 times faster than the state-of-the-art CPU implementation. As a comparison, we emulate the execution of the same NFAs by programming FPGAs using state-of-the-art techniques. We find that the performance derived by using the resources of a single AP-board, which houses 32 AP-chips, is comparable to that of the resources of five to six large FPGAs.
The design techniques used in this paper are generic and may be applicable to the development of similar applications on the AP.}, number={8}, journal={IEEE TRANSACTIONS ON COMPUTERS}, author={Roy, Indranil and Srivastava, Ankit and Grimm, Matt and Nourian, Marziyeh and Becchi, Michela and Aluru, Srinivas}, year={2019}, month={Aug}, pages={1201–1212} } @article{nourian_wu_becchi_2018, title={A Compiler Framework for Fixed-topology Non-deterministic Finite Automata on SIMD Platforms}, ISSN={["1521-9097"]}, DOI={10.1109/ICPADS.2018.00073}, journal={2018 IEEE 24TH INTERNATIONAL CONFERENCE ON PARALLEL AND DISTRIBUTED SYSTEMS (ICPADS 2018)}, author={Nourian, Marziyeh and Wu, Hancheng and Becchi, Michela}, year={2018}, pages={507–516} } @article{wu_ravi_becchi_2018, title={Compiling SIMT Programs on Multi- and Many-core Processors with Wide Vector Units: A Case Study with CUDA}, ISSN={["1094-7256"]}, DOI={10.1109/HiPC.2018.00022}, abstractNote={Manycore processors and coprocessors with wide vector extensions, such as Intel Phi and Skylake devices, have become popular due to their high throughput capability. Performance optimization on these devices requires using both their x86-compatible cores and their vector units. While the x86-compatible cores can be programmed using traditional programming interfaces following the MIMD model, such as POSIX threads, MPI and OpenMP, the SIMD vector units are harder to program. The Intel software stack provides two approaches for code vectorization: automatic vectorization through the Intel compiler and manual vectorization through vector intrinsics. While the Intel compiler often fails to vectorize code with complex control flows and function calls, the manual approach is error-prone and leads to less portable code. Hence, there has been an increasing interest in SIMT programming tools allowing the simultaneous use of x86 cores and vector units while providing programmability and code portability. However, the effective implementation of the SIMT model on these hybrid architectures is not well understood. In this work, we target this problem. First, we propose a set of compiler techniques to transform programs written using a SIMT programming model (a subset of CUDA C) into code that leverages both the x86 cores and the vector units of a hybrid MIMD/SIMD architecture, thus providing programmability, high system utilization and performance. Second, we evaluate the proposed techniques on Xeon Phi and Skylake processors using micro-benchmarks and real-world applications. Third, we compare the resulting performance with that achieved by the same code on GPUs. Based on this analysis, we point out the main challenges in supporting the SIMT model on hybrid MIMD/SIMD architectures, while providing performance comparable to that of SIMT systems (e.g., GPUs).}, journal={2018 IEEE 25TH INTERNATIONAL CONFERENCE ON HIGH PERFORMANCE COMPUTING (HIPC)}, author={Wu, Hancheng and Ravi, John and Becchi, Michela}, year={2018}, pages={123–132} } @article{procter_harrison_graves_becchi_allwein_2017, title={A Principled Approach to Secure Multi-core Processor Design with ReWire}, volume={16}, ISSN={1539-9087}, url={http://dx.doi.org/10.1145/2967497}, DOI={10.1145/2967497}, abstractNote={There is no such thing as high assurance without high assurance hardware. High assurance hardware is essential because any and all high assurance systems ultimately depend on hardware that conforms to, and does not undermine, critical system properties and invariants. 
And yet, high assurance hardware development is stymied by the conceptual gap between formal methods and hardware description languages used by engineers. This article advocates a semantics-directed approach to bridge this conceptual gap. We present a case study in the design of secure processors, which are formally derived via principled techniques grounded in functional programming and equational reasoning. The case study comprises the development of secure single- and dual-core variants of a single processor, both based on a common semantic specification of the ISA. We demonstrate via formal equational reasoning that the dual-core processor respects a “no-write-down” information flow policy. The semantics-directed approach enables a modular and extensible style of system design and verification. The secure processors require only a very small amount of additional code to specify and implement, and their security verification arguments are concise and readable. Our approach rests critically on ReWire, a functional programming language providing a suitable foundation for formal verification of hardware designs. This case study demonstrates both ReWire’s expressiveness as a programming language and its power as a framework for formal, high-level reasoning about hardware systems.}, number={2}, journal={ACM Transactions on Embedded Computing Systems}, publisher={Association for Computing Machinery (ACM)}, author={Procter, Adam and Harrison, William L. and Graves, Ian and Becchi, Michela and Allwein, Gerard}, year={2017}, month={Jan}, pages={1–25} } @article{wu_becchi_2017, title={An Analytical Study of Recursive Tree Traversal Patterns on Multi- and Many-core Platforms}, ISSN={["1521-9097"]}, DOI={10.1109/ICPADS.2017.00082}, abstractNote={Recursive tree traversals are found in many application domains, such as data mining, graphics, machine learning and scientific simulations. In the past few years there has been growing interest in the deployment of applications based on graph data structures on many-core devices. A couple of recent efforts have focused on optimizing the execution of multiple serial tree traversals on GPU, and have reported performance trends that vary across algorithms. In this work, we aim to understand how to select the implementation and platform that is most suited to a given tree traversal algorithm and dataset. To this end, we perform a systematic study of recursive tree traversal on CPU, GPU and the Intel Phi processor. We first identify four tree traversal patterns: three of them performing multiple serial traversals concurrently, and the last one performing a single parallel level order traversal. For each of these patterns, we consider different code variants including existing and new optimization methods, and we characterize their control-flow and memory access patterns. We implement these code variants and evaluate them on CPU, GPU and Intel Phi. 
Our analysis shows that there is not a single code variant and platform that achieves the best performance on all tree traversal patterns, and it provides guidelines on the selection of the implementation most suited to a given tree traversal pattern and input dataset.}, journal={2017 IEEE 23RD INTERNATIONAL CONFERENCE ON PARALLEL AND DISTRIBUTED SYSTEMS (ICPADS)}, author={Wu, Hancheng and Becchi, Michela}, year={2017}, pages={586–595} } @inproceedings{surineni_gu_nguyen_becchi_2017, title={Understanding the performance-accuracy tradeoffs of floating-point arithmetic on GPUs}, DOI={10.1109/iiswc.2017.8167778}, abstractNote={Floating-point computations produce approximate results, possibly leading to inaccuracy and reproducibility problems. Existing work addresses two issues: first, the design of high precision floating-point representations; second, the study of methods to trade off accuracy and performance of CPU applications. However, a comprehensive study of the tradeoffs between accuracy and performance on modern GPUs is missing. This study covers the use of different floating-point precisions (i.e., single and double floating-point precision in IEEE 754 standard, GNU Multiple Precision, and composite floating-point precision) on GPU using a variety of synthetic and real-world benchmark applications. First, we analyze the support for single and double precision floating-point arithmetic on different GPU architectures, and we characterize the latencies of all floating-point instructions on GPU. Second, we study the performance/accuracy tradeoffs related to the use of different arithmetic precisions on addition, multiplication, division, and natural exponential function. Third, we analyze the combined use of different arithmetic operations on three benchmark applications characterized by different instruction mixes and arithmetic intensities. As a result of this analysis, we provide insights to guide users to the selection of the arithmetic precision leading to a good performance/accuracy tradeoff depending on the arithmetic operations and mathematical functions used in their program and the degree of multithreading of the code.}, booktitle={Proceedings of the 2017 ieee international symposium on workload characterization (iiswc)}, author={Surineni, S. and Gu, R. D. and Nguyen, H. and Becchi, M.}, year={2017}, pages={207–218} } @article{chen_jones_becchi_wolf_2016, title={Picking Pesky Parameters: Optimizing Regular Expression Matching in Practice}, volume={27}, ISSN={1045-9219}, url={http://dx.doi.org/10.1109/tpds.2015.2453986}, DOI={10.1109/tpds.2015.2453986}, abstractNote={Network security systems inspect packet payloads for signatures of attacks. These systems use regular expression matching at their core. Many techniques for implementing regular expression matching at line rate have been proposed. Solutions differ in the type of automaton used (i.e., deterministic versus non-deterministic) and in the configuration of implementation-specific parameters. While each solution has been shown to perform well on specific rule sets and traffic patterns, there has been no systematic comparison across a large set of solutions, rule sets and traffic patterns. Thus, it is extremely challenging for a practitioner to make an informed decision within the plethora of existing algorithmic and architectural proposals. Moreover, as multi-core processors are becoming popular, many parameters need to be tuned to maximize the multi-core potential. 
To address this problem, we present a comprehensive evaluation of a broad set of regular expression matching techniques. We consider both algorithmic and architectural aspects. Specifically, we explore the performance, area requirements, and power consumption of implementations targeting multi-core processors and FPGAs using rule sets of practical size and complexity. We present detailed performance results and specific guidelines for determining optimal configurations based on a simple evaluation of the rule set. These guidelines can help significantly when implementing regular expression matching systems in practice.}, number={5}, journal={IEEE Transactions on Parallel and Distributed Systems}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Chen, Xinming and Jones, Brandon and Becchi, Michela and Wolf, Tilman}, year={2016}, month={May}, pages={1430–1442} } @inbook{graves_procter_harrison_becchi_allwein_2015, title={Hardware Synthesis from Functional Embedded Domain-Specific Languages: A Case Study in Regular Expression Compilation}, ISBN={9783319162133 9783319162140}, ISSN={0302-9743 1611-3349}, url={http://dx.doi.org/10.1007/978-3-319-16214-0_4}, DOI={10.1007/978-3-319-16214-0_4}, abstractNote={Although FPGAs have the potential to bring software-like flexibility and agility to the hardware world, designing for FPGAs remains a difficult task divorced from standard software engineering norms. A better programming flow would go far towards realizing the potential of widely deployed, programmable hardware. We propose a general methodology based on domain specific languages embedded in the functional language Haskell to bridge the gap between high level abstractions that support programmer productivity and the need for high performance in FPGA circuit implementations. We illustrate this methodology with a framework for regular expression to hardware compilers, written in Haskell, that supports high programmer productivity while producing circuits whose performance matches and, indeed, exceeds that of a state of the art, hand-optimized VHDL-based tool. For example, after applying a novel optimization pass, throughput increased an average of 28.3% over the state of the art tool for one set of benchmarks. All code discussed in the paper is available online [1].}, booktitle={Lecture Notes in Computer Science}, publisher={Springer International Publishing}, author={Graves, Ian and Procter, Adam and Harrison, William L. and Becchi, Michela and Allwein, Gerard}, year={2015}, pages={41–52} } @article{truong_li_sajjapongse_conant_becchi_2014, title={Large-Scale Pairwise Alignments on GPU Clusters: Exploring the Implementation Space}, volume={77}, ISSN={1939-8018 1939-8115}, url={http://dx.doi.org/10.1007/s11265-014-0883-2}, DOI={10.1007/s11265-014-0883-2}, number={1-2}, journal={Journal of Signal Processing Systems}, publisher={Springer Science and Business Media LLC}, author={Truong, Huan and Li, Da and Sajjapongse, Kittisak and Conant, Gavin and Becchi, Michela}, year={2014}, month={Apr}, pages={131–149} } @article{yu_lin_becchi_2014, title={Revisiting State Blow-Up: Automatically Building Augmented-FA While Preserving Functional Equivalence}, volume={32}, ISSN={0733-8716}, url={http://dx.doi.org/10.1109/jsac.2014.2358840}, DOI={10.1109/jsac.2014.2358840}, abstractNote={Regular expression matching, a central task in deep packet inspection and other networking applications, has been traditionally implemented through finite automata.
Thanks to their limited per-character processing and memory bandwidth requirements, deterministic finite automata (DFA) are a natural choice for memory-based implementations. In the presence of large datasets of complex patterns, however, DFA suffer from the well-known state explosion problem. Specifically, state explosion can take place during DFA generation when the considered patterns contain bounded and unbounded repetitions of wildcards or large character sets. Several alternative FA representations have been proposed to address this problem. However, these proposals all suffer from one or more of the following problems: some can avoid state explosion only on datasets of limited size and complexity; some have prohibitive worst-case memory bandwidth requirements or processing time; and some can only guarantee functional equivalence for restricted classes of regular expressions and require the user to manually filter out unsupported patterns. In this work we propose JFA, a finite automaton that uses state variables to avoid state explosion, and is functionally equivalent to the corresponding DFA. Functional equivalence is guaranteed by construction without requiring user intervention. We also provide optimization techniques to both limit the number of state variables required and provide a lower bound for the JFA traversal time.}, number={10}, journal={IEEE Journal on Selected Areas in Communications}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Yu, Xiaodong and Lin, Bill and Becchi, Michela}, year={2014}, month={Oct}, pages={1822–1833} } @article{becchi_crowley_2013, title={A-DFA}, volume={10}, ISSN={1544-3566}, url={http://dx.doi.org/10.1145/2445572.2445576}, DOI={10.1145/2445572.2445576}, abstractNote={Modern network intrusion detection systems need to perform regular expression matching at line rate in order to detect the occurrence of critical patterns in packet payloads. While Deterministic Finite Automata (DFAs) allow this operation to be performed in linear time, they may exhibit prohibitive memory requirements. Kumar et al. [2006a] have proposed Delayed Input DFAs (D2FAs), which provide a trade-off between the memory requirements of the compressed DFA and the number of states visited for each character processed, which in turn affects the memory bandwidth required to evaluate regular expressions. In this article we introduce Amortized time-bandwidth overhead DFAs (A-DFAs), a general compression technique that results in at most N(k + 1)/k state traversals when processing a string of length N, k being a positive integer. In comparison to the D2FA approach, our technique achieves comparable levels of compression with lower provable bounds on memory bandwidth (or greater compression for a given bandwidth bound). Moreover, the A-DFA algorithm has lower complexity, can be applied during DFA creation, and is suitable for scenarios where a compressed DFA needs to be dynamically built or updated.
Finally, we show how to combine A-DFA with alphabet reduction and multistride DFAs, two techniques aimed at reducing the memory space and bandwidth requirement of DFAs, and discuss memory encoding schemes suitable for A-DFAs.}, number={1}, journal={ACM Transactions on Architecture and Code Optimization}, publisher={Association for Computing Machinery (ACM)}, author={Becchi, Michela and Crowley, Patrick}, year={2013}, month={Apr}, pages={1–26} } @article{ellison_conant_cockrum_austin_truong_becchi_lamberson_cammack_2013, title={Diet Alters Both the Structure and Taxonomy of the Ovine Gut Microbial Ecosystem}, volume={21}, ISSN={1340-2838 1756-1663}, url={http://dx.doi.org/10.1093/dnares/dst044}, DOI={10.1093/dnares/dst044}, abstractNote={We surveyed the ruminal metagenomes of 16 sheep under two different diets using Illumina pair-end DNA sequencing of raw microbial DNA extracted from rumen samples. The resulting sequence data were bioinformatically mapped to known prokaryotic 16S rDNA sequences to identify the taxa present in the samples and then analysed for the presence of potentially new taxa. Strikingly, the majority of the microbial individuals found did not map to known taxa from 16S sequence databases. We used a novel statistical modelling approach to compare the taxonomic distributions between animals fed a forage-based diet and those fed concentrated grains. With this model, we found significant differences between the two groups both in the dominant taxa present in the rumen and in the overall shape of the taxa abundance curves. In general, forage-fed animals have a more diverse microbial ecosystem, whereas the concentrate-fed animals have ruminal systems more heavily dominated by a few taxa. As expected, organisms from methanogenic groups are more prevalent in forage-fed animals. Finally, all of these differences appear to be grounded in an underlying common input of new microbial individuals into the rumen environment, with common organisms from one feed group being present in the other, but at much lower abundance.}, number={2}, journal={DNA Research}, publisher={Oxford University Press (OUP)}, author={Ellison, M. J. and Conant, G. C. and Cockrum, R. R. and Austin, K. J. and Truong, H. and Becchi, M. and Lamberson, W. R. and Cammack, K. M.}, year={2013}, month={Oct}, pages={115–125} } @inbook{poostchi_palaniappan_bunyak_becchi_seetharaman_2013, title={Efficient GPU Implementation of the Integral Histogram}, ISBN={9783642374098 9783642374104}, ISSN={0302-9743 1611-3349}, url={http://dx.doi.org/10.1007/978-3-642-37410-4_23}, DOI={10.1007/978-3-642-37410-4_23}, abstractNote={The integral histogram for images is an efficient preprocessing method for speeding up diverse computer vision algorithms including object detection, appearance-based tracking, recognition and segmentation. Our proposed Graphics Processing Unit (GPU) implementation uses parallel prefix sums on row and column histograms in a cross-weave scan with high GPU utilization and communication-aware data transfer between CPU and GPU memories. Two different data structures and communication models were evaluated. A 3-D array to store binned histograms for each pixel and an equivalent linearized 1-D array, each with distinctive data movement patterns. Using the 3-D array with many kernel invocations and low workload per kernel was inefficient, highlighting the necessity for careful mapping of sequential algorithms onto the GPU. 
The reorganized 1-D array with a single data transfer to the GPU with high GPU utilization, was 60 times faster than the CPU version for a 1K ×1K image reaching 49 fr/sec and 21 times faster for 512×512 images reaching 194 fr/sec. The integral histogram module is applied as part of the likelihood of features tracking (LOFT) system for video object tracking using fusion of multiple cues.}, booktitle={Computer Vision - ACCV 2012 Workshops}, publisher={Springer Berlin Heidelberg}, author={Poostchi, Mahdieh and Palaniappan, Kannappan and Bunyak, Filiz and Becchi, Michela and Seetharaman, Guna}, year={2013}, pages={266–278} } @article{ravi_becchi_jiang_agrawal_chakradhar_2013, title={Scheduling concurrent applications on a cluster of CPU–GPU nodes}, volume={29}, ISSN={0167-739X}, url={http://dx.doi.org/10.1016/j.future.2013.06.002}, DOI={10.1016/j.future.2013.06.002}, abstractNote={Heterogeneous architectures comprising a multi-core CPU and many-core GPU(s) are increasingly being used within cluster and cloud environments. In this paper, we study the problem of optimizing the overall throughput of a set of applications deployed on a cluster of such heterogeneous nodes. We consider two different scheduling formulations. In the first formulation, we consider jobs that can be executed on either the GPU or the CPU of a single node. In the second formulation, we consider jobs that can be executed on the CPU, GPU, or both, of any number of nodes in the system. We have developed scheduling schemes addressing both of the problems. In our evaluation, we first show that the schemes proposed for first formulation outperform a blind round-robin scheduler and approximate the performances of an ideal scheduler that involves an impractical exhaustive exploration of all possible schedules. Next, we show that the scheme proposed for the second formulation outperforms the best of existing schemes for heterogeneous clusters, TORQUE and MCT, by up to 42%. Additionally, we evaluate the robustness of our proposed scheduling policies under inaccurate inputs to account for real execution scenarios. We show that, with up to 20% of inaccuracy in the input, the degradation in performance is marginal (less than 7%) on the average.}, number={8}, journal={Future Generation Computer Systems}, publisher={Elsevier BV}, author={Ravi, Vignesh T. and Becchi, Michela and Jiang, Wei and Agrawal, Gagan and Chakradhar, Srimat}, year={2013}, month={Oct}, pages={2262–2271} } @article{majumdar_cadambi_becchi_chakradhar_graf_2012, title={A Massively Parallel, Energy Efficient Programmable Accelerator for Learning and Classification}, volume={9}, ISSN={1544-3566}, url={http://dx.doi.org/10.1145/2133382.2133388}, DOI={10.1145/2133382.2133388}, abstractNote={Applications that use learning and classification algorithms operate on large amounts of unstructured data, and have stringent performance constraints. For such applications, the performance of general purpose processors scales poorly with data size because of their limited support for fine-grained parallelism and absence of software-managed caches. The large intermediate data in these applications also limits achievable performance on many-core processors such as GPUs. To accelerate such learning applications, we present a programmable accelerator that can execute multiple learning and classification algorithms. 
To architect such an accelerator, we profile five representative workloads, and find that their computationally intensive portions can be formulated as matrix or vector operations generating large amounts of intermediate data, which are then reduced by a secondary operation such as array ranking, finding max/min and aggregation. Our proposed accelerator, called MAPLE, has hundreds of simple processing elements (PEs) laid out in a two-dimensional grid, with two key features. First, it uses dynamic in-memory processing where on-chip memory blocks perform the secondary reduction operations. Second, MAPLE uses banked off-chip memory, and organizes its PEs into independent groups each with its own off-chip memory bank. These two features allow MAPLE to scale its performance with data size. We also present an Atom-based energy-efficient heterogeneous system with MAPLE as the accelerator that satisfies the application’s performance requirements at a lower system power. This article describes the MAPLE architecture, explores its design space with a simulator, illustrates how to automatically map application kernels to the hardware, and presents its performance improvement and energy benefits over classic server-based implementations. We implement a 512-PE FPGA prototype of MAPLE and find that it is 1.5-10x faster than a 2.5 GHz quad-core Xeon processor despite running at a modest 125 MHz clock rate. With MAPLE connected to a 1.6 GHz dual-core Atom, we show an energy improvement of 38-84% over the Xeon server coupled to a 1.3 GHz 240-core Tesla GPU.}, number={1}, journal={ACM Transactions on Architecture and Code Optimization}, publisher={Association for Computing Machinery (ACM)}, author={Majumdar, Abhinandan and Cadambi, Srihari and Becchi, Michela and Chakradhar, Srimat T. and Graf, Hans Peter}, year={2012}, month={Mar}, pages={1–30} } @article{pang_zhao_becchi_korkin_shyu_2012, title={Accelerating large-scale protein structure alignments with graphics processing units}, volume={5}, ISSN={1756-0500}, url={http://dx.doi.org/10.1186/1756-0500-5-116}, DOI={10.1186/1756-0500-5-116}, abstractNote={Background: Large-scale protein structure alignment, an indispensable tool to structural bioinformatics, poses a tremendous challenge on computational resources. To ensure structure alignment accuracy and efficiency, efforts have been made to parallelize traditional alignment algorithms in grid environments. However, these solutions are costly and of limited accessibility. Others trade alignment quality for speedup by using high-level characteristics of structure fragments for structure comparisons. Findings: We present ppsAlign, a parallel protein structure alignment framework designed and optimized to exploit the parallelism of Graphics Processing Units (GPUs). As a general-purpose GPU platform, ppsAlign could take many concurrent methods, such as TM-align and Fr-TM-align, into the parallelized algorithm design. We evaluated ppsAlign on an NVIDIA Tesla C2050 GPU card, and compared it with existing software solutions running on an AMD dual-core CPU. We observed a 36-fold speedup over TM-align, a 65-fold speedup over Fr-TM-align, and a 40-fold speedup over MAMMOTH. Conclusions: ppsAlign is a high-performance protein structure alignment tool designed to tackle the computational complexity issues from protein structural data.
The solution presented in this paper allows large-scale structure comparisons to be performed using the massively parallel computing power of GPUs.}, number={1}, journal={BMC Research Notes}, publisher={Springer Nature}, author={Pang, Bin and Zhao, Nan and Becchi, Michela and Korkin, Dmitry and Shyu, Chi-Ren}, year={2012}, pages={116} } @article{becchi_crowley_2008, title={Dynamic Thread Assignment on Heterogeneous Multiprocessor Architectures}, volume={10}, journal={The Journal of Instruction-Level Parallelism (JILP)}, author={Becchi, M. and Crowley, P.}, year={2008}, month={Jun} }