% Conference paper in the ICPADS 2020 proceedings, hence inproceedings with
% the proceedings title in booktitle. Acronyms in title and booktitle are
% brace-protected so sentence-casing styles keep their capitalisation.
@inproceedings{nourian_zarch_becchi_2020,
  author    = {Nourian, Marziyeh and Zarch, Mostafa Eghbali and Becchi, Michela},
  title     = {Optimizing Complex {OpenCL} Code for {FPGA}: A Case Study on Finite Automata Traversal},
  booktitle = {2020 {IEEE} 26th International Conference on Parallel and Distributed Systems ({ICPADS})},
  pages     = {518--527},
  year      = {2020},
  issn      = {1521-9097},
  doi       = {10.1109/ICPADS51040.2020.00073},
  abstract  = {While FPGAs have been traditionally considered hard to program, recently there have been efforts aimed to allow the use of high-level programming models and libraries intended for multi-core CPUs and GPUs to program FPGAs.
    For example, both Intel and Xilinx are now providing toolchains to deploy OpenCL code onto FPGA.
    However, because the nature of the parallelism offered by GPU and FPGA devices is fundamentally different, OpenCL code optimized for GPU can prove very inefficient on FPGA, in terms of both performance and hardware resource utilization.
    This paper explores this problem on finite automata traversal.
    In particular, we consider an OpenCL NFA traversal kernel optimized for GPU but exhibiting FPGA-friendly characteristics, namely: limited memory requirements, lack of synchronization, and SIMD execution.
    We explore a set of structural code changes, custom and best-practice optimizations to retarget this code to FPGA.
    We showcase the effect of these optimizations on an Intel Stratix V FPGA board using various NFA topologies from different application domains.
    Our evaluation shows that, while the resource requirements of the original code exceed the capacity of the FPGA in use, our optimizations lead to significant resource savings and allow the transformed code to fit the FPGA for all considered NFA topologies.
    In addition, our optimizations lead to speedups up to 4x over an already optimized code-variant aimed to fit the NFA traversal kernel on FPGA.
    Some of the proposed optimizations can be generalized for other applications and introduced in OpenCL-to-FPGA compiler.},
}

% Journal article: volume 68, issue 8. The month uses the predefined aug
% macro (unbraced) so styles can localise or abbreviate it.
@article{roy_srivastava_grimm_nourian_becchi_aluru_2019,
  author   = {Roy, Indranil and Srivastava, Ankit and Grimm, Matt and Nourian, Marziyeh and Becchi, Michela and Aluru, Srinivas},
  title    = {Evaluating High Performance Pattern Matching on the {Automata} {Processor}},
  journal  = {{IEEE} Transactions on Computers},
  volume   = {68},
  number   = {8},
  pages    = {1201--1212},
  year     = {2019},
  month    = aug,
  issn     = {1557-9956},
  doi      = {10.1109/TC.2019.2901466},
  abstract = {In this paper, we study the acceleration of applications that identify all the occurrences of thousands of string-patterns in an input data-stream using the Automata Processor (AP).
    For this evaluation, we use two applications from two fields, namely, cybersecurity and bioinformatics.
    The first application, called Fast-SNAP, scans network data for 4312 signatures of intrusion derived from the popular open-source Snort database.
    Using the resources of a single AP-board, Fast-SNAP can scan for all these signatures at 1 Gbps.
    The second application, called PROTOMATA, looks for all the occurrences of 1,309 motifs from the PROSITE database in protein sequences.
    PROTOMATA is up to 68 times faster than the state-of-the-art CPU implementation.
    As a comparison, we emulate the execution of the same NFAs by programming FPGAs using state-of-the-art techniques.
    We find that the performance derived by using the resources of a single AP-board, which houses 32 AP-chips, is comparable to that of the resources of five to six large FPGAs.
    The design techniques used in this paper are generic and may be applicable to the development of similar applications on the AP.},
}

% Conference paper in the ICPADS 2018 proceedings.
% NOTE(review): the DOI suffix 00073 is identical to the 2020 ICPADS entry
% in this file, which may be a copy error -- confirm against the publisher
% record before relying on it.
@inproceedings{nourian_wu_becchi_2018,
  author    = {Nourian, Marziyeh and Wu, Hancheng and Becchi, Michela},
  title     = {A Compiler Framework for Fixed-topology Non-deterministic Finite Automata on {SIMD} Platforms},
  booktitle = {2018 {IEEE} 24th International Conference on Parallel and Distributed Systems ({ICPADS} 2018)},
  pages     = {507--516},
  year      = {2018},
  issn      = {1521-9097},
  doi       = {10.1109/ICPADS.2018.00073},
}