@article{schabel_franzon_2018,
  title={Exploring the Tradeoffs of Application-Specific Processing},
  volume={8},
  ISSN={2156-3357},
  DOI={10.1109/JETCAS.2018.2849939},
  abstractNote={Non-traditional processing schemes continue to grow in popularity as a means to achieve high performance with greater energy efficiency. Data-centric processing is one such scheme that targets functional specialization and memory bandwidth limitations, opening up small processors to wide memory IO. These function-specific accelerators prove to be an essential component for achieving energy efficiency and performance, but purely application-specific integrated circuit accelerators have expensive design overheads with limited reusability. We propose an architecture that combines existing processing schemes, utilizing CGRAs for dynamic data path configuration, as a means to add flexibility and reusability to data-centric acceleration. While flexibility adds a large energy overhead, performance can be regained through intelligent mappings of the functions of interest to the CGRA, while reusability can be gained by incrementally adding general-purpose functionality to the processing elements. Building upon previous work accelerating sparse-encoded neural networks, we present a CGRA architecture for mapping functional accelerators operating at 500 MHz in 32 nm. This architecture achieves a latency-per-function within $2\times$ of its function-specific counterparts, with energy-per-operation increases between 21--188$\times$ and energy-per-area increases between 1.8--3.6$\times$.},
  number={3},
  journal={IEEE Journal on Emerging and Selected Topics in Circuits and Systems},
  author={Schabel, Joshua C. and Franzon, Paul D.},
  year={2018},
  month={Sep},
  pages={531--542}
}

@inproceedings{srinivasan_chowdhury_forbes_widialaksono_zhang_schabel_ku_lipa_rotenberg_davis_etal_2017,
  title={H3 (Heterogeneity in 3D): A logic-on-logic 3D-stacked heterogeneous multi-core processor},
  DOI={10.1109/ICCD.2017.30},
  abstractNote={A single-ISA heterogeneous multi-core processor (HMP) [2], [7] comprises multiple core types that all implement the same instruction-set architecture (ISA) but have different microarchitectures. Performance and energy are optimized by migrating a thread's execution among core types as its characteristics change. Simulation-based studies with two core types, one simple (low power) and the other complex (high performance), have shown that being able to switch cores as frequently as once every 1,000 instructions increases energy savings by 50% compared to switching cores once every 10,000 instructions, for the same target performance [10]. These promising results rely on extremely low latencies for thread migration. Here we present the H3 chip, which uses 3D die stacking and a novel microarchitecture to implement a heterogeneous multi-core processor (HMP) with low-latency fast thread migration capabilities. We discuss details of the H3 design and present power and performance results from running various benchmarks on the chip. The H3 prototype can reduce power consumption of benchmarks by up to 26%.},
  booktitle={2017 IEEE International Conference on Computer Design (ICCD)},
  author={Srinivasan, V. and Chowdhury, R. B. R. and Forbes, E. and Widialaksono, R. and Zhang, Z. Q. and Schabel, J. and Ku, S. and Lipa, S. and Rotenberg, E. and Davis, W. R. and others},
  year={2017},
  pages={145--152}
}

@inproceedings{schabel_baker_dey_li_franzon,
  title={Processor-in-memory support for artificial neural networks},
  booktitle={2016 IEEE International Conference on Rebooting Computing (ICRC)},
  year={2016},
  author={Schabel, J. and Baker, L. and Dey, S. and Li, W. F. and Franzon, P. D.}
}