@article{srinivasan_chowdhury_rotenberg_2020, title={Slipstream Processors Revisited: Exploiting Branch Sets}, ISSN={["0884-7495"]}, DOI={10.1109/ISCA45697.2020.00020}, abstractNote={Delinquent branches and loads remain key performance limiters in some applications. One approach to mitigate them is pre-execution. Broadly, there are two classes of pre-execution: one class repeatedly forks small helper threads, each targeting an individual dynamic instance of a delinquent branch or load; the other class begins with two redundant threads in a leader-follower arrangement, and speculatively reduces the leading thread. The objective of this paper is to design a new pre-execution microarchitecture that meets four criteria: (i) retains the simpler coordination of a leader-follower microarchitecture, (ii) is fully automated with just hardware, (iii) targets both branches and loads, (iv) and is effective. We review prior preexecution proposals and show that none of them meet all four criteria. We develop Slipstream 2.0 to meet all four criteria. The key innovation in the space of leader-follower architectures is to remove the forward control-flow slices of delinquent branches and loads, from the leading thread. This innovation overcomes key limitations in the only other hardware-only leader-follower prior works: Slipstream and Dual Core Execution (DCE). Slipstream removes backward slices of confident branches to pre-execute unconfident branches, which is ineffective in phases dominated by unconfident branches when branch pre-execution is most needed. DCE is very effective at tolerating cache-missed loads, unless their dependent branches are mispredicted. Removing forward control-flow slices of delinquent branches and delinquent loads enables two firsts, respectively: (1) leader-follower-style branch pre-execution without relying on confident instruction removal, and (2) tolerance of cache-missed loads that feed mispredicted branches. For SPEC 2006/2017 SimPoints wherein Slipstream 2.0 is auto-enabled, it achieves geomean speedups of 67%, 60%, and 12%, over baseline (one core), Slipstream, and DCE.}, journal={2020 ACM/IEEE 47TH ANNUAL INTERNATIONAL SYMPOSIUM ON COMPUTER ARCHITECTURE (ISCA 2020)}, author={Srinivasan, Vinesh and Chowdhury, Rangeen Basu Roy and Rotenberg, Eric}, year={2020}, pages={105–117} } @inproceedings{srinivasan_chowdhury_forbes_widialaksono_zhang_schabel_ku_lipa_rotenberg_davis_et al._2017, title={H3 (heterogeneity in 3D): A logic-on-logic 3D-stacked heterogeneous multi-core processor}, DOI={10.1109/ICCD.2017.30}, abstractNote={A single-ISA heterogeneous multi-core processor(HMP) [2], [7] is comprised of multiple core types that all implement the same instruction-set architecture (ISA) but have different microarchitectures. Performance and energy is optimized by migrating a thread's execution among core types as its characteristics change. Simulation-based studies with two core types, one simple (low power) and the other complex (high performance), has shown that being able to switch cores as frequently as once every 1,000 instructions increases energy savings by 50% compared to switching cores once every 10,000 instructions, for the same target performance [10]. These promising results rely on extremely low latencies for thread migration. Here we present the H3 chip that uses 3D die stacking and novel microarchitecture to implement a heterogeneous multi-core processor (HMP) with low-latency fast thread migration capabilities. We discuss details of the H3 design and present power and performance results from running various benchmarks on the chip. The H3 prototype can reduce power consumption of benchmarks by up to 26%.}, booktitle={2017 IEEE International Conference on Computer Design (ICCD)}, author={Srinivasan, V. and Chowdhury, R. B. R. and Forbes, E. and Widialaksono, R. and Zhang, Z. Q. and Schabel, J. and Ku, S. and Lipa, S. and Rotenberg, E. and Davis, W. R. and et al.}, year={2017}, pages={145–152} }