@article{franzon_davis_rotenberg_stevens_lipa_nigussie_pan_baker_schabel_dey_etal_2021, title={Design for 3D Stacked Circuits}, ISSN={2380-9248}, DOI={10.1109/IEDM19574.2021.9720553}, abstractNote={2.5D and 3D technologies can give rise to a node equivalent of scaling due to improved connectivity. Aggressive exploitation scenarios include functional partitioning, circuit partitioning, logic on DRAM, design obfuscation and modular chiplets. Design issues that need to be addressed in pursuing such exploitations include thermal management, design for test and computer aided design.}, journal={2021 IEEE INTERNATIONAL ELECTRON DEVICES MEETING (IEDM)}, author={Franzon, P. and Davis, W. and Rotenberg, E. and Stevens, J. and Lipa, S. and Nigussie, T. and Pan, H. and Baker, L. and Schabel, J. and Dey, S. and others}, year={2021} } @article{kumar_chaudhary_bhawalkar_mathur_jain_vastrad_rotenberg_2020, title={Post-Silicon Microarchitecture}, volume={19}, ISSN={1556-6064}, DOI={10.1109/LCA.2020.2978841}, abstractNote={Microprocessors are designed to provide good general performance across a range of benchmarks. As such, microarchitectural techniques which provide good speedup for only a small subset of applications are not attractive when designing a general-purpose core. We propose coupling a reconfigurable fabric with the CPU, on the same chip, via a simple and flexible interface to allow post-silicon development of application-specific microarchitectures. 
The interface supports observation and intervention at key pipeline stages of the CPU, so that exotic microarchitecture designs (with potentially narrow applicability) can be synthesized in the reconfigurable fabric and seem like components that were hardened into the core.}, number={1}, journal={IEEE COMPUTER ARCHITECTURE LETTERS}, author={Kumar, Chanchal and Chaudhary, Aayush and Bhawalkar, Shubham and Mathur, Utkarsh and Jain, Saransh and Vastrad, Adith and Rotenberg, Eric}, year={2020}, pages={26–29} } @article{srinivasan_chowdhury_rotenberg_2020, title={Slipstream Processors Revisited: Exploiting Branch Sets}, ISSN={["0884-7495"]}, DOI={10.1109/ISCA45697.2020.00020}, abstractNote={Delinquent branches and loads remain key performance limiters in some applications. One approach to mitigate them is pre-execution. Broadly, there are two classes of pre-execution: one class repeatedly forks small helper threads, each targeting an individual dynamic instance of a delinquent branch or load; the other class begins with two redundant threads in a leader-follower arrangement, and speculatively reduces the leading thread. The objective of this paper is to design a new pre-execution microarchitecture that meets four criteria: (i) retains the simpler coordination of a leader-follower microarchitecture, (ii) is fully automated with just hardware, (iii) targets both branches and loads, (iv) and is effective. We review prior preexecution proposals and show that none of them meet all four criteria. We develop Slipstream 2.0 to meet all four criteria. The key innovation in the space of leader-follower architectures is to remove the forward control-flow slices of delinquent branches and loads, from the leading thread. This innovation overcomes key limitations in the only other hardware-only leader-follower prior works: Slipstream and Dual Core Execution (DCE). 
Slipstream removes backward slices of confident branches to pre-execute unconfident branches, which is ineffective in phases dominated by unconfident branches when branch pre-execution is most needed. DCE is very effective at tolerating cache-missed loads, unless their dependent branches are mispredicted. Removing forward control-flow slices of delinquent branches and delinquent loads enables two firsts, respectively: (1) leader-follower-style branch pre-execution without relying on confident instruction removal, and (2) tolerance of cache-missed loads that feed mispredicted branches. For SPEC 2006/2017 SimPoints wherein Slipstream 2.0 is auto-enabled, it achieves geomean speedups of 67%, 60%, and 12%, over baseline (one core), Slipstream, and DCE.}, journal={2020 ACM/IEEE 47TH ANNUAL INTERNATIONAL SYMPOSIUM ON COMPUTER ARCHITECTURE (ISCA 2020)}, author={Srinivasan, Vinesh and Chowdhury, Rangeen Basu Roy and Rotenberg, Eric}, year={2020}, pages={105–117} } @inproceedings{ku_forbes_chowdhury_rotenberg_2017, title={A case for standard-cell based RAMs in highly-ported superscalar processor structures}, DOI={10.1109/isqed.2017.7918305}, abstractNote={Highly-ported memories are pervasive within superscalar processors. Accordingly, they have been targets for full-custom design using multi-ported versions of the 6T SRAM bitcell. Unfortunately, full-custom design of highly-ported memories is becoming exceedingly difficult in deep sub-micron technologies. This paper makes the case for implementing highly-ported memories with standard cells (flip-flops, muxes, clock buffers). In lieu of exotic peripheral circuits for each port, standard-cell SRAMs use muxes. Consequently, area differences between full-custom and standard-cell designs are greatly reduced at a high number of ports. 
To also compete with full-custom memories in terms of timing and power, we introduce a standard-cell memory compiler with three key features: (i) per-row clock gating, (ii) a new tri-state based mux standard cell, and (iii) a modular layout strategy, which is the centerpiece of the memory compiler. For a 16-read/8-write 128-entry register file, our modular standard-cell memory consumes 13% more area and 4% more power, and is 35% faster, than the custom memory produced by FabMem. The automatic (built-in) robustness of standard cell designs further weigh in their favor, contrasted with exquisite transistor sizing/tuning of intertwined sub-circuits in a full-custom design.}, booktitle={Proceedings of the eighteenth international symposium on quality electronic design (isqed)}, author={Ku, S. and Forbes, E. and Chowdhury, R. B. R. and Rotenberg, E.}, year={2017}, pages={131–137} } @inproceedings{srinivasan_chowdhury_forbes_widialaksono_zhang_schabel_ku_lipa_rotenberg_davis_etal_2017, title={H3 (heterogeneity in 3D): A logic-on-logic 3D-stacked heterogeneous multi-core processor}, DOI={10.1109/ICCD.2017.30}, abstractNote={A single-ISA heterogeneous multi-core processor(HMP) [2], [7] is comprised of multiple core types that all implement the same instruction-set architecture (ISA) but have different microarchitectures. Performance and energy is optimized by migrating a thread's execution among core types as its characteristics change. Simulation-based studies with two core types, one simple (low power) and the other complex (high performance), has shown that being able to switch cores as frequently as once every 1,000 instructions increases energy savings by 50% compared to switching cores once every 10,000 instructions, for the same target performance [10]. These promising results rely on extremely low latencies for thread migration. 
Here we present the H3 chip that uses 3D die stacking and novel microarchitecture to implement a heterogeneous multi-core processor (HMP) with low-latency fast thread migration capabilities. We discuss details of the H3 design and present power and performance results from running various benchmarks on the chip. The H3 prototype can reduce power consumption of benchmarks by up to 26%.}, booktitle={2017 IEEE International Conference on Computer Design (ICCD)}, author={Srinivasan, V. and Chowdhury, R. B. R. and Forbes, E. and Widialaksono, R. and Zhang, Z. Q. and Schabel, J. and Ku, S. and Lipa, S. and Rotenberg, E. and Davis, W. R. and others}, year={2017}, pages={145–152} } @inproceedings{chowdhury_kannepalli_rotenberg_2016, title={AnyCore-1: A comprehensively adaptive 4-way superscalar processor}, DOI={10.1109/hotchips.2016.7936237}, abstractNote={This article consists only of a collection of slides from the author's conference presentation.}, booktitle={2016 ieee hot chips 28 symposium (hcs)}, author={Chowdhury, R. B. R. and Kannepalli, A. K. and Rotenberg, E.}, year={2016} } @inproceedings{chowdhury_kannepalli_ku_rotenberg_2016, title={AnyCore: A synthesizable RTL model for exploring and fabricating adaptive superscalar cores}, DOI={10.1109/ispass.2016.7482096}, abstractNote={Adaptive superscalar cores have the ability to dynamically adjust their execution resources to match the instruction-level parallelism (ILP) of different program phases. The goal of adaptivity is to maximize performance in as energy-efficient a manner as possible. This is achieved by disabling execution resources that contribute only marginally to performance for the code at hand. Researchers have proposed many adaptive features, including structures, superscalar width, and pipeline depth. The benefits of adaptivity are eroded by its circuit-level overheads. Unfortunately, circuit-level overheads cannot be effectively estimated or appreciated without a hardware design. 
To this end, we developed a register-transfer-level (RTL) design of a highly adaptive superscalar core, called AnyCore. AnyCore can be used to quantify logic overheads of an adaptive core with respect to fixed cores, synthesize and compare different adaptive cores, synthesize and compare an adaptive core to a multi-core comprised of multiple fixed core types, and fabricate adaptive superscalar cores. We provide examples of these use-cases.}, booktitle={2016 IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS)}, author={Chowdhury, R. B. R. and Kannepalli, A. K. and Ku, S. and Rotenberg, E.}, year={2016}, pages={214–224} } @inproceedings{forbes_rotenberg_2016, title={Fast register consolidation and migration for heterogeneous multi-core processors}, DOI={10.1109/iccd.2016.7753254}, abstractNote={Single-ISA heterogeneous multi-core processors have been demonstrated to improve the performance and efficiency of general-purpose workloads. However, these designs leave some performance on the table due to the common assumption that the cost of migrating a program from one core to another is high. This high cost is due to the reliance on the operating system for a migration via a context switch. Many programs exhibit very fine-grained changes in behavior. A high-cost thread migration requires infrequent migrations, as the migration penalty must be amortized. In this paper, we investigate the impact that thread migrations impose on single-ISA heterogeneous systems.}, booktitle={Proceedings of the 34th ieee international conference on computer design (iccd)}, author={Forbes, E. 
and Rotenberg, E.}, year={2016}, pages={1–8} } @inproceedings{widialaksono_basu_roy_chowdhury_zhang_schabel_lipa_rotenberg_rhett_davis_franzon_2016, title={Physical design of a 3D-stacked heterogeneous multi-core processor}, ISBN={9781509013999}, url={http://dx.doi.org/10.1109/3dic.2016.7970036}, DOI={10.1109/3dic.2016.7970036}, abstractNote={With the end of Dennard scaling, three dimensional stacking has emerged as a promising integration technique to improve microprocessor performance. In this paper we present a 3D-SIC physical design methodology for a multi-core processor using commercial off-the-shelf tools. We explain the various flows involved and present the lessons learned during the design process. The logic dies were fabricated with GlobalFoundries 130 nm process and were stacked using the Ziptronix face-to-face (F2F) bonding technology. We also present a comparative analysis which highlights the benefits of 3D integration. Results indicate an order of magnitude decrease in wirelengths for critical inter-core components in the 3D implementation compared to 2D implementations.}, booktitle={2016 IEEE International 3D Systems Integration Conference (3DIC)}, publisher={IEEE}, author={Widialaksono, Randy and Basu Roy Chowdhury, Rangeen and Zhang, Zhenqian and Schabel, Joshua and Lipa, Steve and Rotenberg, Eric and Rhett Davis, W. and Franzon, Paul}, year={2016}, month={Nov} } @article{sheikh_tuck_rotenberg_2015, title={Control-Flow Decoupling: An Approach for Timely, Non-Speculative Branching}, volume={64}, ISSN={1557-9956}, DOI={10.1109/tc.2014.2361526}, abstractNote={Mobile and PC/server class processor companies continue to roll out flagship core microarchitectures that are faster than their predecessors. Meanwhile placing more cores on a chip coupled with constant supply voltage puts per-core energy consumption at a premium. 
Hence, the challenge is to find future microarchitecture optimizations that not only increase performance but also conserve energy. Eliminating branch mispredictions-which waste both time and energy-is valuable in this respect. In this paper, we explore the control-flow landscape by characterizing mispredictions in four benchmark suites. We find that a third of mispredictions-per-1K-instructions (MPKI) come from what we call separable branches: branches with large control-dependent regions (not suitable for if-conversion), whose backward slices do not depend on their control-dependent instructions or have only a short dependence. We propose control-flow decoupling (CFD) to eradicate mispredictions of separable branches. The idea is to separate the loop containing the branch into two loops: the first contains only the branch's predicate computation and the second contains the branch and its control-dependent instructions. The first loop communicates branch outcomes to the second loop through an architectural queue. Microarchitecturally, the queue resides in the fetch unit to drive timely, non-speculative branching. On a microarchitecture configured similar to Intel's Sandy Bridge core, CFD increases performance by up to 55 percent, and reduces energy consumption by up to 49 percent (for CFD regions). 
Moreover, for some applications, CFD is a necessary catalyst for future complexity-effective large-window architectures to tolerate memory latency.}, number={8}, journal={IEEE TRANSACTIONS ON COMPUTERS}, author={Sheikh, Rami and Tuck, James and Rotenberg, Eric}, year={2015}, month={Aug}, pages={2182–2203} } @inproceedings{nakabayashi_sugiyama_sasaki_rotenberg_kondo_2014, title={Co-simulation framework for streamlining microprocessor development on standard ASIC design flow}, DOI={10.1109/aspdac.2014.6742924}, abstractNote={In this paper, we present a practical processor co-simulation framework for not only RTL simulation but also gate/transistor level simulation, and even chip evaluation with an LSI tester. Our framework includes an off-chip system call emulation mechanism, which handles system calls to evaluate and verify the processor design with general benchmark programs without pseudo-circuits in the processor design. Therefore, our framework can be consistently used from RTL design to chip fabrication. We also propose a checkpoint mechanism that resumes a program from a pre-created checkpoint. This mechanism is not affected by the non-deterministic problem on a multi-core processor. Moreover, we propose a cache warming mechanism when resuming from a checkpoint.}, booktitle={2014 19th asia and south pacific design automation conference (asp-dac)}, author={Nakabayashi, T. and Sugiyama, T. and Sasaki, T. and Rotenberg, E. and Kondo, T.}, year={2014}, pages={400–405} } @inproceedings{forbes_choudhary_dwiel_rotenberg_2014, title={Design-effort alloy: Boosting a highly tuned primary core with untuned alternate cores}, ISBN={9781479964925}, url={http://dx.doi.org/10.1109/iccd.2014.6974713}, DOI={10.1109/iccd.2014.6974713}, abstractNote={A commercial flagship superscalar core is a highly tuned machine. Designers spend significant effort to tune the register-transfer-level (RTL) model, circuits, and layout to optimize performance and power. 
Nonetheless, the one-size-fits-all microarchitecture still suffers from suboptimal performance and power on individual applications. A single-ISA heterogeneous multi-core, with its multiple diverse core designs, has potential to exploit application diversity. However, tuning multiple core types will incur insurmountable design effort. This paper proposes a new class of single-ISA heterogeneous multi-core processor, called design-effort alloy (DEA). Only one of the core types, called the high-effort core (HEC), is tuned using a high-effort design flow. Much less effort is spent on tuning other core types, called low-effort cores (LECs). We begin with synthesizable RTL designs of a palette of out-of-order superscalar core types. A LEC and HEC is designed for each core type: the LEC is based on design automation and the HEC is derived from its LEC counterpart, using frequency and energy scaling factors that account for RTL, circuit, and layout optimizations. The resulting HECs have more than a 2x frequency advantage with only a 1.3× increase in energy consumption compared to their corresponding LECs. From the palette of core types, we find the best 4-core-type DEA processor for 179 SPEC SimPoints (program phases). Our study yielded the following key results: 1) The DEA processor's HEC is the same core type in the best high-effort homogeneous multi-core, owing to most program phases demonstrating “average” instruction-level behavior and favoring this balanced core. 2) The DEA processor yields a speedup in BIPS3/W of 1%-87%, and a geometric-mean speedup of 25%, on 20 out of 179 SimPoints over the best high-effort homogeneous multi-core. Thus, untuned LECs operating at less than half the frequency of the HEC nonetheless accelerate program phases with “outlier” instruction-level behavior.}, booktitle={2014 IEEE 32nd International Conference on Computer Design (ICCD)}, publisher={IEEE}, author={Forbes, Elliott and Choudhary, Niket K. and Dwiel, Brandon H. 
and Rotenberg, Eric}, year={2014}, month={Oct} } @inproceedings{navada_choudhary_wadhavkar_rotenberg_2013, title={A Unified View of Non-monotonic Core Selection and Application Steering in Heterogeneous Chip Multiprocessors}, booktitle={Proceedings of the 22nd IEEE/ACM International Conference on Parallel Architectures and Compilation Techniques (PACT-22)}, author={Navada, S. and Choudhary, N.K. and Wadhavkar, S.V. and Rotenberg, E.}, year={2013}, month={Sep}, pages={133–144} } @inproceedings{tshibangu_franzon_rotenberg_davis_2013, title={Design of controller for L2 cache mapped in Tezzaron stacked DRAM}, DOI={10.1109/3dic.2013.6702397}, abstractNote={3DIC technology allows implementation of fast and dense memory by allowing multiple layers of DRAM to be fabricated in a single die called Die-stacking technology. This creates opportunity to explore usage of DRAM as fast last level cache by exploiting mapping of data and tag in the same bank. This Paper investigates the implementation of such a cache controller using 3-layer 256 MB Tezzaron Octopus stacked DRAM. This memory provides a fast data access through burst-4 and burst-8 mode. To avoid multiple row activation, the entire set is confined in one row of 2KB. For a 64B cache block, 32 lines of data can be obtained in one row. In this design, only two cache blocks are used for tag while 30 blocks are used for data yielding a 30-way set associative L2 cache. Given the performance of Tezzaron memory, a low hit time of approximately 20 cycles was achieved. This hit latency includes precharge and row activation delays. This access latency was used in Gem5 full-system simulator to estimate the performance compared to a standard 2D SRAM L2 cache. An average of 15% on performance is achieved on different benchmarks while providing an average 27% on energy saving.}, booktitle={2013 IEEE International 3D Systems Integration Conference (3DIC)}, author={Tshibangu, N. M. and Franzon, P. D. and Rotenberg, E. and Davis, W. 
R.}, year={2013}, month={Oct} } @inproceedings{priyadarshi_choudhary_dwiel_upreti_rotenberg_davis_franzon_2013, title={Hetero(2) 3d integration: A scheme for optimizing efficiency/cost of chip multiprocessors}, DOI={10.1109/isqed.2013.6523582}, abstractNote={Timing the transition of a processor design to a new technology poses a provocative tradeoff. On the one hand, transitioning as early as possible offers a significant competitive advantage, by bringing improved designs to market early. On the other hand, an aggressive strategy may prove to be unprofitable, due to the low manufacturing yield of a technology that has not had time to mature. We propose exploiting two complementary forms of heterogeneity to profitably exploit an immature technology for Chip Multiprocessors (CMP). First, 3D integration facilitates a technology alloy. The CMP is split across two dies, one fabricated in the old technology and the other in the new technology. The alloy derives benefit from the new technology while limiting cost exposure. Second, to compensate for lower efficiency of old-technology cores, we exploit application and microarchitectural heterogeneity: applications which gain less from technology scaling are scheduled on old-technology cores, moreover, these cores are retuned to optimize this class of application. For a defect density ratio of 200 between 45nm and 65nm, Hetero2 3D gives 3.6× and 1.5× higher efficiency/cost compared to 2D and 3D homogeneous implementations, respectively, with only 6.5% degradation in efficiency. We also present a sensitivity analysis by sweeping the defect density ratio. The analysis reveals the defect density break-even points, where homogeneous 2D and 3D designs in 45nm achieve the same efficiency/cost as Hetero2 3D, marking significant points in the maturing of the technology.}, booktitle={Proceedings of the fourteenth international symposium on quality electronic design (ISQED 2013)}, author={Priyadarshi, S. and Choudhary, N. 
and Dwiel, B. and Upreti, A. and Rotenberg, E. and Davis, R. and Franzon, P.}, year={2013}, pages={1–7} } @inproceedings{rotenberg_dwiel_forbes_zhang_widialaksono_chowdhury_tshibangu_lipa_davis_franzon_etal_2013, title={Rationale for a 3D heterogeneous multi-core processor}, ISBN={9781479929870}, url={http://dx.doi.org/10.1109/iccd.2013.6657038}, DOI={10.1109/iccd.2013.6657038}, abstractNote={Single-ISA heterogeneous multi-core processors are comprised of multiple core types that are functionally equivalent but microarchitecturally diverse. This paradigm has gained a lot of attention as a way to optimize performance and energy. As the instruction-level behavior of the currently executing program varies, it is migrated to the most efficient core type for that behavior.}, booktitle={2013 IEEE 31st International Conference on Computer Design (ICCD)}, publisher={IEEE}, author={Rotenberg, Eric and Dwiel, Brandon H. and Forbes, Elliott and Zhang, Zhenqian and Widialaksono, Randy and Chowdhury, Rangeen Basu Roy and Tshibangu, Nyunyi and Lipa, Steve and Davis, W. Rhett and Franzon, Paul and others}, year={2013}, month={Oct}, pages={154–168} } @inproceedings{choudhary_dwiel_rotenberg_2012, title={A physical design study of fabscalar-generated superscalar cores}, ISBN={9781467326582}, url={http://dx.doi.org/10.1109/vlsi-soc.2012.7332095}, DOI={10.1109/vlsi-soc.2012.7332095}, abstractNote={FabScalar is a recently published tool for automatically generating superscalar cores, of different pipeline widths, depths and sizes. The output of FabScalar is a synthesizable register-transfer-level (RTL) description of the desired core. While this capability makes sophisticated cores more accessible to designers and researchers, meaningful applications require reducing RTL descriptions to physical designs. 
This paper presents the first systematic physical design study of FabScalar-generated superscalar cores.}, booktitle={2012 IEEE/IFIP 20th International Conference on VLSI and System-on-Chip (VLSI-SoC)}, publisher={IEEE}, author={Choudhary, Niket K. and Dwiel, Brandon H. and Rotenberg, Eric}, year={2012}, month={Oct} } @article{sheikh_tuck_rotenberg_2012, title={Control-Flow Decoupling}, ISBN={["978-1-4673-4819-5"]}, ISSN={["1072-4451"]}, DOI={10.1109/micro.2012.38}, abstractNote={Mobile and PC/server class processor companies continue to roll out flagship core micro architectures that are faster than their predecessors. Meanwhile placing more cores on a chip coupled with constant supply voltage puts per-core energy consumption at a premium. Hence, the challenge is to find future micro architecture optimizations that not only increase performance but also conserve energy. Eliminating branch mispredictions -- which waste both time and energy -- is valuable in this respect. We first explore the control-flow landscape by characterizing mispredictions in four benchmark suites. We find that a third of mispredictions-per-1K-instructions (MPKI) come from what we call separable branches: branches with large control-dependent regions (not suitable for if-conversion), whose backward slices do not depend on their control-dependent instructions or have only a short dependence. We propose control-flow decoupling (CFD) to eradicate mispredictions of separable branches. The idea is to separate the loop containing the branch into two loops: the first contains only the branch's predicate computation and the second contains the branch and its control-dependent instructions. The first loop communicates branch outcomes to the second loop through an architectural queue. Micro architecturally, the queue resides in the fetch unit to drive timely, non-speculative fetching or skipping of successive dynamic instances of the control-dependent region. 
Either the programmer or compiler can transform a loop for CFD, and we evaluate both. On a micro architecture configured similar to Intel's Sandy Bridge core, CFD increases performance by up to 43%, and reduces energy consumption by up to 41%. Moreover, for some applications, CFD is a necessary catalyst for future complexity-effective large-window architectures to tolerate memory latency.}, journal={2012 IEEE/ACM 45TH INTERNATIONAL SYMPOSIUM ON MICROARCHITECTURE (MICRO-45)}, author={Sheikh, Rami and Tuck, James and Rotenberg, Eric}, year={2012}, pages={329–340} } @article{choudhary_wadhavkar_shah_mayukh_gandhi_dwiel_navada_najaf-abadi_rotenberg_2012, title={FABSCALAR: AUTOMATING SUPERSCALAR CORE DESIGN}, volume={32}, ISSN={["1937-4143"]}, DOI={10.1109/mm.2012.23}, abstractNote={Providing multiple superscalar core types on a chip, each tailored to different classes of instruction-level behavior, is an exciting direction for increasing processor performance and energy efficiency. Unfortunately, processor design and verification effort increases with each additional core type, limiting the microarchitectural diversity that can be practically implemented. FabScalar aims to automate superscalar core design, opening up processor design to microarchitectural diversity and its many opportunities.}, number={3}, journal={IEEE MICRO}, author={Choudhary, Niket K. and Wadhavkar, Salil V. and Shah, Tanmay A. and Mayukh, Hiran and Gandhi, Jayneel and Dwiel, Brandon H. and Navada, Sandeep and Najaf-Abadi, Hashem H. and Rotenberg, Eric}, year={2012}, pages={48–59} } @inproceedings{dwiel_choudhary_rotenberg_2012, title={FPGA modeling of diverse superscalar processors}, ISBN={9781467311465 9781467311434 9781467311458}, url={http://dx.doi.org/10.1109/ispass.2012.6189225}, DOI={10.1109/ispass.2012.6189225}, abstractNote={There is increasing interest in using Field Programmable Gate Arrays (FPGAs) as platforms for computer architecture simulation. 
This paper is concerned with modeling superscalar processors with FPGAs. To be transformative, the FPGA modeling framework should meet three criteria. (1) Configurable: The framework should be able to model diverse superscalar processors, like a software model. In particular, it should be possible to vary superscalar parameters such as fetch, issue, and retire widths, depths of pipeline stages, queue sizes, etc. (2) Automatic: The framework should be able to automatically and efficiently map any one of its superscalar processor configurations to the FPGA. (3) Realistic: The framework should model a modern superscalar microarchitecture in detail, ideally with prototype quality, to enable a new era and depth of microarchitecture research. A framework that meets these three criteria will enjoy the convenience of a software model, the speed of an FPGA model, and the experience of a prototype. This paper describes FPGA-Sim, a configurable, automatically FPGA-synthesizable, and register-transfer-level (RTL) model of an out-of-order superscalar processor. FPGA-Sim enables FPGA modeling of diverse superscalar processors out-of-the-box. Moreover, its direct RTL implementation yields the fidelity of a hardware prototype.}, booktitle={2012 IEEE International Symposium on Performance Analysis of Systems & Software}, publisher={IEEE}, author={Dwiel, Brandon H. and Choudhary, Niket K. and Rotenberg, Eric}, year={2012}, month={Apr} } @inproceedings{nakabayashi_sasaki_rotenberg_ohno_kondo_2012, title={Research for Transporting Alpha ISA and Adopting Multi-processor to FabScalar}, booktitle={Proceedings of the Symposium on Advanced Computing Systems and Infrastructures 2012 (SACSIS 2012)}, author={Nakabayashi, T. and Sasaki, T. and Rotenberg, E. and Ohno, K. 
and Kondo, T.}, year={2012}, month={May}, pages={374–381} } @article{choudhary_wadhavkar_shah_mayukh_gandhi_dwiel_navada_najaf-abadi_rotenberg_2011, title={FabScalar: Composing synthesizable RTL designs of arbitrary cores within a canonical superscalar template}, DOI={10.1145/2000064.2000067}, abstractNote={A growing body of work has compiled a strong case for the single-ISA heterogeneous multi-core paradigm. A single-ISA heterogeneous multi-core provides multiple, differently-designed superscalar core types that can streamline the execution of diverse programs and program phases. No prior research has addressed the “Achilles' heel” of this paradigm: design and verification effort is multiplied by the number of different core types. This work frames superscalar processors in a canonical form, so that it becomes feasible to quickly design many cores that differ in the three major superscalar dimensions: superscalar width, pipeline depth, and sizes of structures for extracting instruction-level parallelism (ILP). From this idea, we develop a toolset, called FabScalar, for automatically composing the synthesizable register-transfer-level (RTL) designs of arbitrary cores within a canonical superscalar template. The template defines canonical pipeline stages and interfaces among them. A Canonical Pipeline Stage Library (CPSL) provides many implementations of each canonical pipeline stage, that differ in their superscalar width and depth of sub-pipelining. An RTL generation tool uses the template and CPSL to automatically generate an overall core of desired configuration. Validation experiments are performed along three fronts to evaluate the quality of RTL designs generated by FabScalar: functional and performance (instructions-per-cycle (IPC)) validation, timing validation (cycle time), and confirmation of suitability for standard ASIC flows. 
With FabScalar, a chip with many different superscalar core types is conceivable.}, journal={ISCA 2011: Proceedings of the 38th annual international symposium on computer architecture}, author={Choudhary, N. K. and Wadhavkar, S. V. and Shah, T. A. and Mayukh, H. and Gandhi, J. and Dwiel, B. H. and Navada, S. and Najaf-abadi, H. H. and Rotenberg, E.}, year={2011}, pages={11–22} } @article{navada_choudhary_rotenberg_2010, title={Criticality-driven Superscalar Design Space Exploration}, ISBN={["978-1-4503-0178-7"]}, DOI={10.1145/1854273.1854308}, abstractNote={It has become increasingly difficult to perform design space exploration (DSE) of computer systems with a short turnaround time because of exploding design spaces, increasing design complexity and long-running workloads. Researchers have used classical search/optimization techniques like simulated annealing, genetic algorithms, etc., to accelerate the DSE. While these techniques are better than an exhaustive search, a substantial amount of time must still be dedicated to DSE. This is a serious bottleneck in reducing research/development time. These techniques do not perform the DSE quickly enough, primarily because they do not leverage any insight as to how the different design parameters of a computer system interact to increase or degrade performance at a design point and treat the computer system as a “black-box”.}, journal={PACT 2010: PROCEEDINGS OF THE NINETEENTH INTERNATIONAL CONFERENCE ON PARALLEL ARCHITECTURES AND COMPILATION TECHNIQUES}, author={Navada, Sandeep and Choudhary, Niket K. 
and Rotenberg, Eric}, year={2010}, pages={261–272} } @inproceedings{al-otoom_forbes_rotenberg_2010, title={EXACT: Explicit Dynamic-Branch Prediction with Active Updates}, ISBN={9781450300445}, url={http://dx.doi.org/10.1145/1787275.1787321}, DOI={10.1145/1787275.1787321}, abstractNote={Branches that depend directly or indirectly on load instructions are a leading cause of mispredictions by state-of-the-art branch predictors. For a branch of this type, there is a unique dynamic instance of the branch for each unique combination of producer-load addresses. Based on this definition, a study of mispredictions reveals two related problems: (i) Global branch history often fails to distinguish between different dynamic branches. In this case, the predictor is unable to specialize predictions for different dynamic branches, causing mispredictions if their outcomes differ. Ideally, the remedy is to predict a dynamic branch using its program counter (PC) and the addresses of its producer loads, since this context uniquely identifies the dynamic branch. We call this context the identity, or ID, of the dynamic branch. In general, producer loads are unlikely to have generated their addresses when the dynamic branch is fetched. We show that the ID of a distant retired branch in the global branch stream combined with recent global branch history, is effective context for predicting the current branch. (ii) Fixing the first problem exposes another problem. A store to an address on which a dynamic branch depends may flip its outcome when it is next encountered. With conventional passive updates, the branch suffers a misprediction before the predictor is retrained. We propose that stores to the memory addresses on which a dynamic branch depends, directly update its prediction in the predictor. This novel "active update" concept avoids mispredictions that are otherwise incurred by conventional passive training. 
We highlight two practical features that enable large EXACT predictors: the prediction path is scalably pipelinable by virtue of its decoupled indexing strategy, and active updates are tolerant of 100s of cycles of latency making it ideal for virtualizing this component in the general-purpose memory hierarchy. We also present a compact form of the predictor that caches only dynamic instances of a static branch that differ from its overall bias.}, booktitle={Proceedings of the 7th ACM international conference on Computing frontiers - CF '10}, publisher={ACM Press}, author={Al-Otoom, Muawya and Forbes, Elliott and Rotenberg, Eric}, year={2010}, pages={165–176} } @inproceedings{najaf-abadi_rotenberg_2009, title={Architectural Contesting}, ISBN={9781424429325}, url={http://dx.doi.org/10.1109/hpca.2009.4798254}, DOI={10.1109/hpca.2009.4798254}, abstractNote={This paper presents results showing that workload behavior tends to vary considerably at granularities of less than a thousand instructions. If it were possible to adjust the microarchitecture to suit the workload behavior at such rates, significant single-thread performance enhancement would be achievable. However, previous techniques are too sluggish to be able to effectively respond to such fine-grain change.}, booktitle={2009 IEEE 15th International Symposium on High Performance Computer Architecture}, publisher={IEEE}, author={Najaf-abadi, Hashem H. and Rotenberg, Eric}, year={2009}, month={Feb} } @article{najaf-abadi_choudhary_rotenberg_2009, title={Core-Selectability in Chip Multiprocessors}, ISBN={["978-0-7695-3771-9"]}, ISSN={["1089-795X"]}, DOI={10.1109/pact.2009.44}, abstractNote={The centralized structures necessary for the extraction of instruction-level parallelism (ILP) are consuming progressively smaller portions of the total die area of chip multiprocessors (CMP). The reason for this is that scaling these structures does not enhance general performance as much as scaling the cache and interconnect. 
However, the fact that these structures now consume less proportional die area opens an avenue to enhancing their performance through truly overcoming the one-size-fits-all approach to their design. This paper proposes core-selectability – incorporating differently-designed cores that can be toggled into active employment. This enables differently customized ILP-extracting structures to be at hand in the system while not dramatically adding to the interconnect complexity. The design verification effort is minimized by separating the complexity of different core designs. Moreover, contrary to alternative approaches, the performance and power efficiency of the core designs are not compromised. Evaluation results are presented that show that, even when limiting the diversity between core designs to only the sizing of microarchitectural structures, core-selectability has the potential to provide notable performance enhancement (with an average of 10%) to scalable multithreaded applications, without increased concurrency. In addition, it can provide significantly greater throughput to multiprogrammed workloads by providing the potential for the system to transform into a heterogeneous design.}, journal={18TH INTERNATIONAL CONFERENCE ON PARALLEL ARCHITECTURES AND COMPILATION TECHNIQUES, PROCEEDINGS}, author={Najaf-abadi, Hashem H. and Choudhary, Niket K. and Rotenberg, Eric}, year={2009}, pages={113–122} } @article{najaf-abadi_rotenberg_2009, title={The Importance of Accurate Task Arrival Characterization in the Design of Processing Cores}, ISBN={["978-1-4244-5156-2"]}, DOI={10.1109/iiswc.2009.5306795}, abstractNote={This paper studies the importance of accounting for a neglected facet of overall workload behavior, the pattern of task arrival. A stochastic characterization is formulated that defines regularity in the task arrival pattern. 
This characterization is used as the basis for a quantitative evaluation of the importance of accurately accounting for the task arrival behavior in the design of the processing cores of a Chip Multi-processor (CMP).}, journal={PROCEEDINGS OF THE 2009 IEEE INTERNATIONAL SYMPOSIUM ON WORKLOAD CHARACTERIZATION}, author={Najaf-abadi, Hashem H. and Rotenberg, Eric}, year={2009}, pages={75–85} } @inproceedings{najaf-abadi_rotenberg_2008, title={Configurational Workload Characterization}, ISBN={9781424422326 9781424422333}, url={http://dx.doi.org/10.1109/ispass.2008.4510747}, DOI={10.1109/ispass.2008.4510747}, abstractNote={Although the best processor design for executing a specific workload does depend on the characteristics of the workload, it can not be determined without factoring-in the effect of the interdependencies between different architectural subcomponents. Consequently, workload characteristics alone do not provide accurate indication of which workloads can perform close-to-optimal on the same architectural configuration. The primary goal of this paper is to demonstrate that, in the design of a heterogeneous CMP, reducing the set of essential benchmarks based on relative similarity in raw workload behavior may direct the design process towards options that result in sub-optimality of the ultimate design. It is shown that the design parameters of the customized processor configurations, what we refer to as the configurational characteristics, can yield a more accurate indication of the best way to partition the workload space for the cores of a heterogeneous system to be customized to. In order to automate the extraction of the configurational- characteristics of workloads, a design exploration tool based on the Simplescalar timing simulator and the CACTI modeling tool is presented. 
Results from this tool are used to display how a systematic methodology can be employed to determine the optimal set of core configurations for a heterogeneous CMP under different design objectives. In addition, it is shown that reducing the set of workloads based on even a single widely documented benchmark similarity (between bzip and gzip) can lead to a slowdown in the overall performance of a heterogeneous-CMP design.}, booktitle={ISPASS 2008 - IEEE International Symposium on Performance Analysis of Systems and software}, publisher={IEEE}, author={Najaf-abadi, Hashem H. and Rotenberg, Eric}, year={2008}, month={Apr} } @inproceedings{reddy_rotenberg_2008, title={Coverage of a microarchitecture-level fault check regimen in a superscalar processor}, ISBN={9781424423972}, url={http://dx.doi.org/10.1109/dsn.2008.4630065}, DOI={10.1109/dsn.2008.4630065}, abstractNote={Conventional processor fault tolerance based on time/space redundancy is robust but prohibitively expensive for commodity processors. This paper explores an unconventional approach to designing a cost-effective fault-tolerant superscalar processor. The idea is to engage a regimen of microarchitecture-level fault checks. A few simple microarchitecture-level fault checks can detect many arbitrary faults in large units, by observing microarchitecture-level behavior and anomalies in this behavior. Previously, we separately proposed checks for the fetch and decode stages, rename stage, and issue stage of a contemporary superscalar processor. While each piece hinted at the possibility of a complete regimen - for an overall fault-tolerant superscalar processor - this totality was not explored. This paper provides the culmination by building a full regimen into a superscalar processor. We show for the first time that the regimen-based approach provides substantial coverage of an entire superscalar processor. 
Analysis reveals vulnerable areas which should be the focus for regimen additions.}, booktitle={2008 IEEE International Conference on Dependable Systems and Networks With FTCS and DCC (DSN)}, publisher={IEEE}, author={Reddy, Vimal and Rotenberg, Eric}, year={2008} } @inproceedings{reddy_rotenberg_2007, title={Inherent Time Redundancy (ITR): Using Program Repetition for Low-Overhead Fault Tolerance}, ISBN={0769528554}, url={http://dx.doi.org/10.1109/dsn.2007.59}, DOI={10.1109/dsn.2007.59}, abstractNote={A new approach is proposed that exploits repetition inherent in programs to provide low-overhead transient fault protection in a processor. Programs repeatedly execute the same instructions within close time periods. This can be viewed as a time redundant re-execution of a program, except that inputs to these inherent time redundant (ITR) instructions vary. Nevertheless, certain microarchitectural events in the processor are independent of the input and only depend on the program instructions. Such events can be recorded and confirmed when ITR instructions repeat. In this paper, we use ITR to detect transient faults in the fetch and decode units of a processor pipeline, avoiding costly approaches like structural duplication or explicit time redundant execution.}, booktitle={37th Annual IEEE/IFIP International Conference on Dependable Systems and Networks (DSN'07)}, publisher={IEEE}, author={Reddy, Vimal and Rotenberg, Eric}, year={2007}, month={Jun} } @inproceedings{al-zawawi_reddy_rotenberg_akkary_2007, title={Transparent control independence (TCI)}, ISBN={9781595937063}, url={http://dx.doi.org/10.1145/1250662.1250717}, DOI={10.1145/1250662.1250717}, abstractNote={Superscalar architectures have been proposed that exploit control independence, reducing the performance penalty of branch mispredictions by preserving the work of future misprediction-independent instructions. 
The essential goal of exploiting control independence is to completely decouple future misprediction-independent instructions from deferred misprediction-dependent instructions. Current implementations fall short of this goal because they explicitly maintain program order among misprediction-independent and misprediction-dependent instructions. Explicit approaches sacrifice design efficiency and ultimately performance. We observe it is sufficient to emulate program order. Potential misprediction-dependent instructions are singled out a priori and their unchanging source values are checkpointed. These instructions and values are set aside as a "recovery program". Checkpointed source values break the data dependencies with co-mingled misprediction-independent instructions - now long since gone from the pipeline - achieving the essential decoupling objective. When the mispredicted branch resolves, recovery is achieved by fetching the self-sufficient, condensed recovery program. Recovery is effectively transparent to the pipeline, in that speculative state is not rolled back and recovery appears as a jump to code. A coarse-grain retirement substrate permits the relaxed order between the decoupled programs. Transparent control independence (TCI) yields a highly streamlined pipeline that quickly recycles resources based on conventional speculation, enabling a large window with small cycle-critical resources, and prevents many mispredictions from disrupting this large window. TCI achieves speedups as high as 64% (16% average) and 88% (22% average) for 4-issue and 8-issue pipelines, respectively, among 15 SPEC integer benchmarks. Factors that limit the performance of explicitly ordered approaches are quantified.}, booktitle={Proceedings of the 34th annual international symposium on Computer architecture - ISCA '07}, publisher={ACM Press}, author={Al-Zawawi, Ahmed S. and Reddy, Vimal K. 
and Rotenberg, Eric and Akkary, Haitham H.}, year={2007} } @article{venkatesan_al-zawawi_sivasubramanian_rotenberg_2007, title={ZettaRAM: A power-scalable DRAM alternative through charge-voltage decoupling}, volume={56}, ISSN={["1557-9956"]}, DOI={10.1109/TC.2007.37}, abstractNote={ZettaRAMtrade is a nascent memory technology with roots in molecular electronics. It uses a conventional DRAM architecture except that the conventional capacitor is replaced with a new molecular capacitor. The molecular capacitor has a discrete threshold voltage, above which all molecules are charged and below which all molecules are discharged. Thus, while voltage still controls charging/discharging, the fixed charge deposited on the molecular capacitor is voltage-independent. Charge-voltage decoupling makes it possible to lower voltage from one memory generation to the next while still maintaining the minimum critical charge for reliable operation, whereas DRAM voltage scaling is constrained by charge. Voltage can be scaled inexpensively and reliably by engineering new, more favorable molecules. We analyze how three key molecule parameters influence voltage and then evaluate 23 molecules in the literature. Matching DRAM density and speed, the best molecule yields 61 percent energy savings. While the fixed charge is voltage-independent, speed is voltage-dependent. Thus, voltage is padded for competitive latency. We propose dynamically modulating the padding based on criticality of memory requests, further extending ZettaRAM's energy advantage with negligible system slowdown. Architectural management extends the best molecule's energy savings to 77 percent and extracts energy savings from six otherwise uncompetitive molecules}, number={2}, journal={IEEE TRANSACTIONS ON COMPUTERS}, author={Venkatesan, Ravi K. and Al-Zawawi, Ahmed S. 
and Sivasubramanian, Krishnan and Rotenberg, Eric}, year={2007}, month={Feb}, pages={147–160} } @inproceedings{reddy_al-zawawi_rotenberg_2006, title={Assertion-Based Microarchitecture Design for Improved Fault Tolerance}, ISBN={9780780397064 9780780397071}, ISSN={1063-6404}, url={http://dx.doi.org/10.1109/iccd.2006.4380842}, DOI={10.1109/iccd.2006.4380842}, abstractNote={Protection against transient faults is an important constraint in high-performance processor design. One strategy for achieving efficient reliability is to apply targeted fault checking/masking techniques to different units within an overall reliability regimen. In this spirit, we propose a novel class of targeted fault checks that verify the functioning of the microarchitecture itself, as opposed to the broader challenge of verifying overall architectural correctness of a running program. That is, the checks focus on verifying the mechanics of executing the program. Long term, discriminating between machinery and state may lead to highly efficient reliability solutions with high coverage. The key idea is to identify and exploit opportunities to assert microarchitectural "truths". We explore two examples, Register Name Authentication (RNA) for the rename unit and Timestamp-Based Assertion Checking (TAC) for the issue unit of a contemporary out-of-order superscalar processor. Thousands of fault injection experiments show that RNA and TAC microarchitectural assertions detect most unmasked faults for which they are designed.}, booktitle={2006 International Conference on Computer Design}, publisher={IEEE}, author={Reddy, Vimal K. and Al-Zawawi, Ahmed S. and Rotenberg, Eric}, year={2006}, month={Oct} } @article{seth_anantaraman_mueller_rotenberg_2006, title={FAST: Frequency-Aware Static Timing Analysis}, volume={5}, number={1}, journal={ACM Transactions on Embedded Computing Systems}, author={Seth, K. and Anantaraman, A. and Mueller, F. 
and Rotenberg, E.}, year={2006}, pages={200–224} } @article{anantaraman_rotenberg_2006, title={Non-uniform program analysis & repeatable execution constraints: Exploiting out-of-order processors in real-time systems}, volume={3}, DOI={10.1145/1279711.1279716}, abstractNote={The objective of this paper is to enable easy, tight, and safe timing analysis of contemporary complex processors. We exploit the fact that out-of-order processors can be analyzed via simulation in the absence of variable control-flow. In our first technique, Non-Uniform Program Analysis (NUPA), program segments with a single flow of control are analyzed on a complex pipeline via simulation and segments with multiple flows of control are analyzed on a simple pipeline via conventional static analysis. A reconfigurable pipeline with dual complex/simple modes mirrors the hybrid analysis. Our second technique, Repeatable Execution Constraints for out-of-ORDER (RECORDER), defines constraints that guarantee a single input-independent execution time on an out-of-order pipeline for program segments with multiple flows of control. Thus, execution time can be derived via simulation with arbitrary inputs.}, number={1}, journal={SIGBED Review}, author={Anantaraman, A. and Rotenberg, E.}, year={2006} } @article{venkatesan_herr_rotenberg_2006, title={Retention-aware placement in DRAM (RAPID): Software methods for quasi-non-volatile DRAM}, ISBN={["0-7803-9368-6"]}, ISSN={["1530-0897"]}, DOI={10.1109/hpca.2006.1598122}, abstractNote={Measurements of an off-the-shelf DRAM chip confirm that different cells retain information for different amounts of time. This result extends to DRAM rows, or pages (retention time of a page is defined as the shortest retention time among its constituent cells). Currently, a single worst-case refresh period is selected based on the page with the shortest retention time. 
Even with refresh optimized for room temperature, the worst page limits the safe refresh period to no longer than 500 ms. Yet, 99% and 85% of pages have retention times above 3 seconds and 10 seconds, respectively. We propose retention-aware placement in DRAM (RAPID), novel software approaches that can exploit off-the-shelf DRAMs to reduce refresh power to vanishingly small levels approaching non-volatile memory. The key idea is to favor longer-retention pages over shorter-retention pages when allocating DRAM pages. This allows selecting a single refresh period that depends on the shortest-retention page among populated pages, instead of the shortest-retention page overall. We explore three versions of RAPID and observe refresh energy savings of 83%, 93%, and 95%, relative to the best temperature-compensated refresh. RAPID with off-the-shelf DRAM also approaches the energy levels of idealized techniques that require custom DRAM support.}, journal={TWELFTH INTERNATIONAL SYMPOSIUM ON HIGH-PERFORMANCE COMPUTER ARCHITECTURE, PROCEEDINGS}, author={Venkatesan, Ravi K. and Herr, Stephen and Rotenberg, Eric}, year={2006}, pages={157-+} } @misc{rotenberg_venkatesan_al-zawawi_2006, title={Systems, methods and devices for providing variable-latency write operations in memory devices}, volume={7,099,215}, publisher={Washington, DC: U.S. Patent and Trademark Office}, author={Rotenberg, E. and Venkatesan, R. K. and Al-Zawawi, A. S.}, year={2006} } @inproceedings{rotenberg_venkatesan_2006, title={The State of ZettaRAM}, ISBN={1424403901 142440391X}, url={http://dx.doi.org/10.1109/nanonet.2006.346220}, DOI={10.1109/nanonet.2006.346220}, abstractNote={ZettaRAM is a nascent memory technology with roots in molecular electronics. ZettaRAM patents and papers are distilled and consolidated into a unified discussion. Various embodiments and key novel properties are discussed with a bias toward computer architecture and system design implications. 
Embodiments include transistor-free crossbar arrays and two hybrid molecule/silicon implementations, a flash-like cell and a 1T-1C DRAM cell. Key properties of the core technology include (1) flexibility and precision through molecular engineering, (2) self-assembly, (3) scalability through charge-voltage decoupling, (4) speed/energy tradeoff, (5) multiple discrete states, and (6) mixed molecules. Implications include inexpensive fabrication of high performance memory (by all metrics), practical mixed logic/DRAM, 3D memory, exceeding DRAM power scaling limits, intelligent power management, efficient multi-bit storage, memory hierarchies cohabiting the same space, and multiple virtual products in one physical product. Thus, molecular memory has qualities of a disruptive technology. Computer architects and system designers should play a central role in charting its use}, booktitle={2006 1st International Conference on Nano-Networks and Workshops}, publisher={IEEE}, author={Rotenberg, Eric and Venkatesan, Ravi K.}, year={2006}, month={Sep} } @inproceedings{reddy_rotenberg_parthasarathy_2006, title={Understanding prediction-based partial redundant threading for low-overhead, high- coverage fault tolerance}, ISBN={1595934510}, url={http://dx.doi.org/10.1145/1168857.1168869}, DOI={10.1145/1168857.1168869}, abstractNote={Redundant threading architectures duplicate all instructions to detect and possibly recover from transient faults. Several lighter weight Partial Redundant Threading (PRT) architectures have been proposed recently. (i) Opportunistic Fault Tolerance duplicates instructions only during periods of poor single-thread performance. (ii) ReStore does not explicitly duplicate instructions and instead exploits mispredictions among highly confident branch predictions as symptoms of faults. (iii) Slipstream creates a reduced alternate thread by replacing many instructions with highly confident predictions. 
We explore PRT as a possible direction for achieving the fault tolerance of full duplication with the performance of single-thread execution. Opportunistic and ReStore yield partial coverage since they are restricted to using only partial duplication or only confident predictions, respectively. Previous analysis of Slipstream fault tolerance was cursory and concluded that only duplicated instructions are covered. In this paper, we attempt to better understand Slipstream's fault tolerance, conjecturing that the mixture of partial duplication and confident predictions actually closely approximates the coverage of full duplication. A thorough dissection of prediction scenarios confirms that faults in nearly 100% of instructions are detectable. Fewer than 0.1% of faulty instructions are not detectable due to coincident faults and mispredictions. Next we show that the current recovery implementation fails to leverage excellent detection capability, since recovery sometimes initiates belatedly, after already retiring a detected faulty instruction. We propose and evaluate a suite of simple microarchitectural alterations to recovery and checking. Using the best alterations, Slipstream can recover from faults in 99% of instructions, compared to only 78% of instructions without alterations. Both results are much higher than predicted by past research, which claims coverage for only duplicated instructions, or 65% of instructions. On an 8-issue SMT processor, Slipstream performs within 1.3% of single-thread execution whereas full duplication slows performance by 14%.A key byproduct of this paper is a novel analysis framework in which every dynamic instruction is considered to be hypothetically faulty, thus not requiring explicit fault injection. 
Fault coverage is measured in terms of the fraction of candidate faulty instructions that are directly or indirectly detectable before.}, booktitle={Proceedings of the 12th international conference on Architectural support for programming languages and operating systems - ASPLOS-XII}, publisher={ACM Press}, author={Reddy, Vimal K. and Rotenberg, Eric and Parthasarathy, Sailashri}, year={2006} } @inbook{rotenberg_anantaraman_2005, title={Architecture of embedded microprocessors}, ISBN={012385251X}, DOI={10.1016/b978-012385251-9/50018-9}, abstractNote={This chapter focuses on the architecture of microprocessor units (MPUs) used in systems-on-chips (SoCs) and embedded systems. It reviews the reasons for the parallel evolution of embedded and desktop processors and reasons for dual tracks targeting open versus closed embedded systems—these systems constrain microarchitectural evolution due to the need for timing predictability. The chapter also describes the recent research aimed at bridging the dual tracks. SoC designs are powered by one or more general-purpose MPUs, digital signal processors (DSPs), and fixed-function coprocessors. Embedded processors are general purpose in a different sense than the high–performance processors used in personal computers. A personal computer is expected to run arbitrary software—productivity tools, computer-aided design (CAD), games, multimedia, and the operating systems (OS). In contrast, a closed embedded system runs a fixed set of tasks or task-set. The difference between embedded and high–performance processors lies in their stages of evolution. Contemporary embedded processors lag some 10 years behind their high–performance counterparts in terms of complexity. 
Whereas high–performance processor designs push and exceed the limits of technology, minimal embedded processor designs fully exploit the power and cost scaling advantages of new generations of CMOS technology.}, booktitle={Multiprocessor systems on chips}, publisher={San Francisco, CA: Morgan Kaufmann; Oxford: Elsevier Science}, author={Rotenberg, E. and Anantaraman, A.}, editor={Wolf, W. and Jerraya, A.}, year={2005}, pages={81–112} } @article{venkatesan_al-zawawi_rotenberg_2005, title={Tapping ZettaRAM (TM) for low-power memory systems}, ISBN={["0-7695-2275-0"]}, ISSN={["1530-0897"]}, DOI={10.1109/hpca.2005.35}, abstractNote={ZettaRAM/spl trade/ is a new memory technology under development by ZettaCore/spl trade/ as a potential replacement for conventional DRAM. The key innovation is replacing the conventional capacitor in each DRAM cell with "charge-storage" molecules - a molecular capacitor. We look beyond ZettaRAM's manufacturing benefits, and approach it from an architectural viewpoint to discover benefits within the domain of architectural metrics. The molecular capacitor is unusual because the amount of charge deposited (critical for reliable sensing) is independent of write voltage, i.e., there is a discrete threshold voltage above/below which the device is fully charged/discharged. Decoupling charge from voltage enables manipulation via arbitrarily small bitline swings, saving energy. However, while charge is voltage-independent, speed is voltage-dependent. Operating too close to the threshold causes molecules to overtake peripheral circuitry as the overall performance limiter. Nonetheless, ZettaRAM offers a speed/energy trade-off whereas DRAM is inflexible, introducing new dimensions for architectural management of memory. We apply architectural insights to tap the full extent of ZettaRAM's power savings without compromising performance. 
Several factors converge nicely to direct focus on L2 writebacks: (i) they account for 80% of row buffer misses in the main memory, thus most of the energy savings potential, and (ii) they do not directly stall the processor and thereby offer scheduling flexibility for tolerating extended molecule latency. Accordingly, slow writes (low energy) are applied to non-critical writebacks and fast writes (high energy) to critical fetches. The hybrid write policy is combined with two options for tolerating delayed writebacks: large buffers with access reordering or L2-cache eager writebacks. Eager writebacks are remarkably synergistic with ZettaRAM: initiating writebacks early in the L2 cache compensates for delaying them at the memory controller. Dual-speed writes coupled with eager writebacks yields energy savings of 34% (out of 41% with uniformly slow writes), with less than 1% performance degradation.}, journal={11TH INTERNATIONAL SYMPOSIUM ON HIGH-PERFORMANCE COMPUTER ARCHITECTURE, PROCEEDINGS}, author={Venkatesan, R. K. and Al-Zawawi, A. S. and Rotenberg, E.}, year={2005}, pages={83–94} } @inbook{rotenberg_2005, title={Trace caches}, ISBN={1584884479}, DOI={10.1201/9781420035155.ch4}, booktitle={Speculative execution in high performance computer architectures}, publisher={Boca Raton, FL: Chapman \& Hall/CRC}, author={Rotenberg, E.}, editor={Kaeli, D. and Yew, P.-C.}, year={2005} } @misc{rotenberg_lindsey_2005, title={Variable-persistence molecular memory devices and methods of operation thereof}, volume={6,944,047}, publisher={Washington, DC: U.S. Patent and Trademark Office}, author={Rotenberg, E. and Lindsey, J. 
S.}, year={2005} } @inproceedings{el-haj-mahmoud_al-zawawi_anantaraman_rotenberg_2005, title={Virtual multiprocessor: An analyzable, high-performance microarchitecture for real-time computing}, ISBN={159593149X}, DOI={10.1145/1086297.1086326}, abstractNote={The design of a real-time architecture is governed by a trade-off between analyzability necessary for real-time formalism and performance demanded by high-end embedded systems. We reconcile this trade-off with a novel Real-time Virtual Multiprocessor (RVMP). RVMP virtualizes a single in-order superscalar processor into multiple interference-free different-sized virtual processors. This provides a flexible spatial dimension. In the time dimension, the number and size of virtual processors can be rapidly reconfigured. A simple real-time scheduling approach concentrates scheduling within a small time interval, producing a simple repeating space/time schedule that orchestrates virtualization. RVMP successfully combines the analyzability (hence real-time formalism) of multiple processors with the flexibility (hence high performance) of simultaneous multithreading (SMT).Worst-case schedulability experiments show that more task-sets are provably schedulable on RVMP than on conventional rigid multiprocessors with equal aggregate resources, and the advantage only intensifies with more demanding task-sets. Run-time experiments show RVMP's statically-controlled coarser-grain space/time configurability is as effective as unsafe SMT. Moreover, RVMP provides a real-time formalism that SMT does not currently provide.}, booktitle={CASES 2005: International Conference on Compilers, Architecture, and Synthesis for Embedded Systems, September 24-27, 2005, San Francisco, California, USA}, publisher={New York: ACM Press}, author={El-Haj-Mahmoud, A. and Al-Zawawi, A. S. and Anantaraman, A. 
and Rotenberg, E.}, year={2005}, pages={213–224} } @article{koppanalil_rotenberg_2004, title={A simple mechanism for detecting ineffectual instructions in slipstream processors}, volume={53}, ISSN={["1557-9956"]}, DOI={10.1109/TC.2004.1268397}, abstractNote={A slipstream processor accelerates a program by speculatively removing repeatedly ineffectual instructions. Detecting the roots of ineffectual computation: unreferenced writes, nonmodifying writes, and correctly predicted branches, is straightforward. On the other hand, detecting ineffectual instructions in the backward slices of these root instructions currently requires complex back-propagation circuitry. We observe that, by logically monitoring the speculative program (instead of the original program), back-propagation can be reduced to detecting unreferenced writes. That is, once root instructions are actually removed, instructions at the next higher level in the backward slice become newly exposed unreferenced writes in the speculative program. This new algorithm, called implicit back-propagation, eliminates complex hardware and achieves an average performance improvement of 11.8 percent, only marginally lower than the 12.3 percent improvement achieved with explicit back-propagation. We further simplify the hardware component by electing not to detect ineffectual memory writes, focusing only on ineffectual register writes. 
A minimal implementation consisting of only a register-indexed table (similar to an architectural register file) achieves a good balance between complexity and performance (11.2 percent average performance improvement with implicit back-propagation and without detection of ineffectual memory writes).}, number={4}, journal={IEEE TRANSACTIONS ON COMPUTERS}, author={Koppanalil, JJ and Rotenberg, E}, year={2004}, month={Apr}, pages={399–413} } @article{anantaraman_seth_rotenberg_mueller_2004, title={Enforcing safety of real-time schedules on contemporary processors using a virtual simple architecture (VISA)}, ISBN={["0-7695-2247-5"]}, ISSN={["1052-8725"]}, DOI={10.1109/real.2004.19}, abstractNote={Determining safe and tight upper bounds on the worst-case execution time (WCET) of hard real-time tasks running on contemporary microarchitectures is a difficult problem. Current trends in microarchitecture design have created a complexity wall: by enhancing performance through ever more complex architectural features, systems have become increasingly hard to analyze. This paper extends a framework, introduced previously as virtual simple architecture (VISA), to multitasking real-time systems. The objective of VISA is to obviate the need to statically analyze complex processors by instead shifting the burden of guaranteeing deadlines - in part - onto the hardware. The VISA framework exploits a complex processor that ordinarily operates with all of its advanced features enabled, called the complex mode, but which can also be downgraded to a simple mode by gating off the advanced features. A WCET bound is statically derived for a task assuming the simple mode. However, this abstraction is speculatively undermined at run-time by executing the task in the complex mode. 
The task's progress is continuously gauged to detect anomalous cases in which the complex mode underperforms, in which case the processor switches to the simple mode to explicitly enforce the overall contractual WCET. The processor typically operates in complex mode, generating significant slack, and the VISA safety mechanism ensures bounded timing in atypical cases. Extra slack can be exploited for reducing power consumption and/or enhancing functionality. By extending VISA from single-task to multi-tasking systems, this paper reveals the full extent of VISA's powerful abstraction capability. Key missing pieces are filled in: (1) preserving integrity of the gauging mechanism despite disruptions caused by preemptions; (2) demonstrating compatibility with arbitrary scheduling and dynamic voltage scaling (DVS) policies; (3) formally describing VISA speculation overheads in terms of padding tasks' WCETs; and (4) developing a systematic method for minimizing these overheads. We also propose a VISA variant that dynamically accrues the slack needed to facilitate speculation in the complex mode, eliminating the need to statically pad WCETs and thereby enabling VISA-style speculation even in highly-utilized systems.}, journal={25TH IEEE INTERNATIONAL REAL-TIME SYSTEMS SYMPOSIUM, PROCEEDINGS}, author={Anantaraman, A and Seth, K and Rotenberg, E and Mueller, F}, year={2004}, pages={114–125} } @inproceedings{el-haj-mahmoud_rotenberg_2004, title={Safely exploiting multithreaded processors to tolerate memory latency in real-time systems}, ISBN={1581138903}, DOI={10.1145/1023833.1023837}, abstractNote={A coarse-grain multithreaded processor can effectively hide long memory latencies by quickly switching to an alternate task when the active task issues a memory request, improving overall throughput. However, dynamic switching cannot be safely exploited to improve throughput in hard-real-time embedded systems. 
The schedulability of a task-set (guaranteeing all tasks meet deadlines) must be determined a priori using offline schedulability tests. Any computation/memory overlap must be statically accounted for. We develop a novel analytical framework that bounds the overlap between computation of a pipeline-resident-task and on-going memory transfers of other tasks. A simple closed-form schedulability test is derived, that only depends on the aggregate computation (C) and memory (M) components of tasks. Namely, the technique does not require specificity regarding the location of memory transfers within and among tasks and avoids searching all task permutations for a specific feasible schedule. To the best of our knowledge, this is the first work to provide the necessary formalism for safely and tractably exploiting coarse-grain multithreaded processors to tolerate memory latency in hard-real-time systems, exceeding the schedulability limits of classic real-time theory for uniprocessors. Our techniques make it possible to capitalize on higher frequency embedded processors, despite the widening processor-memory speed gap. Experiments with task-sets from C-lab benchmarks reveal improvement in the schedulability of task-sets, measured as the ability to schedule previously infeasible task-sets or reduce utilization for already feasible task-sets. We also demonstrate proof-of-concept by deploying our method in a cycle-level simulator of an ARM11-like embedded microprocessor augmented with multiple register contexts, the same hardware multithreading support available in Ubicom's IP3023 embedded microprocessor.}, booktitle={CASES 2004: International Conference on Compilers, Architecture, and Synthesis for Embedded Systems, September 22-25, 2004, Washington, DC, USA}, publisher={New York: ACM Press}, author={El-Haj-Mahmoud, A. 
and Rotenberg, E.}, year={2004}, pages={2–13} } @article{huiyang_toburen_rotenberg_conte_2003, title={Adaptive mode control: A static-power-efficient cache design}, volume={2}, DOI={10.1145/860176.860181}, abstractNote={Lower threshold voltages in deep submicron technologies cause more leakage current, increasing static power dissipation. This trend, combined with the trend of larger/more cache memories dominating die area, has prompted circuit designers to develop SRAM cells with low-leakage operating modes (e.g., sleep mode). Sleep mode reduces static power dissipation, but data stored in a sleeping cell is unreliable or lost. So, at the architecture level, there is interest in exploiting sleep mode to reduce static power dissipation while maintaining high performance.Current approaches dynamically control the operating mode of large groups of cache lines or even individual cache lines. However, the performance monitoring mechanism that controls the percentage of sleep-mode lines, and identifies particular lines for sleep mode, is somewhat arbitrary. There is no way to know what the performance could be with all cache lines active, so arbitrary miss rate targets are set (perhaps on a per-benchmark basis using profile information), and the control mechanism tracks these targets. We propose applying sleep mode only to the data store and not the tag store. By keeping the entire tag store active the hardware knows what the hypothetical miss rate would be if all data lines were active, and the actual miss rate can be made to precisely track it. Simulations show that an average of 73% of I-cache lines and 54% of D-cache lines are put in sleep mode with an average IPC impact of only 1.7%, for 64 KB caches.}, number={3}, journal={ACM Transactions on Embedded Computing Systems}, author={Zhou, Huiyang and Toburen, M. C. and Rotenberg, E. and Conte, T. 
M.}, year={2003}, pages={347–372} } @article{seth_anantaraman_mueller_rotenberg_2003, title={FAST: Frequency-aware static timing analysis}, ISBN={["0-7695-2044-8"]}, DOI={10.1109/real.2003.1253252}, abstractNote={Power is a valuable resource in embedded systems as the lifetime of many such systems is constrained by their battery capacity. Recent advances in processor design have added support for dynamic frequency/voltage scaling (DVS) for saving power. Recent work on real-time scheduling focuses on saving power in static as well as dynamic scheduling environments by exploiting idle and slack due to early task completion for DVS of subsequent tasks. These scheduling algorithms rely on a priori knowledge of worst-case execution times (WCET) for each task. They assume that DVS has no effect on the worst-case execution cycles (WCEC) of a task and scale the WCET according to the processor frequency. However, for systems with memory hierarchies, the WCEC typically does not change under DVS due to frequency modulation. Hence, current assumptions used by DVS schemes result in a highly exaggerated WCET. This paper contributes novel techniques for tight and flexible static timing analysis particularly well-suited for dynamic scheduling schemes. The technical contributions are as follows: (1) we assess the problem of changing execution cycles due to scaling techniques. (2) We propose a parametric approach towards bounding the WCET statically with respect to the frequency. Using a parametric model, we can capture the effect of changes in frequency on the WCEC and thus, accurately model the WCET over any frequency range. (3) We discuss design and implementation of the frequency-aware static timing analysis (FAST) tool based on our prior experience with static timing analysis. (4) We demonstrate in experiments that our FAST tool provides safe upper bounds on the WCET, which are tight. 
The FAST tool allows us to capture the WCET of six benchmarks using equations that overestimate the WCET by less than 1%. FAST equations can also be used to improve existing DVS scheduling schemes to ensure that the effect of frequency scaling on WCET is considered and that the WCET used is not exaggerated. (5) We leverage three DVS scheduling schemes by incorporating FAST into them and by showing that the power consumption further decreases. To the best of our knowledge, this study of DVS effects on timing analysis is unprecedented.}, journal={RTSS 2003: 24TH IEEE INTERNATIONAL REAL-TIME SYSTEMS SYMPOSIUM, PROCEEDINGS}, author={Seth, K and Anantaraman, A and Mueller, F and Rotenberg, E}, year={2003}, pages={40–51} } @article{ibrahim_byrd_rotenberg_2003, title={Slipstream execution mode for CMP-based multiprocessors}, volume={12}, ISBN={["0-7695-1871-0"]}, url={http://www.scopus.com/inward/record.url?eid=2-s2.0-84955465003&partnerID=MN8TOARS}, DOI={10.1109/hpca.2003.1183536}, abstractNote={Scalability of applications on distributed shared-memory (DSM) multiprocessors is limited by communication overheads. At some point, using more processors to increase parallelism yields diminishing returns or even degrades performance. When increasing concurrency is futile, we propose an additional mode of execution, called slipstream mode, that instead enlists extra processors to assist parallel tasks by reducing perceived overheads. We consider DSM multiprocessors built from dual-processor chip multiprocessor (CMP) nodes with shared L2 cache. A task is allocated on one processor of each CMP node. The other processor of each node executes a reduced version of the same task. The reduced version skips shared-memory stores and synchronization, running ahead of the true task. Even with the skipped operations, the reduced task makes accurate forward progress and generates an accurate reference stream, because branches and addresses depend primarily on private data. 
Slipstream execution mode yields two benefits. First, the reduced task prefetches data on behalf of the true task. Second, reduced tasks provide a detailed picture of future reference behavior, enabling a number of optimizations aimed at accelerating coherence events, e.g., self-invalidation. For multiprocessor systems with up to 16 CMP nodes, slipstream mode outperforms running one or two conventional tasks per CMP in 7 out of 9 parallel scientific benchmarks. Slipstream mode is 12-19% faster with prefetching only and up to 29% faster with self-invalidation enabled.}, journal={NINTH INTERNATIONAL SYMPOSIUM ON HIGH-PERFORMANCE COMPUTER ARCHITECTURE, PROCEEDINGS}, publisher={IEEE Comput. Soc}, author={Ibrahim, KZ and Byrd, GT and Rotenberg, E}, year={2003}, pages={179–190} } @inproceedings{anantaraman_seth_patil_rotenberg_mueller_2003, title={Virtual Simple Architecture (VISA): Exceeding the complexity limit in safe real-time systems}, ISBN={1880843374}, booktitle={Computers and their applications: Proceedings of the ISCA 16th International Conference, Seattle, Washington, USA, March 28-30, 2001}, publisher={Cary, NC: ISCA}, author={Anantaraman, A. and Seth, K. and Patil, K. and Rotenberg, E. and Mueller, F.}, year={2003}, pages={350–361} } @inproceedings{koppanalil_ramrakhyani_desai_vaidyanathan_rotenberg_2002, title={A case for dynamic pipeline scaling}, ISBN={1581137265}, DOI={10.1145/581630.581632}, abstractNote={Energy consumption can be reduced by scaling down frequency when peak performance is not needed. A lower frequency permits slower circuits, and hence a lower supply voltage. Energy reduction comes from voltage reduction, a technique called Dynamic Voltage Scaling (DVS).This paper makes the case that the useful frequency range of DVS is limited because there is a lower bound on voltage. Lowering frequency permits voltage reduction until the lowest voltage is reached. 
Beyond that point, lowering frequency further does not save energy because voltage is constant.However, there is still opportunity for energy reduction outside the influence of DVS. If frequency is lowered enough, pairs of pipeline stages can be merged to form a shallower pipeline. The shallow pipeline has better instructions-per-cycle (IPC) than the deep pipeline. Since energy also depends on IPC, energy is reduced for a given frequency. Accordingly, we propose Dynamic Pipeline Scaling (DPS). A DPS-enabled deep pipeline can merge adjacent pairs of stages by making the intermediate latches transparent and disabling corresponding feedback paths. Thus, a DPS-enabled pipeline has a deep mode for higher frequencies within the influence of DVS, and a shallow mode for lower frequencies. Shallow mode extends the frequency range for which energy reduction is possible. For frequencies outside the influence of DVS, a DPS-enabled deep pipeline consumes from 23% to 40% less energy than a rigid deep pipeline.}, booktitle={Proceedings of the International Conference on Compilers, Architecture, and Synthesis for Embedded Systems: 2002, Grenoble, France, October 08-11, 2002}, publisher={New York: ACM Press}, author={Koppanalil, J. and Ramrakhyani, P. and Desai, S. and Vaidyanathan, A. and Rotenberg, E.}, year={2002}, pages={1–8} } @inproceedings{lebeck_koppanalil_li_patwardhan_rotenberg_2002, title={A large, fast instruction window for tolerating cache misses}, ISBN={076951605X}, DOI={10.1109/isca.2002.1003562}, abstractNote={Instruction window size is an important design parameter for many modern processors. This paper presents a new instruction window design targeted at achieving the latency tolerance of large windows with the clock cycle time of small windows. The key observation is that instructions dependent on a long latency operation (e.g., cache miss) cannot execute until that source operation completes. 
These instructions are moved out of the conventional, small, issue queue to a much larger waiting instruction buffer (WIB). When the long latency operation completes, the instructions are reinserted into the issue queue. In this paper, we focus specifically on load cache misses and their dependent instructions. Simulations reveal that, for an 8-way processor, a 2K-entry WIB with a 32-entry issue queue can achieve speedups of 20%, 84%, and 50% over a conventional 32-entry issue queue for a subset of the SPEC CINT2000, SPEC CFP2000, and Olden benchmarks, respectively.}, booktitle={29th Annual International Symposium on Computer Architecture: Proceedings : 25-29 May, 2002, Anchorage, Alaska}, publisher={Los Alamitos, CA: IEEE Computer Society}, author={Lebeck, A. R. and Koppanalil, J. J. and Li, T. and Patwardhan, J. and Rotenberg, E.}, year={2002}, pages={59–70} } @inproceedings{huiyang_toburen_rotenberg_conte_2001, title={Adaptive mode control: A static-power-efficient cache design}, ISBN={0769513638}, DOI={10.1109/pact.2001.953288}, abstractNote={Lower threshold voltages in deep sub-micron technologies cause more leakage current, increasing static power dissipation. This trend, combined with the trend of larger/more cache memories dominating die area, has prompted circuit designers to develop SRAM cells with low-leakage operating modes (e.g., sleep mode). Sleep mode reduces static power dissipation but data stored in a sleeping cell is unreliable or lost. So, at the architecture level, there is interest in exploiting sleep mode to reduce static power dissipation while maintaining high performance. Current approaches dynamically control the operating mode of large groups of cache lines or even individual cache lines. However, the performance monitoring mechanism that controls the percentage of sleep-mode lines, and identifies particular lines for sleep mode, is somewhat arbitrary. 
There is no way to know what the performance could be with all cache lines active, so arbitrary miss rate targets are set (perhaps on a per-benchmark basis using profile information) and the control mechanism tracks these targets. We propose applying sleep mode only to the data store and not the tag store. By keeping the entire tag store active, the hardware knows what the hypothetical miss rate would be if all data lines were active and the actual miss rate can be made to precisely track it. Simulations show an average of 73% of I-cache lines and 54% of D-cache lines are put in sleep mode with an average IPC impact of only 1.7%, for 64KB caches.}, booktitle={2001 International Conference on Parallel Architectures and Compilation Techniques: Proceedings: 8-12 September, 2001, Barcelona, Catalunya, Spain}, publisher={Los Alamitos, CA: IEEE Computer Society}, author={Zhou, Huiyang and Toburen, M. C. and Rotenberg, E. and Conte, T. M.}, year={2001}, pages={61–70} } @inbook{rotenberg_2001, title={Trace caching and trace processors}, ISBN={0849308852}, booktitle={Computer engineering handbook}, publisher={Boca Raton, FL: CRC Press}, author={Rotenberg, E.}, year={2001}, pages={8–45} } @article{rotenberg_2001a, title={Using variable-MHz microprocessors to efficiently handle uncertainty in real-time systems}, ISBN={["0-7695-1369-7"]}, ISSN={["1072-4451"]}, DOI={10.1109/micro.2001.991103}, abstractNote={Guaranteed performance is critical in real-time systems because correct operation requires tasks complete on time. Meanwhile, as software complexity increases and deadlines tighten, embedded processors inherit high-performance techniques such as pipelining, caches, and branch prediction. Guaranteeing the performance of complex pipelines is difficult and worst-case analysis often under-estimates the microarchitecture for correctness. Ultimately, the designer must turn to clock frequency as a reliable source of performance. 
The chosen processor has a higher frequency than is needed most of the time, to compensate for uncertain hardware enhancements-partly defeating their intended purpose. We propose using microarchitecture simulation to produce accurate but not guaranteed-correct worst-case performance bounds. The primary clock frequency is chosen based on simulated-worst-case performance. Since static analysis cannot confirm simulated-worst-case bounds, the microarchitecture is also backed up by clock frequency reserves. When running a task, the processor periodically checks for interim microarchitecture performance failures. These are expected to be rare, but frequency reserves are available to guarantee the final deadline is met in spite of interim failures. Experiments demonstrate significant frequency reductions, e.g., -100 MHz for a peak 300 MHz processor. The more conservative worst-case analysis is, the larger the frequency reduction. The shorter the deadline, the larger the frequency reduction. And reserve frequency is generally no worse than the high frequency produced by conventional worst-case analysis, i.e., the system degrades gracefully in the presence of transient performance faults.}, journal={34TH ACM/IEEE INTERNATIONAL SYMPOSIUM ON MICROARCHITECTURE, MICRO-34, PROCEEDINGS}, author={Rotenberg, E}, year={2001}, pages={28–39} } @inproceedings{purser_sundaramoorthy_rotenberg_2000, title={A study of slipstream processors}, ISBN={076950924X}, DOI={10.1145/360128.360155}, abstractNote={Article Free Access Share on A study of slipstream processors Authors: Zach Purser North Carolina State University, Department of Electrical and Computer Engineering, Engineering Graduate Research Center, Campus Box 7914, Raleigh, NC North Carolina State University, Department of Electrical and Computer Engineering, Engineering Graduate Research Center, Campus Box 7914, Raleigh, NCView Profile , Karthik Sundaramoorthy North Carolina State University, Department of Electrical and Computer 
Engineering, Engineering Graduate Research Center, Campus Box 7914, Raleigh, NC North Carolina State University, Department of Electrical and Computer Engineering, Engineering Graduate Research Center, Campus Box 7914, Raleigh, NCView Profile , Eric Rotenberg North Carolina State University, Department of Electrical and Computer Engineering, Engineering Graduate Research Center, Campus Box 7914, Raleigh, NC North Carolina State University, Department of Electrical and Computer Engineering, Engineering Graduate Research Center, Campus Box 7914, Raleigh, NCView Profile Authors Info & Claims MICRO 33: Proceedings of the 33rd annual ACM/IEEE international symposium on MicroarchitectureDecember 2000 Pages 269–280https://doi.org/10.1145/360128.360155Published:01 December 2000Publication History 53citation486DownloadsMetricsTotal Citations53Total Downloads486Last 12 Months42Last 6 weeks1 Get Citation AlertsNew Citation Alert added!This alert has been successfully added and will be sent to:You will be notified whenever a record that you have chosen has been cited.To manage your alert preferences, click on the button below.Manage my AlertsNew Citation Alert!Please log in to your account Save to BinderSave to BinderCreate a New BinderNameCancelCreateExport CitationPublisher SiteeReaderPDF}, booktitle={Proceedings: 33rd Annual IEEE/ACM International Symposium on Microarchitecture: Monterey, California, USA, 10-13 December 2000}, publisher={Los Alamitos, CA: IEEE Computer Society}, author={Purser, Z. and Sundaramoorthy, K. and Rotenberg, E.}, year={2000}, pages={269–280} } @article{rotenberg_smith_2000, title={Control independence in trace processors}, volume={2}, journal={Journal of Instruction-level Parallelism}, author={Rotenberg, E. and Smith, J. 
E.}, year={2000}, pages={63–85} } @inproceedings{sundaramoorthy_purser_rotenberg_2000, title={Slipstream processors: Improving both performance and fault tolerance}, ISBN={1581133170}, DOI={10.1145/378993.379247}, abstractNote={Processors execute the full dynamic instruction stream to arrive at the final output of a program, yet there exist shorter instruction streams that produce the same overall effect. We propose creating a shorter but otherwise equivalent version of the original program by removing ineffectual computation and computation related to highly-predictable control flow. The shortened program is run concurrently with the full program on a chip multiprocessor simultaneous multithreaded processor, with two key advantages:1) Improved single-program performance. The shorter program speculatively runs ahead of the full program and supplies the full program with control and data flow outcomes. The full program executes efficiently due to the communicated outcomes, at the same time validating the speculative, shorter program. The two programs combined run faster than the original program alone. Detailed simulations of an example implementation show an average improvement of 7% for the SPEC95 integer benchmarks.2) Fault tolerance. The shorter program is a subset of the full program and this partial-redundancy is transparently leveraged for detecting and recovering from transient hardware faults.}, booktitle={ASPLOS-IX proceedings: Ninth International Conference on Architectural Support for Programming Languages and Operating Systems, Cambridge, Massachusetts, November 12-15, 2000}, publisher={New York: ACM Press}, author={Sundaramoorthy, K. and Purser, Z. 
and Rotenberg, E.}, year={2000}, pages={257–268} } @article{rotenberg_jacobson_smith_1999, title={A study of control independence in superscalar processors}, ISBN={["0-7695-0004-8"]}, DOI={10.1109/hpca.1999.744346}, abstractNote={Control independence has been put forward as a significant new source of instruction level parallelism for future generation processors. However, its performance potential under practical hardware constraints is not known, and even less is understood about the factors that contribute to or limit the performance of control independence. Important aspects of control independence are identified and singled out for study, and a series of idealized machine models are used to isolate and evaluate these aspects. It is shown that much of the performance potential of control independence is lost due to data dependences and wasted resources consumed by incorrect control dependent instructions. Even so, control independence can close the performance gap between real and perfect branch prediction by as much as half. Next, important implementation issues are discussed and some design alternatives are given. This is followed by a more detailed set of simulations, where the key implementation features are realistically modeled. These simulations show typical performance improvements of 10-30%.}, journal={FIFTH INTERNATIONAL SYMPOSIUM ON HIGH-PERFORMANCE COMPUTER ARCHITECTURE, PROCEEDINGS}, author={Rotenberg, E and Jacobson, Q and Smith, J}, year={1999}, pages={115–124} } @article{rotenberg_bennett_smith_1999, title={A trace cache microarchitecture and evaluation}, volume={48}, ISSN={["1557-9956"]}, DOI={10.1109/12.752652}, abstractNote={As the instruction issue width of superscalar processors increases, instruction fetch bandwidth requirements will also increase. It will eventually become necessary to fetch multiple basic blocks per clock cycle. 
Conventional instruction caches hinder this effort because long instruction sequences are not always in contiguous cache locations. Trace caches overcome this limitation by caching traces of the dynamic instruction stream, so instructions that are otherwise noncontiguous appear contiguous. In this paper, we present and evaluate a microarchitecture incorporating a trace cache. The microarchitecture provides high instruction fetch bandwidth with low latency by explicitly sequencing through the program at the higher level of traces, both in terms of (1) control flow prediction and (2) instruction supply. For the SPEC95 integer benchmarks, trace-level sequencing improves performance from 15 percent to 35 percent over an otherwise equally sophisticated, but contiguous, multiple-block fetch mechanism. Most of this performance improvement is due to the trace cache. However, for one benchmark whose performance is limited by branch mispredictions, the performance gain is almost entirely due to improved prediction accuracy.}, number={2}, journal={IEEE TRANSACTIONS ON COMPUTERS}, author={Rotenberg, E and Bennett, S and Smith, JE}, year={1999}, month={Feb}, pages={111–120} } @inproceedings{rotenberg_1999, title={AR-SMT: A microarchitectural approach to fault tolerance in microprocessors}, ISBN={0780357639}, DOI={10.1109/ftcs.1999.781037}, abstractNote={This paper speculates that technology trends pose new challenges for fault tolerance in microprocessors. Specifically, severely reduced design tolerances implied by gigahertz clock rates may result in frequent and arbitrary transient faults. We suggest that existing fault-tolerant techniques-system-level, gate-level, or component-specific approaches-are either too costly for general purpose computing, overly intrusive to the design, or insufficient for covering arbitrary logic faults. An approach in which the microarchitecture itself provides fault tolerance is required. 
We propose a new time redundancy fault-tolerant approach in which a program is duplicated and the two redundant programs simultaneously run on the processor: The technique exploits several significant microarchitectural trends to provide broad coverage of transient faults and restricted coverage of permanent faults. These trends are simultaneous multithreading, control flow and data flow prediction, and hierarchical processors-all of which are intended for higher performance, but which can be easily leveraged for the specified fault tolerance goals. The overhead for achieving fault tolerance is low, both in terms of performance and changes to the existing microarchitecture. Detailed simulations of five of the SPEC95 benchmarks show that executing two redundant programs on the fault-tolerant microarchitecture takes only 10% to 30% longer than running a single version of the program.}, booktitle={Digest of papers: Twenty-Ninth Annual International Symposium on Fault-Tolerant Computing: June 15-18, 1999, Madison, Wisconsin, USA}, publisher={Los Alamitos, CA: IEEE Computer Society}, author={Rotenberg, E.}, year={1999}, pages={84–91} } @article{rotenberg_smith_1999, title={Control independence in trace processors}, ISBN={["0-7695-0437-X"]}, ISSN={["1072-4451"]}, DOI={10.1109/micro.1999.809438}, abstractNote={Branch mispredictions are a major obstacle to exploiting instruction-level parallelism, at least in part because all instructions after a mispredicted branch are squashed. However, instructions that are control independent of the branch must be fetched regardless of the branch outcome, and do not necessarily have to be squashed and re-executed. Control independence exists when the two paths following a branch re-converge. A trace processor microarchitecture is developed to exploit control independence and thereby reduce branch misprediction penalties. 
There are three major contributions: 1) Trace-level re-convergence is not guaranteed despite re-convergence at the instruction-level. Novel trace selection techniques are developed to expose control independence at the trace-level. 2) Control independence's potential complexity stems from insertion and removal of instructions from the middle of the instruction window. Trace processors manage control flow hierarchically (traces are the fundamental unit of control flow) and this results in an efficient implementation. 3) Control independent instructions must be inspected for incorrect data dependences caused by mispredicted control flow. Existing data speculation support is easily leveraged to selectively re-execute incorrect-data dependent, control independent instructions. For five of the SPEC95 integer benchmarks, control independence improves trace processor performance from 5% to 25%, and 17% on average.}, journal={32ND ANNUAL INTERNATIONAL SYMPOSIUM ON MICROARCHITECTURE, (MICRO-32), PROCEEDINGS}, author={Rotenberg, E and Smith, J}, year={1999}, pages={4–15} } @article{jacobson_rotenberg_smith_1997, title={Path-based next trace prediction}, ISBN={["0-8186-7977-8"]}, ISSN={["1072-4451"]}, DOI={10.1109/micro.1997.645793}, abstractNote={The trace cache is proposed as a mechanism for providing increased fetch bandwidth by allowing the processor to fetch across multiple branches in a single cycle. But to date predicting multiple branches per cycle has meant paying a penalty in prediction accuracy. We propose a next trace predictor that treats the traces as basic units and explicitly predicts sequences of traces. The predictor collects histories of trace sequences (paths) and makes predictions based on these histories. The basic predictor is enhanced to a hybrid configuration that reduces performance losses due to cold starts and aliasing in the prediction table. 
The Return History Stack is introduced to increase predictor performance by saving path history information across procedure call/returns. Overall, the predictor yields about a 26% reduction in misprediction rates when compared with the most aggressive previously proposed, multiple branch prediction methods.}, journal={THIRTIETH ANNUAL IEEE/ACM INTERNATIONAL SYMPOSIUM ON MICROARCHITECTURE, PROCEEDINGS}, author={Jacobson, Q. and Rotenberg, E. and Smith, J. E.}, year={1997}, pages={14--23} } @article{rotenberg_jacobson_sazeides_smith_1997, title={Trace processors}, ISBN={0-8186-7977-8}, ISSN={1072-4451}, DOI={10.1109/micro.1997.645805}, abstractNote={Traces are dynamic instruction sequences constructed and cached by hardware. A microarchitecture organized around traces is presented as a means for efficiently executing many instructions per cycle. Trace processors exploit both control flow and data flow hierarchy to overcome complexity and architectural limitations of conventional superscalar processors by (1) distributing execution resources based on trace boundaries and (2) applying control and data prediction at the trace level rather than individual branches or instructions. Three sets of experiments using the SPECInt95 benchmarks are presented. (i) A detailed evaluation of trace processor configurations: the results affirm that significant instruction-level parallelism can be exploited in integer programs (2 to 6 instructions per cycle). We also isolate the impact of distributed resources, and quantify the value of successively doubling the number of distributed elements. (ii) A trace processor with data prediction applied to inter-trace dependences: potential performance improvement with perfect prediction is around 45% for all benchmarks. With realistic prediction, gcc achieves an actual improvement of 10%. 
(iii) Evaluation of aggressive control flow: some benchmarks benefit from control independence by as much as 10%.}, journal={THIRTIETH ANNUAL IEEE/ACM INTERNATIONAL SYMPOSIUM ON MICROARCHITECTURE, PROCEEDINGS}, author={Rotenberg, E. and Jacobson, Q. and Sazeides, Y. and Smith, J.}, year={1997}, pages={138--148} } @article{jacobsen_rotenberg_smith_1996, title={Assigning confidence to conditional branch predictions}, ISBN={0-8186-7641-8}, DOI={10.1109/micro.1996.566457}, abstractNote={Many high performance processors predict conditional branches and consume processor resources based on the prediction. In some situations, resource allocation can be better optimized if a confidence level is assigned to a branch prediction; i.e. if the quantity of resources allocated is a function of the confidence level. To support such optimizations, we consider hardware mechanisms that partition conditional branch predictions into two sets: those which are accurate a relatively high percentage of the time, and those which are accurate a relatively low percentage of the time. The objective is to concentrate as many of the mispredictions as practical into a relatively small set of low confidence dynamic branches. We first study an ideal method that profiles branch predictions and sorts static branches into high and low confidence sets, depending on the accuracy with which they are dynamically predicted. We find that about 63 percent of the mispredictions can be localized to a set of static branches that account for 20 percent of the dynamic branches. We then study idealized dynamic confidence methods using both one and two levels of branch correctness history. We find that the single level method performs at least as well as the more complex two level method and is able to isolate 89 percent of the mispredictions into a set containing 20 percent of the dynamic branches. 
Finally, we study practical, less expensive implementations and find that they achieve most of the performance of the idealized methods.}, journal={PROCEEDINGS OF THE 29TH ANNUAL IEEE/ACM INTERNATIONAL SYMPOSIUM ON MICROARCHITECTURE - MICRO-29}, author={Jacobsen, E. and Rotenberg, E. and Smith, J. E.}, year={1996}, pages={142--152} } @inproceedings{rotenberg_bennett_smith_1996, title={Trace cache: A low latency approach to high bandwidth instruction fetching}, ISBN={0-8186-7641-8}, DOI={10.1109/micro.1996.566447}, abstractNote={As the issue width of superscalar processors is increased, instruction fetch bandwidth requirements will also increase. It will become necessary to fetch multiple basic blocks per cycle. Conventional instruction caches hinder this effort because long instruction sequences are not always in contiguous cache locations. We propose supplementing the conventional instruction cache with a trace cache. This structure caches traces of the dynamic instruction stream, so instructions that are otherwise noncontiguous appear contiguous. For the Instruction Benchmark Suite (IBS) and SPEC92 integer benchmarks, a 4 kilobyte trace cache improves performance on average by 28% over conventional sequential fetching. Further it is shown that the trace cache's efficient, low latency approach enables it to outperform more complex mechanisms that work solely out of the instruction cache.}, booktitle={PROCEEDINGS OF THE 29TH ANNUAL IEEE/ACM INTERNATIONAL SYMPOSIUM ON MICROARCHITECTURE - MICRO-29}, author={Rotenberg, E. and Bennett, S. and Smith, J. E.}, year={1996}, pages={24--34} }