% Publications on heterogeneous / superscalar core design (2009--2013).
% Conventions used below: one field per line; page ranges written with "--";
% months given as the unquoted three-letter macros; bare ISSN/ISBN/DOI values;
% conference papers typed as inproceedings with the venue in "booktitle";
% case-sensitive words in titles and venues protected with braces.
% "abstractNote" is an exporter-specific field that standard styles ignore;
% its text is preserved verbatim from the export.

@inproceedings{priyadarshi_choudhary_dwiel_upreti_rotenberg_davis_franzon_2013,
  author       = {Priyadarshi, S. and Choudhary, N. and Dwiel, B. and Upreti, A. and Rotenberg, E. and Davis, R. and Franzon, P.},
  title        = {{Hetero(2)} {3D} Integration: A Scheme for Optimizing Efficiency/Cost of Chip Multiprocessors},
  booktitle    = {Proceedings of the Fourteenth International Symposium on Quality Electronic Design ({ISQED} 2013)},
  pages        = {1--7},
  year         = {2013},
  doi          = {10.1109/isqed.2013.6523582},
  abstractNote = {Timing the transition of a processor design to a new technology poses a provocative tradeoff. On the one hand, transitioning as early as possible offers a significant competitive advantage, by bringing improved designs to market early. On the other hand, an aggressive strategy may prove to be unprofitable, due to the low manufacturing yield of a technology that has not had time to mature. We propose exploiting two complementary forms of heterogeneity to profitably exploit an immature technology for Chip Multiprocessors (CMP). First, 3D integration facilitates a technology alloy. The CMP is split across two dies, one fabricated in the old technology and the other in the new technology. The alloy derives benefit from the new technology while limiting cost exposure. Second, to compensate for lower efficiency of old-technology cores, we exploit application and microarchitectural heterogeneity: applications which gain less from technology scaling are scheduled on old-technology cores, moreover, these cores are retuned to optimize this class of application. For a defect density ratio of 200 between 45nm and 65nm, Hetero2 3D gives 3.6× and 1.5× higher efficiency/cost compared to 2D and 3D homogeneous implementations, respectively, with only 6.5% degradation in efficiency. We also present a sensitivity analysis by sweeping the defect density ratio. The analysis reveals the defect density break-even points, where homogeneous 2D and 3D designs in 45nm achieve the same efficiency/cost as Hetero2 3D, marking significant points in the maturing of the technology.},
}

@article{patsilaras_choudhary_tuck_2012,
  author       = {Patsilaras, George and Choudhary, Niket K. and Tuck, James},
  title        = {Efficiently Exploiting Memory Level Parallelism on Asymmetric Coupled Cores in the Dark Silicon Era},
  journal      = {{ACM} Transactions on Architecture and Code Optimization},
  volume       = {8},
  number       = {4},
  year         = {2012},
  month        = jan,
  issn         = {1544-3566},
  doi          = {10.1145/2086696.2086707},
  abstractNote = {Extracting high memory-level parallelism (MLP) is essential for speeding up single-threaded applications which are memory bound. At the same time, the projected amount of dark silicon (the fraction of the chip powered off) on a chip is growing. Hence, Asymmetric Multicore Processors (AMP) offer a unique opportunity to integrate many types of cores, each powered at different times, in order to optimize for different regions of execution. In this work, we quantify the potential for exploiting core customization to speedup programs during regions of high MLP. Based on a careful design space exploration, we discover that an AMP that includes a narrow and fast specialized core has the potential to efficiently exploit MLP.},
}

@article{choudhary_wadhavkar_shah_mayukh_gandhi_dwiel_navada_najaf-abadi_rotenberg_2012,
  author       = {Choudhary, Niket K. and Wadhavkar, Salil V. and Shah, Tanmay A. and Mayukh, Hiran and Gandhi, Jayneel and Dwiel, Brandon H. and Navada, Sandeep and Najaf-abadi, Hashem H. and Rotenberg, Eric},
  title        = {{FabScalar}: Automating Superscalar Core Design},
  journal      = {{IEEE} Micro},
  volume       = {32},
  number       = {3},
  pages        = {48--59},
  year         = {2012},
  issn         = {1937-4143},
  doi          = {10.1109/mm.2012.23},
  abstractNote = {Providing multiple superscalar core types on a chip, each tailored to different classes of instruction-level behavior, is an exciting direction for increasing processor performance and energy efficiency. Unfortunately, processor design and verification effort increases with each additional core type, limiting the microarchitectural diversity that can be practically implemented. FabScalar aims to automate superscalar core design, opening up processor design to microarchitectural diversity and its many opportunities.},
}

@inproceedings{choudhary_wadhavkar_shah_mayukh_gandhi_dwiel_navada_najaf-abadi_rotenberg_2011,
  author       = {Choudhary, Niket K. and Wadhavkar, Salil V. and Shah, Tanmay A. and Mayukh, Hiran and Gandhi, Jayneel and Dwiel, Brandon H. and Navada, Sandeep and Najaf-abadi, Hashem H. and Rotenberg, Eric},
  title        = {{FabScalar}: Composing Synthesizable {RTL} Designs of Arbitrary Cores within a Canonical Superscalar Template},
  booktitle    = {{ISCA} 2011: Proceedings of the 38th Annual International Symposium on Computer Architecture},
  pages        = {11--22},
  year         = {2011},
  doi          = {10.1145/2000064.2000067},
  abstractNote = {A growing body of work has compiled a strong case for the single-ISA heterogeneous multi-core paradigm. A single-ISA heterogeneous multi-core provides multiple, differently-designed superscalar core types that can streamline the execution of diverse programs and program phases. No prior research has addressed the “Achilles' heel” of this paradigm: design and verification effort is multiplied by the number of different core types. This work frames superscalar processors in a canonical form, so that it becomes feasible to quickly design many cores that differ in the three major superscalar dimensions: superscalar width, pipeline depth, and sizes of structures for extracting instruction-level parallelism (ILP). From this idea, we develop a toolset, called FabScalar, for automatically composing the synthesizable register-transfer-level (RTL) designs of arbitrary cores within a canonical superscalar template. The template defines canonical pipeline stages and interfaces among them. A Canonical Pipeline Stage Library (CPSL) provides many implementations of each canonical pipeline stage, that differ in their superscalar width and depth of sub-pipelining. An RTL generation tool uses the template and CPSL to automatically generate an overall core of desired configuration. Validation experiments are performed along three fronts to evaluate the quality of RTL designs generated by FabScalar: functional and performance (instructions-per-cycle (IPC)) validation, timing validation (cycle time), and confirmation of suitability for standard ASIC flows. With FabScalar, a chip with many different superscalar core types is conceivable.},
}

@inproceedings{navada_choudhary_rotenberg_2010,
  author       = {Navada, Sandeep and Choudhary, Niket K. and Rotenberg, Eric},
  title        = {Criticality-Driven Superscalar Design Space Exploration},
  booktitle    = {{PACT} 2010: Proceedings of the Nineteenth International Conference on Parallel Architectures and Compilation Techniques},
  pages        = {261--272},
  year         = {2010},
  isbn         = {978-1-4503-0178-7},
  doi          = {10.1145/1854273.1854308},
  abstractNote = {It has become increasingly difficult to perform design space exploration (DSE) of computer systems with a short turnaround time because of exploding design spaces, increasing design complexity and long-running workloads. Researchers have used classical search/optimization techniques like simulated annealing, genetic algorithms, etc., to accelerate the DSE. While these techniques are better than an exhaustive search, a substantial amount of time must still be dedicated to DSE. This is a serious bottleneck in reducing research/development time. These techniques do not perform the DSE quickly enough, primarily because they do not leverage any insight as to how the different design parameters of a computer system interact to increase or degrade performance at a design point and treat the computer system as a “black-box”.},
}

@inproceedings{najaf-abadi_choudhary_rotenberg_2009,
  author       = {Najaf-abadi, Hashem H. and Choudhary, Niket K. and Rotenberg, Eric},
  title        = {Core-Selectability in Chip Multiprocessors},
  booktitle    = {18th International Conference on Parallel Architectures and Compilation Techniques, Proceedings},
  pages        = {113--122},
  year         = {2009},
  isbn         = {978-0-7695-3771-9},
  issn         = {1089-795X},
  doi          = {10.1109/pact.2009.44},
  abstractNote = {The centralized structures necessary for the extraction of instruction-level parallelism (ILP) are consuming progressively smaller portions of the total die area of chip multiprocessors (CMP). The reason for this is that scaling these structures does not enhance general performance as much as scaling the cache and interconnect. However, the fact that these structures now consume less proportional die area opens an avenue to enhancing their performance through truly overcoming the one-size-fits-all approach to their design. This paper proposes core-selectability – incorporating differently-designed cores that can be toggled into active employment. This enables differently customized ILP-extracting structures to be at hand in the system while not dramatically adding to the interconnect complexity. The design verification effort is minimized by separating the complexity of different core designs. Moreover, contrary to alternative approaches, the performance and power efficiency of the core designs are not compromised. Evaluation results are presented that show that, even when limiting the diversity between core designs to only the sizing of microarchitectural structures, core-selectability has the potential to provide notable performance enhancement (with an average of 10%) to scalable multithreaded applications, without increased concurrency. In addition, it can provide significantly greater throughput to multiprogrammed workloads by providing the potential for the system to transform into a heterogeneous design.},
}