@article{zhao_davis_2021, title={A Virtual Platform for Object Detection Systems}, ISSN={["2164-0157"]}, DOI={10.1109/3DIC52383.2021.9687602}, abstractNote={Computer vision is increasingly effective and important in many applications, including disease diagnosis, sports, and autonomous-driving. Visual recognition tasks, such as image classification and object detection, are the key of many of these applications, and recent developments in convolutional neural networks (CNNs) have made outstanding leaps in performance. Therefore, optimizing the data-flow between the image sensor and CNNs now constitute the majority of the effort in computer vision system design. System performance is sensitive to the qualities of the image sensor and CNN hardware accelerator. We focus on determining the influence of the sensor and accelerator on the overall performance and power of an object detection inference task. Because the relationship between image sensor quality and CNN performance is complex, we use image quality as a bridge when evaluating system performance. Developing a new product is very expensive and time consuming. This paper will offer an virtual platform for object detection systems, and each component in the system will be simulated by a proper power model and a behavior model. The power, performance, and area of the complete system will be predicted to help designers optimize object detection systems.}, journal={2021 IEEE INTERNATIONAL 3D SYSTEMS INTEGRATION CONFERENCE (3DIC)}, author={Zhao, Qianli and Davis, W. Rhett}, year={2021} } @article{wang_davis_2021, title={An Instruction-Level Power and Energy Model for the Rocket Chip Generator}, ISSN={["1533-4678"]}, DOI={10.1109/ISLPED52811.2021.9502485}, abstractNote={As digital systems become more power and energy constrained, the need for optimizing these quantities early in the design process grows ever more important. 
Fast and accurate power and energy models are needed for complex hardware blocks, such as processor cores, in order to optimize systems that contain these blocks. Today accurate energy/power estimation can be achieved only after physical design is complete, which is too late to affect the system architecture. This paper demonstrates the development of a fast instruction-level model for the Rocket Chip Generator to facilitate power- and energy-efficient software optimization. We first discuss an event-based power modeling methodology which is the foundation of our model and is compatible with emerging power- and energy-modeling standards such as IEEE-2416. Detailed energy characterization for basic events is explained along with an evaluation of a model with and without cache-fill events. The validation results show that the proposed instruction-level power model achieves less than 3% error on simple C program benchmarks.}, journal={2021 IEEE/ACM INTERNATIONAL SYMPOSIUM ON LOW POWER ELECTRONICS AND DESIGN (ISLPED)}, author={Wang, Zhiping and Davis, W. Rhett}, year={2021} } @inproceedings{kalafala_fu_davis_aadithya_clevenger_2021, title={Bridging the Organization Gap for EDA Machine Learning Data}, author={Kalafala, K. and Fu, H. and Davis, W.R. and Aadithya, K. and Clevenger, L.A.}, year={2021}, month={Aug} } @article{francisco_franzon_davis_2021, title={Fast and Accurate PPA Modeling with Transfer Learning}, DOI={10.1109/MLCAD52597.2021.9531109}, abstractNote={The power, performance, and area (PPA) of a System-on-Chip (SoC) is known only after a months-long process. This process includes iterations over the architectural design, register transfer level implementation, RTL synthesis, and place and route. Knowing the PPA estimates for a system early in the design stages can help resolve tradeoffs that will affect the final design. This work presents a machine learning approach using gradient boost models and neural networks to fast and accurately predict the PPA. 
This work focuses on reducing the number of samples used to create the models. The models use transfer learning to predict the PPA for new design configurations and corner conditions based on previous models. The models predict the PPA as a function of parameters accessible during the RTL synthesis. The proposed models achieved PPA predictions up to 99% accurate and using as few as 10 data samples can achieve accuracies better than 96%.}, journal={2021 ACM/IEEE 3RD WORKSHOP ON MACHINE LEARNING FOR CAD (MLCAD)}, author={Francisco, Luis and Franzon, Paul and Davis, W. Rhett}, year={2021} } @inproceedings{davis_franzon_francisco_huggins_jain_2021, title={Fast and Accurate PPA Modeling with Transfer Learning}, ISSN={["1933-7760"]}, DOI={10.1109/ICCAD51958.2021.9643533}, abstractNote={The power, performance and area (PPA) of digital blocks can vary 10:1 based on their synthesis, place, and route tool recipes. With rapid increase in number of PVT corners and complexity of logic functions approaching 10M gates, industry has an acute need to minimize the human resources, compute servers, and EDA licenses needed to achieve a Pareto optimal recipe. We first present models for fast accurate PPA prediction that can reduce the manual optimization iterations with EDA tools. Secondly we investigate techniques to automate the PPA optimization using evolutionary algorithms. For PPA prediction, a baseline model is trained on a known design using Latin hypercube sample runs of the EDA tool, and transfer learning is then used to train the model for an unseen design. For a known design the baseline needed 150 training runs to achieve a 95% accuracy. With transfer learning the same accuracy was achieved on a different (unseen) design in only 15 runs indicating the viability of transfer learning to generalize PPA models. The PPA optimization technique, based on evolutionary algorithms, effectively combines the PPA modeling and optimization. 
Our approach reached the same PPA solution as human designers in the same or fewer runs for a CORTEX-M0 system design. This shows potential for automating the recipe optimization without needing more runs than a human designer would need.}, booktitle={2021 IEEE/ACM International Conference On Computer Aided Design (ICCAD)}, author={Davis, W.R. and Franzon, P. and Francisco, L. and Huggins, B. and Jain, R.}, year={2021} } @misc{sadangi_pasumarthy_pitts_davis_2021, title={FreePDK3: A Novel PDK for Physical Verification at the 3nm Node}, author={Sadangi, S. and Pasumarthy, V. and Pitts, W.S. and Davis, W.R.}, year={2021}, month={May} } @inproceedings{frenkil_dhanwada_davis_ratchkov_2021, title={System-Level Power Analysis with IEEE 2416 Power Models}, author={Frenkil, J. and Dhanwada, N. and Davis, R. and Ratchkov, D.}, year={2021}, month={Mar} } @inproceedings{dhanwada_davis_dhanwada_frenkil_2021, title={UPM/IEEE 2416 Power Modeling Standard: A Practitioner’s Perspective}, author={Dhanwada, N. and Davis, R. and Dhanwada, N. and Frenkil, J.}, year={2021}, month={Jul} } @misc{davis_2021, title={Views from the Cloud}, author={Davis, W.R.}, year={2021}, month={Aug} } @misc{davis_2020, title={A Gentle Introduction to the Open Model Interface for Reliability Simulations}, author={Davis, W.R.}, year={2020}, month={Apr} } @misc{davis_shaw_2020, title={An Industry-standard approach toward modeling device aging}, author={Davis, W.R. and Shaw, C.}, year={2020}, month={Apr} } @article{francisco_lagare_jain_chaudhary_kulkarni_sardana_davis_franzon_2020, title={Design Rule Checking with a CNN Based Feature Extractor}, DOI={10.1145/3380446.3430625}, abstractNote={Design rule checking (DRC) is getting increasingly complex in advanced nodes technologies. It would be highly desirable to have a fast interactive DRC engine that could be used during layout. In this work, we establish the proof of feasibility for such an engine. 
The proposed model consists of a convolutional neural network (CNN) trained to detect DRC violations. The model was trained with artificial data that was derived from a set of 50 SRAM designs. The focus in this demonstration was metal 1 rules. Using this solution, we can detect multiple DRC violations 32x faster than Boolean checkers with an accuracy of up to 92%. The proposed solution can be easily expanded to a complete rule set.}, journal={PROCEEDINGS OF THE 2020 ACM/IEEE 2ND WORKSHOP ON MACHINE LEARNING FOR CAD (MLCAD '20)}, author={Francisco, Luis and Lagare, Tanmay and Jain, Arpit and Chaudhary, Somal and Kulkarni, Madhura and Sardana, Divya and Davis, W. Rhett and Franzon, Paul}, year={2020}, pages={9–14} } @inproceedings{davis_2020, title={EDA Roadmap for Machine Learning & AI Standardization}, author={Davis, W.R.}, year={2020}, month={Jan} } @inproceedings{davis_shaw_hassan_2020, title={How to write a compact reliability model with the Open Model Interface (OMI}, DOI={10.1109/IRPS45951.2020.9128222}, abstractNote={As device structures and process technologies become more complex, they become more expensive to implement. It is therefore essential for profitability to have good yield. Part of achieving this yield is testing for reliability, which is further complicated by new materials being used [1] , [2] . Some EDA vendors have incorporated this need in specific areas but each has a separate solution and nothing in common, leaving the designer with a bewildering choice of different coding-styles to use when testing the reliability on his or her circuit. Semiconductor foundries became aware of this need, and one of them (TSMC [3] ) came up with an approach to modify model parameters through an interface to the simulator called the TSMC Model Interface (TMI). 
This interface was licensed to the Compact Model Coalition (CMC) [4] , which expanded and extended it, releasing it as Open Model Interface (OMI) to be used as a common platform for all foundries and model developers.}, booktitle={2020 IEEE International Reliability Physics Symposium (IRPS)}, author={Davis, W.R. and Shaw, C. and Hassan, A.Ramadan}, year={2020}, month={May} } @inproceedings{baker_davis_dhanwada_frenkil_ratchkov_2020, title={System Level Power Analysis with UPM}, author={Baker, A. and Davis, W.R. and Dhanwada, N. and Frenkil, J. and Ratchkov, D.}, year={2020}, month={Jul} } @article{park_davis_franzon_2019, title={3-D-DATE: A Circuit-Level Three-Dimensional DRAM Area, Timing, and Energy Model}, volume={66}, ISSN={["1558-0806"]}, DOI={10.1109/TCSI.2018.2868901}, abstractNote={In this paper, we present 3-D-DATE, a circuit-level dynamic random access memory (DRAM) area, timing, and energy model that models both the front and back end of 3-D integrated DRAM designs from 90–16 nm, across a broader range of emerging transistor devices and through-silicon vias. This paper improves upon previous studies by providing detailed process models all the way down to the 16-nm technology node and incorporating DRAMs implemented with emerging gate transistor devices. 
Finally, we validate the model against both several commodity planar and 3-D DRAMs, from 80- to 30-nm process nodes, with the following metrics: energy with a mean error of 5%–1% and a standard deviation up to 9.8%, speed with a mean error of 13%–27%, and a standard deviation up to 24% and area within 3%–1% and a standard a standard deviation up to 4.2%.}, number={2}, journal={IEEE TRANSACTIONS ON CIRCUITS AND SYSTEMS I-REGULAR PAPERS}, author={Park, Jong Beom and Davis, William Rhett and Franzon, Paul D.}, year={2019}, month={Feb}, pages={756–768} } @inproceedings{gupta_davis_2019, title={Characterization of Fast, Accurate Leakage Power Models for IEEE P2416}, DOI={10.1109/ISQED.2019.8697565}, abstractNote={Recent results in power modeling for the emerging IEEE Standard Association project P2416 are presented. The standard promises to bring accurate power-modeling capability to system-level digital design tools. This paper presents the leakage power-model characterization approach in detail in the context of a simple four-gate circuit. Simulation results with a commercial 45nm technology over a range of temperatures and supply voltages show agreement with SPICE to within 1.8% on average and 16.6% in the worst case with a speedup of over 15,000X.}, booktitle={20th International Symposium on Quality Electronic Design (ISQED)}, author={Gupta, B. and Davis, W.R.}, year={2019} } @inproceedings{bhanushali_davis_2019, title={Development of FreePDK: An Open-Source Process Design Kit for Advanced Technology Nodes}, author={Bhanushali, K. and Davis, W.R.}, year={2019}, month={Mar} } @inproceedings{huggins_davis_franzon_2019, title={Estimating Pareto Optimum Fronts to Determine Knob Settings in Electronic Design Automation Tools}, DOI={10.1109/ISQED.2019.8697576}, abstractNote={The ability to configure physical design tools is often dependent on the experience and knowledge of the physical designer (PD). 
Technology node sizes are ever decreasing, digital design sizes vary drastically, and design constraints change based on the needs of the application. As these changes occur frequently and physical design times can be extensive, the need for accurate quality of design results early in the design process is crucial. Collecting these metrics is computationally expensive, creating a need to determine how to best create and extract information as design flows change. This paper describes the use of estimated Pareto optimal trade-off sets to provide designers with the capability of visualizing the results of Electronic Design Automation (EDA) tool configuration settings, or “knobs”, that will offer an optimal post detail route design based on two design metrics, critical path length and core area. We will show that when given a set of design constraints by creating a point that occurs along the Pareto front, the knob settings used are optimal. With only 38 samples per design, we were able to produce estimated detail routed design metrics with a worst case error (WCE) of less than 10%.}, booktitle={20th International Symposium on Quality Electronic Design (ISQED)}, author={Huggins, B. and Davis, W.R. 
and Franzon, P.}, year={2019} } @inproceedings{davis_2019, place={Las Vegas, NV}, title={Is Your AI-based EDA Tool Production-Ready?}, author={Davis, W.R.}, year={2019}, month={Jun} } @misc{davis_2019, title={OpenAccess Partitions: How Fast Can We Go?}, author={Davis, W.R.}, year={2019}, month={Oct} } @inproceedings{davis_2019, title={OpenAccess Partitions: How Fast Can We Go?}, author={Davis, W.R.}, year={2019}, month={Jun} } @inproceedings{harris_davis_lipa_pitts_franzon_2019, title={Vertical Stack Thermal Characterization of Heterogeneous Integration and Packages}, DOI={10.1109/3DIC48104.2019.9058784}, abstractNote={This paper presents thermal measurement data of GaN HEMT on CMOS heterogeneous integration (HI) using a Diverse Accessible Heterogeneous Integration (DAHI) process. Thermal T3ster measurements, a product and service available from Mentor are presented. The method uses thermal transients to characterize the vertical thermal path stack including the package. Here the thermal dominance of the thermal interface at the die attachment is apparent. The T3ster measurements are contrasted with in-channel micro-Raman thermal measurements along with simulated results.}, booktitle={2019 International 3D Systems Integration Conference (3DIC)}, publisher={System Integration Conf}, author={Harris, T.R. and Davis, W.R. and Lipa, S. and Pitts, W.S. and Franzon, P.D.}, year={2019} } @misc{davis_2018, title={Physical Design of a Stacked Heterogeneous Multi-Core Processor}, author={Davis, W.R.}, year={2018}, month={Jan} } @inproceedings{srinivasan_chowdhury_forbes_widialaksono_zhang_schabel_ku_lipa_rotenberg_davis_et al._2017, title={H3 (heterogeneity in 3D): A logic-on-logic 3D-stacked heterogeneous multi-core processor}, DOI={10.1109/ICCD.2017.30}, abstractNote={A single-ISA heterogeneous multi-core processor(HMP) [2], [7] is comprised of multiple core types that all implement the same instruction-set architecture (ISA) but have different microarchitectures. 
Performance and energy is optimized by migrating a thread's execution among core types as its characteristics change. Simulation-based studies with two core types, one simple (low power) and the other complex (high performance), has shown that being able to switch cores as frequently as once every 1,000 instructions increases energy savings by 50% compared to switching cores once every 10,000 instructions, for the same target performance [10]. These promising results rely on extremely low latencies for thread migration. Here we present the H3 chip that uses 3D die stacking and novel microarchitecture to implement a heterogeneous multi-core processor (HMP) with low-latency fast thread migration capabilities. We discuss details of the H3 design and present power and performance results from running various benchmarks on the chip. The H3 prototype can reduce power consumption of benchmarks by up to 26%.}, booktitle={2017 IEEE International Conference on Computer Design (ICCD)}, author={Srinivasan, V. and Chowdhury, R. B. R. and Forbes, E. and Widialaksono, R. and Zhang, Z. Q. and Schabel, J. and Ku, S. and Lipa, S. and Rotenberg, E. and Davis, W. R. 
and et al.}, year={2017}, pages={145–152} } @inproceedings{davis_2017, title={Physical Design of a Stacked Heterogeneous Multi-Core Processor}, author={Davis, W.R.}, year={2017}, month={Sep} } @inproceedings{harris_davis_franzon_2016, title={Novel packaging and thermal measurement for 3D heterogeneous stacks}, ISBN={9781509029402}, url={http://dx.doi.org/10.1109/3dpeim.2016.7570543}, DOI={10.1109/3DPEIM.2016.7570543}, abstractNote={Presents a collection of slides covering the following topics: packaging; thermal measurement: 3D heterogeneous stacks; heterogeneous integration; chip analysis; NCSU thermal test vehicle; EFFP thermal property extraction; bonding; calibration; HEMT; CMOS and air bridge metal measurement.}, booktitle={2016 International Symposium on 3D Power Electronics Integration and Manufacturing (3D-PEIM)}, publisher={IEEE}, author={Harris, T.R. and Davis, W.R. and Franzon, P.}, year={2016} } @inproceedings{widialaksono_chowdhury_zhang_schabel_lipa_rotenberg_davis_franzon_2016, title={Physical Design of a 3D-Stacked Heterogeneous Multi-Core Processor}, ISBN={9781509013999}, url={http://dx.doi.org/10.1109/3dic.2016.7970036}, DOI={10.1109/3DIC.2016.7970036}, abstractNote={With the end of Dennard scaling, three dimensional stacking has emerged as a promising integration technique to improve microprocessor performance. In this paper we present a 3D-SIC physical design methodology for a multi-core processor using commercial off-the-shelf tools. We explain the various flows involved and present the lessons learned during the design process. The logic dies were fabricated with GlobalFoundries 130 nm process and were stacked using the Ziptronix face-to-face (F2F) bonding technology. We also present a comparative analysis which highlights the benefits of 3D integration. 
Results indicate an order of magnitude decrease in wirelengths for critical inter-core components in the 3D implementation compared to 2D implementations.}, booktitle={2016 IEEE International 3D Systems Integration Conference (3DIC)}, publisher={IEEE}, author={Widialaksono, R.H. and Chowdhury, R.B.R. and Zhang, Z. and Schabel, J. and Lipa, S. and Rotenberg, E. and Davis, W.R. and Franzon, P}, year={2016} } @inproceedings{harris_pavlidis_wyers_newberry_graham_franzon_davis_2016, title={Thermal raman and IR measurement of heterogeneous integration stacks}, DOI={10.1109/ITHERM.2016.7517727}, abstractNote={Thermal management and planning is important for heterogeneous integration due to the introduction of a complex thermal path. Thermal measurement of operating devices provides necessary data points for future design as well as validation of models. In this paper, two methods for measuring thermal performance of DAHI (Diverse Accessible Heterogeneous Integration) GaN HEMTs are presented and contrasted: IR microscopy and micro Raman spectroscopy. The QFI IR system uses a per-pixel material emissivity flat temperature calibration when the device is in an off-state, and then calculates operating temperatures by CCD exposure. Two separate QFI systems with differing CCD resolutions were used to collect thermal data and are compared. Raman Thermometry by contrast, is a laser point measurement of the frequency shift in scattered photons due to phonon vibrational modes whose frequencies are temperature dependent. Differences in measurements between the two methods arising from the stack of materials used in the DAHI process and their transparency are discussed. A method for measuring the surface temperature of the devices through Raman by the use of TiO2 nanoparticles is also presented in conjunction with a profile of the HEMT. 
Measurements are presented alongside thermal simulation results using prototype software Mentor Graphics™ Calibre®.}, booktitle={2016 15th IEEE Intersociety Conference on Thermal and Thermomechanical Phenomena in Electronic Systems (ITherm)}, author={Harris, T. R. and Pavlidis, G. and Wyers, E. J. and Newberry, D. M. and Graham, S. and Franzon, P. and Davis, W. R.}, year={2016}, pages={1505–1510} } @inproceedings{franzon_rotenberg_davis_tuck_davis_zhou_schabel_zhang_dwiel_forbes_et al._2015, place={Singapore}, title={Computing in 3D}, ISBN={9781467393850}, url={http://dx.doi.org/10.1109/3dic.2015.7334571}, DOI={10.1109/3DIC.2015.7334571}, abstractNote={3DIC technology refers to stacking and interconnecting chips and substrates (“interposers”) with Through Silicon Vias (TSVs). Industry is gearing up for widespread introduction of this technology with the 22 nm node. We have been pursuing a range of approaches to enable low power computing. As well as 3DIC these include heterogeneous computing, powered optimized SIMD units, optimized memory hierarchies, and MPI with post-silicon customized interconnect. Heterogeneous computing refers to the concept of building a mix of CPUs and memories that in turn enable in-situ tuning of the compute load to the compute resources. We introduce the concept of Fast Thread Migration using 3DIC technologies. We present the design of a power optimized SIMD unit in which over half of the power is employed in the FP units. A parallel computer is built using an MPI paradigm. Codes are analyzed so that the MPI interconnect can be power optimized post-silicon. Emerging 3D memories have potential to be employed as Level 2 and Level 3 caches, and this is explored using the Tezzaron 3D memory. As scaling and power optimization occurs, the main memory increasingly dominates the power consumption. 
Possible extensions to Cortical Processing are discussed.}, booktitle={2015 International 3D Systems Integration Conference (3DIC)}, publisher={IEEE}, author={Franzon, Paul D. and Rotenberg, Eric and Davis, W. Rhett and Tuck, James and Davis, W. Rhett and Zhou, Huiyang and Schabel, Joshua and Zhang, Zhenquian and Dwiel, J. Brandon and Forbes, Elliott and et al.}, year={2015} } @inbook{bhanushali_davis_2015, title={FreePDK15: An Open-Source Predictive Process Design Kit for 15nm FinFET Technology}, DOI={10.1145/2717764.2717782}, abstractNote={This paper discusses design rules and layout guidelines for an open source predictive process design kit (PDK) for multi-gate 15nm FinFET devices. Additional design rules are introduced considering process variability, and challenges involved in fabrication beyond 20nm. Particularly, double patterning lithography is assumed and a unique set of design rules are developed for critical dimensions. In order to improve the FinFET layout density, Middle-of-line local interconnect layers are implemented for the FinFET layout. The rules are further validated by running Calibre design-rule checks on Virtuoso layout of an Inverter and NAND4 cells. As part of the validation process, the area of a FreePDK15 inverter was compared to the area of an inverter in 45nm bulk MOS process and the ratio was found to be 1:6. This kit primarily aims to support introduction of sub-20nm FinFET devices into research and universities.}, booktitle={ISPD '15: Proceedings of the 2015 Symposium on International Symposium on Physical Design}, author={Bhanushali, K. 
and Davis, W.R.}, year={2015}, pages={165–170} } @article{harris_wyers_wang_graham_pavlidis_franzon_davis_2015, title={Thermal simulation of heterogeneous GaN/ InP/silicon 3DIC stacks}, ISBN={9781467393850}, url={http://dx.doi.org/10.1109/3dic.2015.7334616}, DOI={10.1109/3DIC.2015.7334616}, abstractNote={Integration of materials such as GaN, InP, SiGe, and Si is a natural extension of the 3D-IC perspective and provides a unique solution for high performance circuits. In this approach, application of a component is no longer dependent on semiconductor material selection. In this paper, preliminary results are presented which examine the thermal performance of the technology. A thermal analysis prototype solution in Mentor Graphics™ Calibre® provides surface heat maps based on IC layout, material property, and geometric configuration files. Chiplets are connected by heterogeneous interconnect (HIC). Differences in thermal performance of GaN and InP chiplets are explored by varying the number of HICs. Two methods for building up the model of a test chip are compared. One method uses custom scripts to place discrete blocks in the model to represent HICs, while the other uses thermal material properties extracted from the layout. Measurements presented confirm simulated results.}, journal={2015 International 3D Systems Integration Conference (3DIC)}, publisher={IEEE}, author={Harris, T.R. and Wyers, E.J. and Wang, L. and Graham, S. and Pavlidis, G. and Franzon, P. and Davis, W.R.}, year={2015}, pages={1–3} } @article{chen_zhu_davis_franzon_2014, title={Adaptive and Reliable Clock Distribution Design for 3-D Integrated Circuits}, volume={4}, ISSN={["2156-3985"]}, DOI={10.1109/tcpmt.2014.2361356}, abstractNote={In this paper, we present novel techniques to handle the complexity and challenges in clock distribution for 3-D integrated circuit. First, we propose a novel active deskew technique to adaptively mitigate the cross-tier variations and the 3-D wiring asymmetry. 
The new deskew technique neither relies on an accurate through-silicon-vias model nor an accurate reference clock distribution. Second, we design a phase-mixer-based tunable-delay-buffer (TDB), which can be linearly tuned in 360° and tolerant to process-voltage-termperature (PVT) variations. Third, based on the new deskew technique and TDB design, we propose an efficient clock distribution network topology, which can be realized without a need of balanced H-tree. Moreover, a thermal profile-based optimization flow is developed to further improve the power efficiency and reduce design overhead. A case study shows that the proposed techniques are able to largely improve the clock skews. The optimization flow is capable of reducing the design cost to achieve a better tradeoff of the skew performance and the design overhead.}, number={11}, journal={IEEE TRANSACTIONS ON COMPONENTS PACKAGING AND MANUFACTURING TECHNOLOGY}, author={Chen, Xi and Zhu, Ting and Davis, William Rhett and Franzon, Paul D.}, year={2014}, month={Nov}, pages={1862–1870} } @inbook{priyadarshi_hu_steer_franzon_davis_2014, place={Boca Raton, FL}, title={Electrothermal Simulation of Three Dimensional Integrated Circuits}, ISBN={9781466589407 9781466589421}, booktitle={Design of 3D Integrated Circuits and Systems}, publisher={CRC Press}, author={Priyadarshi, S. and Hu, J. and Steer, M.B. and Franzon, P.D. and Davis, W.R.}, editor={Sharma, R.Editor}, year={2014} } @inproceedings{priyadarshi_davis_franzon_2014, title={Pathfinder3D: A framework for exploring early thermal tradeoffs in 3DIC}, ISBN={9781479921539}, url={http://dx.doi.org/10.1109/icicdt.2014.6838612}, DOI={10.1109/icicdt.2014.6838612}, abstractNote={Three dimensional integration technologies offer significant potential to improve performance, performance per unit power and integration density. 
However, increased power density and thermal resistances leading to higher on-chip temperature is imposing several implementation challenges and restricting widespread adaptation of this technology. This necessitates the need for CAD flows and tools facilitating early thermal evaluation of possible 3D design choices and thermal management techniques. This paper presents a CAD flow and associated framework called Pathfinder3D, which facilitates physically-aware system-level thermal simulation of 3DICs. Usage of Pathfinder3D is shown using a case study comparing thermal profiles of 2D and three 3D implementations of a quadcore chip multiprocessor.}, booktitle={2014 IEEE International Conference on IC Design & Technology}, publisher={IEEE}, author={Priyadarshi, Shivam and Davis, W. Rhett and Franzon, Paul D.}, year={2014}, month={May} } @article{priyadarshi_davis_steer_franzon_2014, title={Thermal Pathfinding for 3-D ICs}, volume={4}, ISSN={["2156-3985"]}, DOI={10.1109/tcpmt.2014.2321005}, abstractNote={System architects traditionally use high-level models of component blocks to predict trends for various design metrics. However, with continually increasing design complexity and a confusing array of manufacturing choices, system-level design decisions cannot be made without considering physical-level details. This effect is more pronounced for 3-D integrated circuits (ICs) because it provides a plethora of physical-level design choices, such as the number of stacking layers and the type of 3-D bonding method, along with the choices provided by 2-D ICs. Thus, it is necessary for system-level flows to predict the complex interactions among system performance, power, temperature, floorplanning, process technology, computer architecture, and software/workloads. This is often called pathfinding. This paper presents a pathfinding flow that integrates SystemC transaction-level electrical and dynamic thermal simulations. 
The goal of this flow is to pass complex physical constraints to system architects in a convenient form. The applicability of the proposed flow is shown using an example stacking of two processor cores and L2 cache in two-tier 3-D stack.}, number={7}, journal={IEEE TRANSACTIONS ON COMPONENTS PACKAGING AND MANUFACTURING TECHNOLOGY}, author={Priyadarshi, Shivam and Davis, W. Rhett and Steer, Michael B. and Franzon, Paul D.}, year={2014}, month={Jul}, pages={1159–1168} } @inproceedings{dhanwada_davis_frenkil_2014, title={Towards a Standard Flow for System Level Power Modeling}, DOI={10.1109/ICCAD.2014.7001333}, abstractNote={Power efficiency is a key design objective for most SoCs today and designers continue to search for new approaches to low power design. As transistor level, gate level and RTL methods have become well understood and widely adopted, interest has grown in power aware system design. This interest has arisen along with the overall growth and adoption of SystemC for functional modeling and simulation. In a comprehensive power aware flow, power analyses and optimizations occur during all three major design phases: System Design, RTL Design, and Implementation. These activities require models that represent the power characteristics of each design element. However, unlike RTL Design and Implementation, System Design has no standard power modeling or analysis mechanisms. This lack of abstract, system level power models inhibits system level power analysis: where models are unavailable the flow is unrealized, where models are available the accuracy and flexibility is often limited. This issue motivated the development of modeling capabilities for IP block abstract power models for use in all phases of SoC design. This development built upon existing gate level modeling semantics and flows. This presentation will begin with an overview of existing gate level power modeling capabilities, using the Liberty modeling language as the example. 
The interpretation of the models by power calculation applications will be described, including the interaction between power models and simulation data. Requirements beyond the existing gate level capabilities will be described. Key requirements include black-box and grey-box modeling styles, methods for handling the exponential explosion of power states and power state transitions, automatic model generation, power component categorization, and descriptions of power structure and power operation. Some of these requirements have already been implemented while others are in the proposal stage. Example usage of such a system level model will be illustrated with a Transaction Level (TLM) Simulation. The example will illustrate how the model is used to produce dynamic and leakage power calculations from the TLM simulation data.},
  booktitle = {2014 IEEE/ACM International Conference on Computer-Aided Design (ICCAD)},
  author = {Dhanwada, N. and Davis, R. and Frenkil, J.},
  year = {2014}
}

% The exported key of the next entry ended in "_et al._2013"; a citation key
% cannot contain a space (BibTeX stops reading the key there and rejects the
% entry), so it is written as "_et_al_2013".
@inproceedings{franzon_rotenberg_tuck_davis_zhou_schabel_zhang_park_dwiel_forbes_et_al_2013,
  title = {Applications and design styles for {3DIC}},
  ISBN = {9781479923069},
  url = {http://dx.doi.org/10.1109/iedm.2013.6724717},
  DOI = {10.1109/IEDM.2013.6724717},
  abstractNote = {3D technologies offer significant potential to improve raw performance and performance per unit power. After exploiting TSV technologies for cost reduction and increasing memory bandwidth, the next frontier is to create more sophisticated solutions that promise further increases in power/performance beyond those attributable to memory interfaces alone. These include heterogeneous integration and exploitation of the high amounts of interconnect available to provide for customization. Challenges include the creation of physical standards and the design of sophisticated static and dynamic thermal management methods.},
  booktitle = {2013 IEEE International Electron Devices Meeting},
  publisher = {IEEE},
  author = {Franzon, P. D. and Rotenberg, E.
and Tuck, J. and Davis, W. R. and Zhou, H. and Schabel, J. and Zhang, Z. and Park, J. and Dwiel, B. and Forbes, E. and others},
  year = {2013}
}

@inproceedings{tshibangu_franzon_rotenberg_davis_2013,
  title = {Design of controller for {L2} cache mapped in {Tezzaron} stacked {DRAM}},
  DOI = {10.1109/3dic.2013.6702397},
  abstractNote = {3DIC technology allows implementation of fast and dense memory by allowing multiple layers of DRAM to be fabricated in a single die called Die-stacking technology. This creates opportunity to explore usage of DRAM as fast last level cache by exploiting mapping of data and tag in the same bank. This Paper investigates the implementation of such a cache controller using 3-layer 256 MB Tezzaron Octopus stacked DRAM. This memory provides a fast data access through burst-4 and burst-8 mode. To avoid multiple row activation, the entire set is confined in one row of 2KB. For a 64B cache block, 32 lines of data can be obtained in one row. In this design, only two cache blocks are used for tag while 30 blocks are used for data yielding a 30-way set associative L2 cache. Given the performance of Tezzaron memory, a low hit time of approximately 20 cycles was achieved. This hit latency includes precharge and row activation delays. This access latency was used in Gem5 full-system simulator to estimate the performance compared to a standard 2D SRAM L2 cache. An average of 15% on performance is achieved on different benchmarks while providing an average 27% on energy saving.},
  booktitle = {2013 IEEE International 3D Systems Integration Conference (3DIC)},
  author = {Tshibangu, N. M. and Franzon, P. D. and Rotenberg, E. and Davis, W.
R.},
  year = {2013},
  month = oct
}

@inproceedings{franzon_priyadarshi_lipa_davis_thorolfsson_2013,
  title = {Exploring early design tradeoffs in {3DIC}},
  DOI = {10.1109/iscas.2013.6571901},
  abstractNote = {The key to gaining substantial benefit from the use of 3DIC technology is to create 3D specific designs that do more than recast a 2D optimal design into the third dimension. This paper explores some of the approaches to creating 3D specific designs and the CAD tools that can help in that exploration. The power advantages of 3D design are illustrated in details. Results from different partitioning approaches (function, modular and circuit) are presented, together with early results from a thermal pathfinding tool.},
  booktitle = {2013 IEEE International Symposium on Circuits and Systems (ISCAS)},
  author = {Franzon, P. D. and Priyadarshi, S. and Lipa, S. and Davis, W. R. and Thorolfsson, T.},
  year = {2013},
  pages = {545--549}
}

@inproceedings{priyadarshi_choudhary_dwiel_upreti_rotenberg_davis_franzon_2013,
  title = {{Hetero$^2$} {3D} integration: A scheme for optimizing efficiency/cost of chip multiprocessors},
  DOI = {10.1109/isqed.2013.6523582},
  abstractNote = {Timing the transition of a processor design to a new technology poses a provocative tradeoff. On the one hand, transitioning as early as possible offers a significant competitive advantage, by bringing improved designs to market early. On the other hand, an aggressive strategy may prove to be unprofitable, due to the low manufacturing yield of a technology that has not had time to mature. We propose exploiting two complementary forms of heterogeneity to profitably exploit an immature technology for Chip Multiprocessors (CMP). First, 3D integration facilitates a technology alloy. The CMP is split across two dies, one fabricated in the old technology and the other in the new technology. The alloy derives benefit from the new technology while limiting cost exposure.
Second, to compensate for lower efficiency of old-technology cores, we exploit application and microarchitectural heterogeneity: applications which gain less from technology scaling are scheduled on old-technology cores, moreover, these cores are retuned to optimize this class of application. For a defect density ratio of 200 between 45nm and 65nm, Hetero2 3D gives 3.6× and 1.5× higher efficiency/cost compared to 2D and 3D homogeneous implementations, respectively, with only 6.5% degradation in efficiency. We also present a sensitivity analysis by sweeping the defect density ratio. The analysis reveals the defect density break-even points, where homogeneous 2D and 3D designs in 45nm achieve the same efficiency/cost as Hetero2 3D, marking significant points in the maturing of the technology.},
  booktitle = {Proceedings of the fourteenth international symposium on quality electronic design (ISQED 2013)},
  author = {Priyadarshi, S. and Choudhary, N. and Dwiel, B. and Upreti, A. and Rotenberg, E. and Davis, R. and Franzon, P.},
  year = {2013},
  pages = {1--7}
}

% The exported key of the next entry ended in "_et al._2013"; a citation key
% cannot contain a space, so it is written as "_et_al_2013".
@inproceedings{rotenberg_dwiel_forbes_zhang_widialaksono_chowdhury_tshibangu_lipa_davis_franzon_et_al_2013,
  title = {Rationale for a {3D} heterogeneous multi-core processor},
  ISBN = {9781479929870},
  url = {http://dx.doi.org/10.1109/iccd.2013.6657038},
  DOI = {10.1109/ICCD.2013.6657038},
  abstractNote = {Single-ISA heterogeneous multi-core processors are comprised of multiple core types that are functionally equivalent but microarchitecturally diverse. This paradigm has gained a lot of attention as a way to optimize performance and energy. As the instruction-level behavior of the currently executing program varies, it is migrated to the most efficient core type for that behavior.},
  booktitle = {2013 IEEE 31st International Conference on Computer Design (ICCD)},
  publisher = {IEEE},
  author = {Rotenberg, Eric and Dwiel, Brandon H.
and Forbes, Elliott and Zhang, Zhenqian and Widialaksono, Randy and Chowdhury, Rangeen Basu Roy and Tshibangu, Nyunyi and Lipa, Steve and Davis, W. Rhett and Franzon, Paul and others},
  year = {2013},
  pages = {154--168}
}

% The exported key of the next entry ended in "_et al._2012"; a citation key
% cannot contain a space, so it is written as "_et_al_2012".
@article{harris_priyadarshi_melamed_ortega_manohar_dooley_kriplani_davis_franzon_steer_et_al_2012,
  title = {A Transient Electrothermal Analysis of Three-Dimensional Integrated Circuits},
  volume = {2},
  ISSN = {2156-3985},
  DOI = {10.1109/tcpmt.2011.2178414},
  abstractNote = {A transient electrothermal simulation of a 3-D integrated circuit (3DIC) is reported that uses dynamic modeling of the thermal network and hierarchical electrothermal simulation. This is a practical alternative to full transistor electrothermal simulations that are computationally prohibitive. Simulations are compared to measurements for a token-generating asynchronous 3DIC clocking at a maximum frequency of 1 GHz. The electrical network is based on computationally efficient electrothermal macromodels of standard and custom cells. These are linked in a physically consistent manner with a detailed thermal network extracted from an OpenAccess layout file. Coupled with model-order reduction techniques, hierarchical dynamic electrothermal simulation of large 3DICs is shown to be tractable, yielding spatial and temporal selected transistor-level thermal profiles.},
  number = {4},
  journal = {IEEE TRANSACTIONS ON COMPONENTS PACKAGING AND MANUFACTURING TECHNOLOGY},
  author = {Harris, T. R. and Priyadarshi, S. and Melamed, S. and Ortega, C. and Manohar, R. and Dooley, S. R. and Kriplani, N. M. and Davis, W. R. and Franzon, Paul and Steer, M. B.
and others},
  year = {2012},
  month = apr,
  pages = {660--667}
}

@article{moezzi-madani_thorolfsson_chiang_davis_2012,
  title = {Area-Efficient Antenna-Scalable {MIMO} Detector for {K-best} Sphere Decoding},
  volume = {68},
  ISSN = {1939-8115},
  DOI = {10.1007/s11265-011-0595-9},
  number = {2},
  journal = {JOURNAL OF SIGNAL PROCESSING SYSTEMS FOR SIGNAL IMAGE AND VIDEO TECHNOLOGY},
  author = {Moezzi-Madani, Nariman and Thorolfsson, Thorlindur and Chiang, Patrick and Davis, William Rhett},
  year = {2012},
  month = aug,
  pages = {171--182}
}

% The exported key of the next entry ended in "_et al._2012"; a citation key
% cannot contain a space, so it is written as "_et_al_2012".
@article{priyadarshi_harris_melamed_otero_kriplani_christoffersen_manohar_dooley_davis_franzon_et_al_2012,
  title = {Dynamic electrothermal simulation of three-dimensional integrated circuits using standard cell macromodels},
  volume = {6},
  ISSN = {1751-858X},
  url = {http://dx.doi.org/10.1049/iet-cds.2011.0061},
  DOI = {10.1049/iet-cds.2011.0061},
  abstractNote = {Physics-based compact electrothermal macromodels of standard cells are developed for fast dynamic simulation of three-dimensional integrated circuits (3DICs). Such circuits can have high thermal densities and thermal effects often limit their performance. The macromodels developed here use fewer state-variables than a discrete transistor-level implementation while retaining transistor-level accuracy. This results in significant speed-up over transistor-level simulation for large-scale circuits. The macromodel-based methodology enables robust and significantly faster dynamic electrothermal simulation over the long times required for thermal transients to subside. Consequently, transient junction temperature can be examined in the design phase. Simulated junction and measured surface thermal transients are compared.},
  number = {1},
  journal = {IET Circuits, Devices \& Systems},
  publisher = {Institution of Engineering and Technology (IET)},
  author = {Priyadarshi, S. and Harris, T. R. and Melamed, S. and Otero, C. and Kriplani, N. M. and Christoffersen, C. E. and Manohar, R. and Dooley, S. R. and Davis, W. R. and Franzon, P. D.
and others},
  year = {2012},
  pages = {35}
}

@article{melamed_thorolfsson_harris_priyadarshi_franzon_steer_davis_2012,
  title = {Junction-level thermal analysis of {3-D} integrated circuits using high definition power blurring},
  volume = {31},
  DOI = {10.1109/tcad.2011.2180384},
  abstractNote = {The degraded thermal path of 3-D integrated circuits (3DICs) makes thermal analysis at the chip-scale an essential part of the design process. Performing an appropriate thermal analysis on such circuits requires a model with junction-level fidelity; however, the computational burden imposed by such a model is tremendous. In this paper, we present enhancements to two thermal modeling techniques for integrated circuits to make them applicable to 3DICs. First, we present a resistive mesh-based approach that improves on the fidelity of prior approaches by constructing a thermal model of the full structure of 3DICs, including the interconnect. Second, we introduce a method for dividing the thermal response caused by a heat load into a high fidelity “near response” and a lower fidelity “far response” in order to implement Power Blurring high definition (HD), a hierarchical thermal simulation approach based on Power Blurring that incorporates the resistive mesh-based models and allows for junction-level accuracy at the full-chip scale. The Power Blurring HD technique yields approximately three orders of magnitude of improvement in memory usage and up to six orders of magnitude of improvement in runtime for a three-tier synthetic aperture radar circuit, as compared to using a full-chip junction-scale resistive mesh-based model.
Finally, measurement results are presented showing that Power Blurring high definition (HD) accurately determines the shape of the thermal profile of the 3DIC surface after a correction factor is added to adjust for a discrepancy in the absolute temperature values.},
  number = {5},
  journal = {IEEE Transactions on Computer-aided Design of Integrated Circuits and Systems},
  author = {Melamed, S. and Thorolfsson, T. and Harris, T. R. and Priyadarshi, S. and Franzon, Paul and Steer, M. B. and Davis, W. R.},
  year = {2012},
  pages = {676--689}
}

@article{dhanwada_hathaway_frenkil_davis_demircioglu_2012,
  title = {Leakage Power Contributor Modeling},
  volume = {29},
  ISSN = {0740-7475},
  DOI = {10.1109/mdt.2012.2183573},
  abstractNote = {Low-power or power-aware design is one of the greatest challenges facing the semiconductor industry. The fidelity of low power design is dependent on the accuracy of power modeling across a wide range of PVT values. This paper describes an alternative “power contributor”-based approach to cell leakage characterization that exploits inherent separability of power consumption for different portions of a cell. An experimental use of this approach is also presented that demonstrates how the effort to characterize leakage power can be greatly reduced with only a marginal impact on accuracy.},
  number = {2},
  journal = {IEEE DESIGN \& TEST OF COMPUTERS},
  author = {Dhanwada, Nagu and Hathaway, David and Frenkil, Jerry and Davis, W. Rhett and Demircioglu, Harun},
  year = {2012},
  pages = {71--78}
}

@inproceedings{davis_2012,
  title = {Modeling Power Variability (from Small to Large)},
  author = {Davis, W. R.},
  year = {2012},
  month = oct
}

@article{priyadarshi_saunders_kriplani_demircioglu_davis_franzon_steer_2012,
  title = {Parallel Transient Simulation of Multiphysics Circuits Using Delay-Based Partitioning},
  volume = {31},
  ISSN = {1937-4151},
  DOI = {10.1109/tcad.2012.2201156},
  abstractNote = {A parallel transient simulation technique for multiphysics circuits is presented.
The technique develops partitions utilizing the inherent delay present within a circuit and between physical domains. A state-variable-based circuit delay element is presented, which implements the coupling between two spatially or temporally isolated circuit partitions. A parallel delay-based iterative approach for interfacing delay-partitioned subcircuits is applied, which achieves the reasonable accuracy of nonparallel circuit simulation if both incorporate the same interblock delay. The partitioned subcircuits are distributed to different cores of a shared-memory multicore processor and solved in parallel. A multithreaded implementation of the methodology using OpenMP is presented. Examples showing superlinear speedup compared to unpartitioned single-core simulation using the direct method are presented. This paper also discusses the impact of load balancing and absolute delay on simulation speedup.},
  number = {10},
  journal = {IEEE TRANSACTIONS ON COMPUTER-AIDED DESIGN OF INTEGRATED CIRCUITS AND SYSTEMS},
  author = {Priyadarshi, Shivam and Saunders, Christopher S. and Kriplani, Nikhil M. and Demircioglu, Harun and Davis, W. Rhett and Franzon, Paul D. and Steer, Michael B.},
  year = {2012},
  month = oct,
  pages = {1522--1535}
}

% Conference paper: typed as inproceedings with the venue in booktitle
% (the export had it as an article with the conference name in journal).
@inproceedings{priyadarshi_hu_choi_melamed_chen_davis_franzon_2012,
  title = {{Pathfinder} {3D}: A Flow for System-Level Design Space Exploration},
  ISBN = {9781467321907 9781467321891 9781467321884},
  url = {http://dx.doi.org/10.1109/3dic.2012.6262961},
  DOI = {10.1109/3DIC.2012.6262961},
  abstractNote = {Three dimensional integration technology has the potential to provide enhanced performance and device density gains beyond that available from technology scaling alone. However, it provides plethora of design choices for system designers. The full exploitation of the benefits of 3D integration requires a system-level exploration flow which can facilitate in finding an optimal 3D design by comparing possible early design choices. In this paper we present a flow for fast system-level exploration useful for path finding studies. The flow enables users to explore the tradeoff between different stacking and partitioning schemes in terms of performance, power, and temperature. We also present a free open source design kit compiler, FreePDK3D45 and a tool for fast floorplan evaluation of TSV-based digital architectures, Pathfinder3D. The open source design kit and architecture evaluator can help the community to research, learn and explore the various aspects of 3D integration. Using the proposed flow and design kit, we present a case study of 3D integration of a Network on Chip. This case study demonstrates system-level comparisons of the performance, power and temperature of different homogenously partitioned stacking schemes.},
  booktitle = {2011 IEEE International 3D Systems Integration Conference (3DIC)},
  publisher = {IEEE},
  author = {Priyadarshi, S. and Hu, J. and Choi, W. H. and Melamed, S. and Chen, X. and Davis, W. R. and Franzon, P. D.},
  year = {2012}
}

@inproceedings{franzon_davis_thorolfsson_melamed_2011,
  title = {{3D} specific systems: Design and {CAD}},
  DOI = {10.1109/ats.2011.99},
  abstractNote = {3D stacking and integration can provide significant system advantages. Following a brief technology review, this abstract explores application drivers, design and CAD for 3D ICs. The main 3D exploitation explored in detail is that of logic on memory. This application is explored in a specific DSP example, showing a 25% power advantage when implemented in 3D compared with 2D. Finally critical areas that need better solutions are explored. These include cost management, design planning, test management, and thermal management.},
  booktitle = {2011 Asian Test Symposium},
  author = {Franzon, P. D. and Davis, W. R. and Thorolfsson, T.
and Melamed, S.},
  year = {2011},
  pages = {470--473}
}

@inproceedings{chen_davis_franzon_2011,
  title = {Adaptive clock distribution for {3D} integrated circuits},
  DOI = {10.1109/epeps.2011.6100195},
  abstractNote = {Clock distribution in three-dimensional integrated circuits (3D ICs) is faced with many challenges. In this work, we present new techniques for realizing highly adaptive and reliable clock distribution for 3D ICs. Firstly, an efficient clock distribution topology without need of balanced H-tree is proposed. Secondly, a robust tunable-delay-buffer (TDB) circuit and a novel active de-skew method are developed in order to handle the cross-die variations, thermal gradients, and wiring asymmetry. Moreover, a design optimization flow is constructed for improving the adaptive clock design based on the thermal profiles. Experiment results show that the clock skews are significantly reduced using the proposed techniques.},
  booktitle = {2011 IEEE 20th Conference on Electrical Performance of Electronic Packaging and Systems},
  author = {Chen, X. and Davis, W. R. and Franzon, P. D.},
  year = {2011},
  pages = {91--94}
}

@inproceedings{moezzi-madani_thorolfsson_crop_chiang_davis_2011,
  title = {An energy-efficient {64-QAM} {MIMO} detector for emerging wireless standards},
  DOI = {10.1109/DATE.2011.5763050},
  abstractNote = {A power/area aware design is mandatory for the MIMO (Multi-Input Multi-Output) detectors used in LTE and WiMAX standards. The 64-QAM modulation used in the MIMO detector requires more detection effort compared to the smaller constellation sizes widely implemented in the literature. In this work we propose a new architecture for the K-best detector, which unlike the popular multi-stage architecture used for K-best detectors, implements just one core. Also, we introduce a slight modification to the K-best algorithm that reduces the number of multiplications by 44%, and reduces the total power consumption by 27%, without any noticeable performance degradation.
The overall architecture consumes only 24KGate, which is the smallest area compared to the other implementations in the literature. It also results in an at least 4-fold greater throughput-efficiency (Mbps/KiloGate) compared to the other detectors, while consuming a small power. The decoder implemented in a commercial 130nm process provides a data-rate of 107Mbps, and consumes 54.4mW.},
  booktitle = {2011 Design, Automation \& Test in Europe},
  author = {Moezzi-Madani, N. and Thorolfsson, T. and Crop, J. and Chiang, P. and Davis, W. R.},
  year = {2011}
}

@misc{davis_2011,
  title = {Architecture, Design, and {CAD} for {3D-ICs}},
  author = {Davis, W. R.},
  year = {2011},
  month = oct
}

% Conference paper: typed as inproceedings with the venue in booktitle
% (the export had it as an article with the conference name in journal).
@inproceedings{franzon_davis_zhou_priyadarshi_hogan_karnik_srinivas_2011,
  title = {Coordinating {3D} designs: Interface {IP}, standards or free form?},
  ISBN = {9781467321907 9781467321891 9781467321884},
  url = {http://dx.doi.org/10.1109/3dic.2012.6262960},
  DOI = {10.1109/3DIC.2012.6262960},
  abstractNote = {Three dimensional integration technology introduces new complexities to design and particularly codesign. Additional complexity is added when one considers that the design needs to be “future-proof”. How do you ensure that the 3D chip stack will work for future chips within the stack, whose parameters are yet to be fully anticipated. This paper proposes that this be managed through an Interface IP approach Design blocks with associated properties that not only supports signaling and power delivery but also constraints that must be managed between chips both during design but also in-situ and as part of physical verification.},
  booktitle = {2011 IEEE International 3D Systems Integration Conference (3DIC)},
  publisher = {IEEE},
  author = {Franzon, P. D. and Davis, W. R. and Zhou, Zheng and Priyadarshi, S. and Hogan, M. and Karnik, T. and Srinivas, G.},
  year = {2011}
}

% Chapter in an edited book: incollection, so that title is the chapter
% title and booktitle the book title under classic BibTeX styles.
@incollection{franzon_davis_thorolfsson_2011,
  address = {New York},
  title = {Design and Computer Aided Design of {3DIC}},
  DOI = {10.1007/978-1-4419-7618-5_4},
  abstractNote = {This chapter reviews the process of 3DIC designing exploiting Through Silicon Via (TSV) technology. The chapter introduces the notion of re-architecting systems explicitly to exploit high density TSV processes. A particular focus is on (redesigned) memory on top of logic. This article also serves as a tutorial for the design of 3D specific systems.},
  booktitle = {3D Integration for NoC-based SoC Architectures},
  series = {Integrated Circuits and Systems},
  publisher = {Springer},
  author = {Franzon, P. D. and Davis, W. Rhett and Thorolfsson, Thor},
  editor = {Sheibanyrad, A. and P{\'e}trot, F. and Jantsch, A.},
  year = {2011},
  pages = {75--88}
}

@incollection{davis_demircioglu_2011,
  address = {Boston, MA},
  title = {Predictive Process Design Kits},
  DOI = {10.1007/978-1-4614-0445-3_8},
  abstractNote = {For nearly half a century, semiconductor technology has continued to deliver exponential growth in the number of transistors on a chip. Even in the 22 nm processes of today, with exponentially increasing costs of research and development, masks, and design, transistors are still cheaper and denser than in previous process nodes. However, the cracks are showing in the industry’s armor. Prior to 2005, each technology generation brought not only lower cost, but also more speed and less power consumption. Today, designers must be much more creative to balance the competing customer needs of cost, speed, and power. One size no longer fits all.},
  booktitle = {Predictive Technology Model for Robust Nanoelectronic Design},
  series = {Integrated Circuits and Systems},
  publisher = {Springer},
  author = {Davis, W. R.
and Demircioglu, H.},
  editor = {Cao, Yu},
  year = {2011},
  pages = {121--140}
}

% Key was "davis_2011", which repeats the key of an earlier misc entry in
% this file; suffixed with "b" so both entries can be cited.
@inproceedings{davis_2011b,
  title = {Test \& Reliability Challenges in {3D} {NoCs}},
  author = {Davis, W. R.},
  year = {2011},
  month = jun
}

@article{chen_zhu_davis_2011,
  title = {Three-dimensional {SRAM} design with on-chip access time measurement},
  volume = {47},
  ISSN = {1350-911X},
  DOI = {10.1049/el.2010.3701},
  abstractNote = {An SRAM design in a 3D 0.18 µm silicon-on-insulator technology is presented. A novel delay-locked loop based access time measurement circuit was designed on-chip for accurately evaluating the 3D SRAM performance. Results show that a 32% improvement in the access time is gained by using 3D technology.},
  number = {8},
  journal = {ELECTRONICS LETTERS},
  author = {Chen, X. and Zhu, T. and Davis, W. R.},
  year = {2011},
  month = apr,
  pages = {485--486}
}

@inproceedings{moezzi-madani_thorolfsson_davis_2010,
  title = {A Low-Area Flexible {MIMO} Detector for {WiFi/WiMAX} Standards},
  DOI = {10.1109/DATE.2010.5457073},
  abstractNote = {MIMO wireless technology is required to increase the data rates for a broad range of applications, including low cost mobile devices. In this paper we present a very low area reconfigurable MIMO detector which achieves a high throughput of 103Mbps and uses 27 Kilo Gates when implemented in a commercial 180nm CMOS process. The low area is achieved by the proposed in-place architecture. This architecture implements the K-best algorithm and reduces area 4-fold compared to the widely used multi-stage architecture, while provides reconfigurability in terms of antenna configuration during real-time operation.},
  booktitle = {2010 Design, Automation \& Test in Europe Conference \& Exhibition (DATE 2010)},
  author = {Moezzi-Madani, N. and Thorolfsson, T.
and Davis, W. R.},
  year = {2010}
}

% Key was "moezzi-madani_thorolfsson_davis_2010", which repeats the key of
% the preceding entry; suffixed with "b" so both entries can be cited.
@inproceedings{moezzi-madani_thorolfsson_davis_2010b,
  title = {Algorithm and hardware complexity reduction techniques for k-best sphere decoders},
  DOI = {10.1145/1785481.1785589},
  abstractNote = {MIMO (Multi-Input Multi-Output) technology is garnering more interest in new wireless communication standards. In this work, we introduce three techniques to reduce the power consumption of MIMO detectors and increase their data rate. We decrease the complexity of the K-best sphere decoder effectively by using the MMSE-SQRD channel processing technique. This technique results in a smaller K which results in a great reduction in power consumption compared to the K-best detectors using ZF-SQRD with the same BER/throughput performance. We also propose a child reduction technique that reduces the number of multiplications and additions which results in 8% power reduction. Also, we utilized the odd-even merge algorithm for the merge unit which is on the critical path of the circuit, to achieve the best power/throughput tradeoff. We implemented a 4*4 16QAM detector in a commercial 0.18µm CMOS process; synthesis results show that the detector works at the maximum data rate of 768Mbps with the area of 91KGates.},
  booktitle = {GLSVLSI '10: Proceedings of the 20th symposium on Great lakes symposium on VLSI},
  author = {Moezzi-Madani, N. and Thorolfsson, T. and Davis, W. R.},
  year = {2010},
  month = may,
  pages = {471--476}
}

@inproceedings{franzon_davis_thorolfsson_melamed_2010,
  title = {Creating {3D} Specific Systems: Architecture, Design and {CAD}},
  volume = {2010},
  DOI = {10.4071/isom-2010-TA1-Paper4},
  abstractNote = {3D stacking and integration can provide significant system advantages. Following a brief technology review, this abstract explores application drivers, design and CAD for 3D ICs. The main 3D exploitation explored in detail is that of logic on memory.
This application is explored in a specific DSP example, showing a 25% power advantage when implemented in 3D compared with 2D. Finally critical areas that need better solutions are explored. These include cost management, design planning, test management, and thermal management.},
  number = {1},
  booktitle = {International Symposium on Microelectronics},
  author = {Franzon, P. and Davis, W. R. and Thorolfsson, T. and Melamed, S. L.},
  year = {2010},
  month = nov,
  pages = {23--27}
}

@inproceedings{franzon_davis_thorolfsson_2010,
  title = {Creating {3D} Specific Systems: Architecture, Design, and {CAD}},
  ISBN = {9783981080162 9781424470549},
  url = {http://dx.doi.org/10.1109/date.2010.5457086},
  DOI = {10.1109/DATE.2010.5457086},
  abstractNote = {3D stacking and integration can provide system advantages. Following a brief technology review, this abstract explores application drivers, design and CAD for 3D ICs. The main application area explored in detail is that of logic on memory. This application is explored in a specific DSP example. Finally critical areas that need better solutions are explored. These include design planning, test management, and thermal management.},
  booktitle = {2010 Design, Automation \& Test in Europe Conference \& Exhibition},
  publisher = {IEEE},
  author = {Franzon, P. D. and Davis, W. R. and Thorolfsson, T.},
  year = {2010},
  pages = {1684--1688}
}

@inproceedings{melamed_thorolfsson_srinivasan_cheng_franzon_davis_2010,
  title = {Investigation of tier-swapping to improve the thermal profile of memory-on-logic {3DICs}},
  author = {Melamed, S. and Thorolfsson, T. and Srinivasan, A. and Cheng, E. and Franzon, P.
and Davis, W. R.},
  year = {2010},
  month = oct
}

@article{thorolfsson_melamed_davis_franzon_2010,
  title = {Low Power Hypercube Divided Memory {FFT} Engine Using {3D} Integration},
  volume = {16},
  DOI = {10.1145/1870109.1870114},
  abstractNote = {In this article we demonstrate a floating point FFT processor that leverages both 3D integration and a unique hypercube memory division scheme to reduce the power consumption of a 1024 point FFT down to 4.227 μJ. The hypercube memory division scheme lowers the energy per memory access by 59.2% and increases the total required area by 16.8%. The use of 3D integration reduces the logic power by 5.2%. We describe the tool flow required to realize the 3D implementation and perform a thermal analysis of it.},
  number = {1},
  journal = {ACM Transactions on Design Automation of Electronic Systems},
  author = {Thorolfsson, T. and Melamed, S. and Davis, W. R. and Franzon, P. D.},
  year = {2010},
  month = nov,
  pages = {1--25}
}

@inproceedings{davis_2010,
  title = {Modeling Layout-Dependent Stress Effects: Opportunities for {OpenDFM}},
  author = {Davis, W. R.},
  year = {2010},
  month = jun
}

@inproceedings{chen_davis_2010,
  title = {Thermal Adaptive Clock Design for {3D} Integrated Circuits},
  author = {Chen, X. and Davis, W. R.},
  year = {2010},
  month = sep
}

@inproceedings{harris_melamed_luniya_davis_steer_doxsee_obermiller_hawkinson_2010,
  title = {Thermal analysis and verification of a mounted monolithic integrated circuit},
  ISSN = {1558-058X},
  DOI = {10.1109/SECON.2010.5453924},
  abstractNote = {As circuit density increases and high-power applications are facilitated, thermal considerations become paramount a design concern. In this paper, a high power monolithic microwave integrated circuit (MMIC) is modeled by the fREEDA multi-physics simulator and measured for validation. While validation is the crux of any simulation model, it is known that thermal measurements accurate to a high resolution are problematic.
As such, the thermal profile of integrated circuits cannot be measured directly with infrared thermal imaging due to unequivalent emissivities of materials. It becomes necessary to use an absorptive ink to approximate a blackbody so that the infrared emissions can be used to infer temperature. The impact and effect of this thermal imaging technique is investigated in this work by comparing measurements with detailed thermal simulations with and without the surface treatment. Thermal analysis uses the finite element method and a reduced-order model based on cuboids with effective thermal conductivities. The end goal is to provide a simulation tool to designers, which can be extended to any project which requires attention to thermal preference.},
  booktitle = {Proceedings of the IEEE SoutheastCon 2010 (SoutheastCon)},
  author = {Harris, T. R. and Melamed, S. and Luniya, S. and Davis, W. R. and Steer, M. B. and Doxsee, L. E. and Obermiller, K. and Hawkinson, C.},
  year = {2010},
  pages = {37--40}
}

@article{davis_oh_sule_franzon_2009,
  title = {Application Exploration for {3-D} Integrated Circuits: {TCAM}, {FIFO}, and {FFT} Case Studies},
  volume = {17},
  ISSN = {1557-9999},
  DOI = {10.1109/TVLSI.2008.2009352},
  abstractNote = {3-D stacking and integration can provide system advantages. This paper explores application drivers and computer-aided design (CAD) for 3-D integrated circuits (ICs). Interconnect-rich applications especially benefit, sometimes up to the equivalent of two technology nodes. This paper presents physical-design case studies of ternary content-addressable memories (TCAMs), first-in first-out (FIFO) memories, and a 8192-point fast Fourier transform (FFT) processor in order to quantify the benefit of the through-silicon vias in an available 180-nm 3-D process.
The TCAM shows a 23% power reduction and the FFT shows a 22% reduction in cycle-time, coupled with an 18% reduction in energy per transform.}, number={4}, journal={IEEE TRANSACTIONS ON VERY LARGE SCALE INTEGRATION (VLSI) SYSTEMS}, author={Davis, W. Rhett and Oh, Eun Chu and Sule, Ambarish M. and Franzon, Paul D.}, year={2009}, month={Apr}, pages={496–506} } @article{hourani_jenkal_davis_alexander_2009, title={Automated Design Space Exploration for DSP Applications}, volume={56}, ISSN={["1939-8115"]}, DOI={10.1007/s11265-008-0226-2}, number={2-3}, journal={JOURNAL OF SIGNAL PROCESSING SYSTEMS FOR SIGNAL IMAGE AND VIDEO TECHNOLOGY}, author={Hourani, Ramsey and Jenkal, Ravi and Davis, W. Rhett and Alexander, Winser}, year={2009}, month={Sep}, pages={199–216} } @inproceedings{chen_davis_2009, title={Delay analysis and design exploration for 3D SRAM}, DOI={10.1109/3dic.2009.5306558}, abstractNote={The emerging three-dimension (3D) integration technology provides a solution to reduce delay in SRAM. In this paper, we present a physical based delay analysis approach to explore 3D SRAM design options. Our analysis can be used to optimize the 3D SRAM timing performance at both sub-array and system level. Design examples based on the MITLL 3D process are constructed to demonstrate the trade-offs. As the analysis results show, the optimized 3D sub-array provides up to 20% extra improvement for SRAM access time reduction.}, booktitle={2009 IEEE International Conference on 3d Systems Integration}, author={Chen, X. and Davis, W. R.}, year={2009}, pages={244–247} } @inproceedings{moezzi-madani_davis_2009, title={High-Throughput Low-Complexity MIMO Detector Based on K-Best Algorithm}, booktitle={GLSVLSI '09: Proceedings of the 19th ACM Great Lakes symposium on VLSI}, author={Moezzi-Madani, N. 
and Davis, W.R.}, year={2009}, pages={451–456} } @inproceedings{melamed_thorolfsson_srinivasan_cheng_franzon_davis_2009, title={Junction-level thermal extraction and simulation of 3DICs}, DOI={10.1109/3DIC.2009.5306529}, abstractNote={In 3DICs heat dissipating devices are stacked directly on top of each other leading to a higher heat density than in a comparable 2D chip. 3D integration also moves the majority of active devices further away from the heatsink. This results in a degraded thermal path which makes it more challenging to remove heat from the active devices. Gradient FireBolt was used to perform an appropriate 3D thermal analysis on a 1024-point, memory-on-logic 3DIC FFT processor for synthetic aperture radar (SAR). The chip was simulated with a spatial resolution of 80 nm, and was modeled to include the effect of each line of interconnect, as well as each via and fill structure exactly as drawn in the layout. Large isolated temperature spikes were found near groups of clock buffers at the edge of the SRAMs on the middle tier. It was found that lowering the simulation resolution and using composite thermal conductivities failed to accurately predict the location of these tentpoles.}, booktitle={2009 IEEE International Conference on 3d Systems Integration}, author={Melamed, S. and Thorolfsson, T. and Srinivasan, A. and Cheng, E. and Franzon, P. and Davis, R.}, year={2009}, pages={395–401} } @article{moezzi-madani_davis_2009, title={Parallel merge algorithm for high-throughput signal processing applications}, volume={45}, ISSN={["1350-911X"]}, DOI={10.1049/el:20092616}, abstractNote={A parallel merge algorithm is proposed that results in a smaller critical-path delay than all of the other merge algorithms explored. The parallel merge circuit effectively increases the throughput of VLSI signal processing systems. 
The critical path of this circuit is independent of the number of input values.}, number={3}, journal={ELECTRONICS LETTERS}, author={Moezzi-Madani, N. and Davis, W. R.}, year={2009}, month={Jan}, pages={188–189} } @misc{davis_2009, title={Prototyping in 3D-ICs: Design Flow Needs}, author={Davis, W.R.}, year={2009}, month={Oct} } @misc{mineo_davis_2009, title={Save Your Energy: A Fast and Accurate Approach to NoC Power Estimation}, author={Mineo, C. and Davis, W.R.}, year={2009}, month={Feb} } @misc{davis_2009, title={The Benefits of 3D Networks-on-Chip}, author={Davis, W.R.}, year={2009}, month={Dec} } @inproceedings{mineo_davis_2009, title={The Benefits of 3D networks-on-chip as shown with LDPC decoding}, DOI={10.1109/3dic.2009.5306585}, abstractNote={In this work we describe our network-on-chip (NoC) simulator, which fills the gap between architectural level and circuit level NoC simulation. The core is a fast, high level transaction-based NoC simulator, which accesses carefully compiled power, timing, and area models for basic NoC components built from detailed circuit simulation. It makes use of the architectural evaluator, which performs a detailed global interconnect analysis within the framework of industry-standard design tools. Using low density parity check decoding as a test vehicle, the NoC simulator is used in an NoC design study comparing 2D and 3D integrated circuits, and shows a method by which on-chip networks can be optimized.}, booktitle={2009 IEEE International Conference on 3d Systems Integration}, author={Mineo, C. and Davis, W. R.}, year={2009}, pages={89–96} } @inproceedings{davis_sule_franzon_2008, title={An 8192-point Fast Fourier Transform 3D-IC Case Study}, ISBN={9781424421664}, url={http://dx.doi.org/10.1109/mwscas.2008.4616830}, DOI={10.1109/MWSCAS.2008.4616830}, abstractNote={3D stacking and integration can provide system advantages. This paper explores an application driver for 3D ICs.
Interconnect-rich applications especially benefit, sometimes up to the equivalent of two technology nodes. Another promising application area is that of logic-on-memory. This paper presents a case study of an 8192-point fast Fourier transform (FFT) processor in order to quantify the benefit of the through-silicon vias in an available 180 nm 3D process. The FFT shows a 22% reduction in cycle-time, coupled with an 18% reduction in energy per transform.}, booktitle={2008 51st Midwest Symposium on Circuits and Systems}, publisher={IEEE}, author={Davis, W.R. and Sule, A.M. and Franzon, P.D.}, year={2008}, pages={438–441} } @misc{davis_2008, title={An Architecture Evaluator for Three-Dimensional Integrated Circuits}, author={Davis, W.R.}, year={2008}, month={May} } @inproceedings{hourani_dalal_davis_doss_alexander_2008, title={An Efficient VLSI Implementation for the 1D Convolutional Discrete Wavelet Transform}, DOI={10.1109/MWSCAS.2008.4616938}, abstractNote={This paper presents an efficient implementation of a convolution-based 1D discrete wavelet transform (DWT). The proposed architecture combines several optimizations that improve the performance of the hardware design in terms of throughput and power dissipation. We designed and analyzed the performance of numerous DWT architectures using pertinent metrics and cost functions that assess the impact of the design optimizations. We synthesized our VLSI architectures using a 0.18 μ standard cell library. The final VLSI design combines polyphase decimated FIR filters to reduce power dissipation, pipelined computational cells for higher throughput, and data-interleaving for lower chip area. An analytical comparison with other existing DWT implementations illustrates a two fold improvement in throughput for the proposed architecture.}, booktitle={2008 51st Midwest Symposium on Circuits and Systems}, author={Hourani, R. and Dalal, I. and Davis, W.R. and Doss, C.
and Alexander, W.}, year={2008} } @misc{davis_2008, title={Automation and Back-End Design within the FreePDK OpenAccess 45nm PDK and Cell Libraries for University Flows}, author={Davis, W.R.}, year={2008}, month={Jun} } @inproceedings{franzon_davis_steer_hua_lipa_luniya_mineo_oh_sule_thorolfsson_et al._2008, title={Computer-Aided Design and Application Exploration for 3D Integrated Circuits}, booktitle={Proceedings of the Government Microcircuit Applications \& Critical Technology (GOMACTech) Conference}, author={Franzon, P.D. and Davis, W.R. and Steer, M.B. and Hua, H. and Lipa, S. and Luniya, S. and Mineo, C. and Oh, J. and Sule, A. and Thorolfsson, T. and others}, year={2008} } @inproceedings{franzon_davis_steer_lipa_oh_thorolfsson_melamed_luniya_doxsee_berkeley_et al._2008, title={Design and CAD for 3D Integrated Circuits}, ISBN={9781605581156}, url={http://dx.doi.org/10.1145/1391469.1391642}, DOI={10.1145/1391469.1391642}, abstractNote={High density through silicon vias (TSV) can be used to build 3DICs that enable unique applications in computing, signal processing and memory intensive systems. This paper presents several case studies that are uniquely enhanced through 3D implementation, including a 3D CAM, an FFT processor, and a SAR processor. The CAD flow used to implement for these designs is described. 3DIC requires higher fidelity thermal modeling than 2DIC design. The rationale for this requirement is established and a possible solution is presented.}, booktitle={DAC '08: Proceedings of the 45th annual Design Automation Conference}, publisher={ACM Press}, author={Franzon, P.D. and Davis, W.R. and Steer, M.B. and Lipa, S. and Oh, E.C. and Thorolfsson, T. and Melamed, S. and Luniya, S. and Doxsee, T. and Berkeley, S.
and others}, year={2008}, month={Jun}, pages={668–673} } @misc{davis_2008, title={FreePDK: A Free OpenAccess 45nm PDK and Cell Library for Universities}, author={Davis, W.R.}, year={2008}, month={Oct} } @inproceedings{mineo_jenkal_melamed_davis_2008, title={Inter-Die Signaling in Three Dimensional Integrated Circuits}, DOI={10.1109/CICC.2008.4672171}, abstractNote={This work discusses a three dimensional network on chip (3D NoC) fabricated in the 0.18 μm MIT Lincoln Laboratories 3D FDSOI 1.5 V process. As a proof of concept, a three tier, 27 node, NoC test chip occupying 4 mm² per tier was designed and tested. It is the first of its kind to demonstrate successful inter-tier signaling in a complex three dimensional design, and validates the technology as a viable alternative to the continued scaling of conventional CMOS processes. Simulated results show that when implemented in this 3D process, simple 3D mesh interconnection networks allow for the sharing of global routing resources for complex systems while consuming an extremely low 2 mW of power per transaction. Using these results, we establish the need for a 3D network simulator to quantify the advantage 3D circuit implementations have over 2D.}, booktitle={2008 IEEE Custom Integrated Circuits Conference}, author={Mineo, C. and Jenkal, R. and Melamed, S. and Davis, W.R.}, year={2008} } @article{davis_sule_schoenfliess_2007, title={3D Interconnect Device Design: Theory vs. Reality}, number={23}, journal={Future Fab International}, author={Davis, W.R. and Sule, A.M. and Schoenfliess, K.M.}, year={2007}, pages={38–40} } @misc{davis_2007, title={3D-IC Design: Theory vs. Reality}, author={Davis, W.R.}, year={2007}, month={Jun} } @inproceedings{jenkal_davis_2007, title={An Architecture for Energy Efficient Sphere Decoding}, DOI={10.1145/1283780.1283833}, abstractNote={Sphere decoding has become a popular implementation of MIMO detection due to its improved performance at lower hardware complexity.
ASIC implementations have proven the feasibility of this method but fail to effectively address the issue of power efficiency. In this work, we propose an improved architecture that aims to exploit a combination of a deeper pipeline and the use of single-port read and write memories to increase the energy efficiency (bits/sec/mW) of the implementation. We see a 30% and 80% increase in memory and logic energy efficiencies when compared to an unpipelined version of the implementation in 0.18 μ technology.}, booktitle={ISLPED '07: Proceedings of the 2007 international symposium on Low power electronics and design}, author={Jenkal, R.S. and Davis, W.R.}, year={2007}, pages={244–249} } @inproceedings{franzon_davis_steer_hao_lipa_luniya_mineo_oh_sule_thorolfsson_et al._2007, title={Design for 3D Integration and Applications}, ISBN={1424414482 1424414490}, url={http://dx.doi.org/10.1109/issse.2007.4294463}, DOI={10.1109/ISSSE.2007.4294463}, abstractNote={3D stacking and integration can provide system advantages equivalent to up to two technology nodes of scaling. This paper explores application drivers and computer aided design (CAD) for 3D ICs.}, booktitle={2007 International Symposium on Signals, Systems and Electronics}, publisher={IEEE}, author={Franzon, Paul and Davis, William Rhett and Steer, Michael B. and Hao, Hua and Lipa, Steven and Luniya, Sonali and Mineo, Christopher and Oh, Julie and Sule, Ambarish and Thorolfsson, Thor and others}, year={2007}, pages={263–266} } @inproceedings{sule_davis_2007, title={Designing FIFO Buffers using 3DIC Technology}, booktitle={VLSI Multilevel Interconnection (VMIC) Conference}, author={Sule, A.M.
and Davis, W.R.}, year={2007}, pages={267–272} } @misc{davis_2007, title={Energy-Efficient Sphere Decoding (and other research efforts)}, author={Davis, W.R.}, year={2007}, month={Sep} } @inproceedings{davis_2007, title={FreePDK: An Open Source, OpenAccess Design Kit}, author={Davis, W.R.}, year={2007}, month={Nov} } @inproceedings{stine_castellanos_wood_henson_love_davis_franzon_bucher_basavarajaiah_oh_et al._2007, title={FreePDK: An Open-Source Variation-Aware Design Kit}, ISBN={076952849X}, url={http://dx.doi.org/10.1109/mse.2007.44}, DOI={10.1109/MSE.2007.44}, abstractNote={This paper discusses an open source, variation aware Process Design Kit (PDK), based on Scalable CMOS design rules, down to 45 nm, for use in VLSI research, education and small businesses. This kit includes all the necessary layout design rules and extraction command decks to capture layout dependent systematic variation and perform statistical circuit analysis. The kit also includes a standard cell and pad library with the necessary support files to enable full chip place and route and verification for System on Chip designs. Test chips designed with this PDK are designed in such a way so that they can be fabricated by fabrication facilities allowing validation of the design rules so that the rules may be used in future multi-project runs and design contests.}, booktitle={2007 IEEE International Conference on Microelectronic Systems Education (MSE'07)}, publisher={IEEE}, author={Stine, J.E. and Castellanos, I. and Wood, M. and Henson, J. and Love, F. and Davis, W.R. and Franzon, P.D. and Bucher, M. and Basavarajaiah, S. and Oh, J. and others}, year={2007} } @inproceedings{jenkal_hua_sule_davis_2006, title={Architecture for Energy Efficient Sphere Decoding}, DOI={10.1109/SOCC.2006.283895}, abstractNote={Sphere Decoding has become a popular implementation of MIMO decoding due to its improved performance at lower hardware complexity.
Present ASIC implementations fail to consider sources of pipelinability and parallelism in the algorithm to achieve reduced power. In this work, we provide a proposal and initial results for an improved architecture which aims to increase overall energy efficiency (b/s/mW) of the decoder. This improvement is based on a novel implementation which combines the use of a deeply pipelined data-path and "multi symbol vector" based approach to exploit the pipeline. Implementation in 0.18μ 1.8V CMOS technology provides an operational frequency of 128/230(retimed)MHz at 409 mW(DFF memory)/ 360 mW(realistic memory) and 3.44 sq.mm (DFF memory).}, booktitle={2006 IEEE International SOC Conference}, author={Jenkal, R.S. and Hua, H. and Sule, A. and Davis, W.R.}, year={2006}, pages={267–270} } @inbook{hourani_jenkal_davis_alexander_2006, title={Automated Architectural Exploration for Signal Processing Algorithms}, DOI={10.1109/SIPS.2006.352594}, abstractNote={This paper presents a design environment for efficiently generating application-specific intellectual property (IP) cores for system level signal processing algorithms. We present our view of a framework that combines common electronic design automation (EDA) tools to alleviate the designer from manually constructing the hardware models and analyzing their performance. We use our framework to efficiently implement design optimizations that improve the performance of the overall hardware architectures. Our framework is well suited for designers with a range of signal processing and hardware expertise. Our framework generates the dedicated IP cores and estimates the performance such as area, critical path delay, and latency within seconds. Parts of our framework also compare different hardware designs for various digital signal processing (DSP) algorithms and allows the designer to make architectural decisions earlier in the hardware design process. 
We use a GUI-based framework invoked from MATLAB to automatically build and analyze the hardware designs. Our framework generates efficient hardware designs described in SystemC and Verilog code, along with the performance metrics for each architecture. We illustrate the use of our framework by exploring and analyzing architectural variations of two case studies: finite impulse response (FIR) filters and adaptive channel equalizers.}, booktitle={2006 IEEE Workshop on Signal Processing Systems Design and Implementation}, author={Hourani, R. and Jenkal, R. and Davis, W.R. and Alexander, W.}, year={2006}, pages={274–279} } @inproceedings{davis_mineo_2006, title={Breaking Rent’s Rule: Opportunities for 3D Interconnect Networks}, booktitle={VLSI Multilevel Interconnection (VMIC) Conference}, author={Davis, W.R. and Mineo, C.}, year={2006}, pages={228–233} } @inproceedings{luniya_batty_caccamesi_garcia_christoffersen_melamed_davis_steer_2006, title={Compact Electrothermal Modeling of an X-band MMIC}, DOI={10.1109/MWSYM.2006.249698}, abstractNote={Compact electrothermal modeling of lumped electrical devices and compact thermal modeling of volumetric materials enables efficient electrothermal modeling of microwave circuits. The compact thermal model of the body of an X-band MMIC is based on analytical solutions of the heat diffusion equation in thermal sub-volumes. The model is accurate and captures thermal nonlinearities. The model considers complex MMIC features such as surface metallization and vias, as well as the mounting configurations including lead-frame, carrier, and printed circuit board. This is coupled with electrothermal models of transistors and of resistors. The models are incorporated in a multi-physics simulator that uses the same model in both transient and harmonic analysis of an X-band LNA MMIC. Simulations are validated with steady-state thermal measurements.}, booktitle={2006 IEEE MTT-S International Microwave Symposium Digest}, author={Luniya, S.
and Batty, W. and Caccamesi, V. and Garcia, M. and Christoffersen, C. and Melamed, S. and Davis, W.R. and Steer, M.}, year={2006}, month={Jun}, pages={651–654} } @misc{davis_2006, title={Demystifying 3D ICs: The Pros and Cons of Going Vertical}, author={Davis, W.R.}, year={2006}, month={Apr} } @inproceedings{hua_mineo_schoenfliess_sule_melamed_jenkal_davis_2006, title={Exploring Compromises among Timing, Power and Temperature in Three-Dimensional Integrated Circuits}, DOI={10.1145/1146909.1147161}, abstractNote={Three-dimensional integrated circuits (3DICs) have the potential to reduce interconnect lengths and improve digital system performance. However, heat removal is more difficult in 3DICs, and the higher temperatures increase delay and leakage power, potentially negating the performance improvement. Thermal vias can help to remove heat, but they create routing congestion, which also leads to longer interconnects. It is therefore very difficult to tell whether or not a particular system may benefit from 3D integration. In order to help understand this trade-off, physical design experiments were performed on a low-power and a high-performance design in an existing 3DIC technology. Each design was partitioned and routed with varying numbers of tiers and thermal-via densities. A thermal-analysis methodology is developed to predict the final performance. Results show that the lowest energy per operation and delay are achieved with 4 or 5 tiers. These results show a reduction in energy and delay of up to 27% and 20% compared to a traditional 2DIC approach. In addition, it is shown that thermal-vias offer no performance benefit for the low-power system and only marginal benefit for the high-performance system}, booktitle={DAC '06: Proceedings of the 43rd annual Design Automation Conference}, author={Hua, H. and Mineo, C. and Schoenfliess, K. and Sule, A. and Melamed, S. and Jenkal, R. 
and Davis, W.R.}, year={2006}, month={Jul}, pages={997–1002} } @inproceedings{hua_mineo_schoenfliess_sule_melamed_davis_2006, title={Performance Trend in Three-Dimensional Integrated Circuits}, DOI={10.1109/IITC.2006.1648642}, abstractNote={3DICs are motivated by the expectation of better performance over their 2D counterparts; however, non-idealities threaten to diminish the benefit of multiple tiers. Previous work has predicted the benefit of 3DICs, but have not taken into account the increased temperature and leakage power. This work develops an automated design flow with 2D CAD tools to design 3DICs with the MIT Lincoln Lab 0.18 μm three-tier fully depleted silicon on insulator (FDSOI) process (Suntharalingam et al., 2005). This flow uses carefully designed scripts to fill the gap between 2D methodologies and 3D designs. We examine wire-length, timing, clock skew, and total power dissipation, along with temperature, of two benchmark circuits implemented in both 2D and 3D integration. We then extend our observations to the 90nm and 45nm technology nodes with predictive technology model (PTM) and the BSIMSOI model. Experimental results show that the performance of 3DIC, even with the non-idealities, shows up to two-generation advantage over its 2D counterpart with only three tiers.}, booktitle={2006 International Interconnect Technology Conference}, author={Hua, H. and Mineo, C. and Schoenfliess, K. and Sule, A. and Melamed, S. and Davis, W.R.}, year={2006}, pages={45–47} } @misc{hourani_jenkal_davis_alexander_2006, title={Tool Integration for Signal Processing Architectural Exploration}, author={Hourani, R. and Jenkal, R. and Davis, R.
and Alexander, W.}, year={2006}, month={Apr} } @article{davis_wilson_mick_xu_hua_mineo_sule_steer_franzon_2005, title={Demystifying 3D ICs: The pros and cons of going vertical}, volume={22}, ISSN={["1558-1918"]}, DOI={10.1109/MDT.2005.136}, abstractNote={This article provides a practical introduction to the design trade-offs of the currently available 3D IC technology options. It begins with an overview of techniques, such as wire bonding, microbumps, through vias, and contactless interconnection, comparing them in terms of vertical density and practical limits to their use. We then present a high-level discussion of the pros and cons of 3D technologies, with an analysis relating the number of transistors on a chip to the vertical interconnect density using estimates based on Rent's rule. Next, we provide a more detailed design example of inductively coupled interconnects, with measured results of a system fabricated in a 0.35-μm technology and an analysis of misalignment and crosstalk tolerances. Lastly, we present a case study of a fast Fourier transform (FFT) placed and routed in a 0.18-μm through-via silicon-on-insulator (SOI) technology, comparing the 3D design to a traditional 2D approach in terms of wire length and critical-path delay.}, number={6}, journal={IEEE DESIGN \& TEST OF COMPUTERS}, author={Davis, WR and Wilson, J and Mick, S and Xu, M and Hua, H and Mineo, C and Sule, AM and Steer, M and Franzon, PD}, year={2005}, pages={498–510} } @inproceedings{davis_2005, title={OpenAccess Tools for 3D Integration}, author={Davis, W.R.}, year={2005}, month={Nov} } @inproceedings{hua_sule_mineo_davis_2005, title={Pre-route Net Classing for Crosstalk Avoidance}, author={Hua, H. and Sule, A. and Mineo, C.
and Davis, W.R.}, year={2005}, month={Sep} } @inproceedings{davis_hua_sule_mineo_melamed_steer_franzon_2005, title={Wire-Delay Reduction Analysis of a 3-Tier, 8-Point Fast Fourier Transform 3D-IC}, booktitle={VLSI Multilevel Interconnection (VMIC) Conference}, author={Davis, W.R. and Hua, H. and Sule, A. and Mineo, C. and Melamed, S. and Steer, M. and Franzon, P.D.}, year={2005}, month={Oct}, pages={474–479} } @inproceedings{davis_sule_hua_2004, title={Multi-Parameter Power Minimization of Synthesized Datapaths}, DOI={10.1109/ISVLSI.2004.1339523}, abstractNote={As processing technology continues to evolve, power minimization becomes more complex and crucial. Emerging technologies offer an array of different threshold voltages and gate oxide thicknesses. Along with choices of supply-voltage, parallelism, and pipelining, these options complicate the search for energy-optimal architectures. This paper explores the possibility of using convex optimization to solve the multi-parameter optimization problem and presents a case-study of an 8-bit multiply-accumulate block, which is optimized in 250nm and 70nm technologies.}, booktitle={IEEE Computer Society Annual Symposium on VLSI}, author={Davis, W.R. and Sule, A.M. and Hua, H.}, year={2004}, month={Feb}, pages={151–157} } @article{yeo_augsburger_davis_nikolić_2003, title={500 Mb/s Soft Output Viterbi Decoder}, volume={38}, DOI={10.1109/JSSC.2003.813250}, abstractNote={Two 8-state, 7-bit soft output Viterbi decoders matched to an EPR4 channel and a rate-8/9 convolutional code are implemented in 0.18µm CMOS technology. Architectural transformation of the add-compare-select structures and modification of the register exchange allow a high throughput with small area overhead. The 4mm² chip has been verified to decode at 500Mb/s with 1.8V supply.
These decoders are used with Turbo codes, which have been demonstrated to achieve information rates very close to the Shannon limit.}, number={7}, journal={IEEE Journal of Solid State Circuits}, author={Yeo, E. and Augsburger, S. and Davis, W.R. and Nikolić, B.}, year={2003}, month={Jul}, pages={1234–1241} } @inproceedings{davis_2003, title={Automated Design Flows for High-Performance Systems}, author={Davis, W.R.}, year={2003}, month={Feb} } @inproceedings{davis_2003, title={Getting High-Performance Silicon from System-Level Design}, DOI={10.1109/ISVLSI.2003.1183482}, abstractNote={System-level design techniques promise a way to lessen the productivity gap between fabrication and design. Unfortunately, these techniques have been slow to catch on, in part because they do little to help designers optimize hardware. This paper presents a brief summary of three system-level design techniques. Platform-based design, SystemC, and Chip-in-a-day, in order to propose that more system-level abstraction of physical performance is needed to make these techniques more useful. An analysis of design-productivity for three chips designed with the Chip-in-a-Day flow is also presented.}, booktitle={IEEE Computer Society Annual Symposium on VLSI, 2003. Proceedings.}, author={Davis, W.R.}, year={2003}, month={Feb}, pages={238–243} } @inbook{kuusilinna_chang_bluethgen_davis_richards_nikolić_brodersen_2003, place={Boston, MA}, title={Real-Time System-on-a-Chip Emulation}, DOI={10.1007/978-1-4615-0369-9_10}, booktitle={Winning the SoC Revolution}, publisher={Springer}, author={Kuusilinna, K. and Chang, C. and Bluethgen, H.M. and Davis, W.R. and Richards, B. and Nikolić, B. 
and Brodersen, R.W.}, editor={Martin, Grant and Chang, Henry}, year={2003}, month={May}, pages={229–253} } @misc{davis_2003, title={System-Level Design: Past, Present, and Future}, author={Davis, W.R.}, year={2003}, month={Feb} } @inproceedings{yeo_augsburger_davis_nikolić_2002, title={500 Mb/s Soft Output Viterbi Decoder}, booktitle={European Solid-State Circuits Conference (ESSCIRC)}, author={Yeo, E. and Augsburger, S. and Davis, W.R. and Nikolić, B.}, year={2002}, month={Sep}, pages={523–526} } @article{davis_zhang_camera_marković_smilkstein_ammer_yeo_augsburger_nikolić_brodersen_2002, title={A Design Environment for High-Throughput, Low-Power Dedicated Signal Processing Systems}, volume={37}, DOI={10.1109/4.987095}, abstractNote={A hierarchical automated design flow for low-energy direct-mapped signal processing integrated circuits is presented. A modular framework based on a combined dataflow graph and floorplan description drives automatic layout generation with commercial CAD tools. Automatic characterization of layout improves system-level estimates. Simplified physical design methodologies for low supply voltages are discussed. The flow is demonstrated on a 300-k transistor test-chip, a time-division multiple-access baseband receiver, and a soft-output Viterbi decoder. An example of architectural comparison of energy efficiency is presented.}, number={3}, journal={IEEE Journal of Solid State Circuits}, author={Davis, W.R. and Zhang, N. and Camera, K. and Marković, D. and Smilkstein, T. and Ammer, M.J. and Yeo, E. and Augsburger, S. and Nikolić, B.
and Brodersen, R.W.}, year={2002}, month={Mar}, pages={420–431} } @phdthesis{davis_2002, place={Berkeley}, title={A Hierarchical, Automated Design Flow for Low-Power, High-Throughput Digital Signal Processing IC’s}, school={Electrical Engineering Department, University of California}, author={Davis, W.R.}, year={2002} } @inproceedings{yeo_augsburger_davis_nikolić_2002, title={Implementation of high throughput soft output Viterbi decoders}, DOI={10.1109/SIPS.2002.1049700}, abstractNote={The architectural considerations for VLSI implementations of soft output Viterbi decoders are presented. Structural transformation of the add-compare-select structures provides high throughput with small area overhead. Modifications to the survivor memory unit and a comparison between the register exchange and memory traceback methods are highlighted. A 4 mm² demonstration chip, consisting of two parallel, 8-state, 7-bit soft output Viterbi decoders, has been implemented in 0.18 μm CMOS technology, and decodes at 500 Mb/s with 1.8 V supply. These decoders are used with turbo codes, which have been demonstrated to achieve information rates close to the Shannon limit.}, booktitle={IEEE Workshop on Signal Processing Systems}, author={Yeo, E. and Augsburger, S. and Davis, W.R. and Nikolić, B.}, year={2002}, month={Oct}, pages={146–151} } @inproceedings{davis_zhang_camera_chen_marković_chan_nikolic_brodersen_2001, title={A Design Environment for High Throughput, Low Power Dedicated Signal Processing Systems}, DOI={10.1109/CICC.2001.929839}, abstractNote={A hierarchical automated design flow for low-energy direct-mapped signal processing integrated circuits is presented. A modular framework based on a combined Simulink and floorplan description drives automatic layout generation. Automatic characterization of layout improves system-level estimates.
The flow is demonstrated on the subsystems of CDMA and OFDM receivers and a 300 k transistor test-chip.}, booktitle={Proceedings of the IEEE 2001 Custom Integrated Circuits Conference}, author={Davis, W.R. and Zhang, N. and Camera, K. and Chen, F. and Marković, D. and Chan, N. and Nikolic, B. and Brodersen, R.W.}, year={2001}, month={May}, pages={545–548} } @inproceedings{davis_zhang_camera_marković_smilkstein_chan_ammer_yeo_nikolić_brodersen_2001, title={An Automated Design Flow for Low-Power, High-Throughput Dedicated Signal Processing Systems}, DOI={10.1109/ACSSC.2001.986971}, abstractNote={A system-level perspective of a hierarchical automated design flow for low-energy direct-mapped signal processing integrated circuits is presented. Capturing design decisions in a data flow graph allows push-button automation of layout and performance estimation. A detailed example of the design process for a DS SS TDMA baseband receiver is presented.}, booktitle={Conference Record of Thirty-Fifth Asilomar Conference on Signals, Systems and Computers}, author={Davis, W.R. and Zhang, N. and Camera, K. and Marković, D. and Smilkstein, T. and Chan, N. and Ammer, M.J. and Yeo, E. and Nikolić, B. and Brodersen, R.W.}, year={2001}, month={Nov}, pages={475–480} } @misc{davis_2001, title={An Automated Design Flow for Low-Power, High-Throughput Dedicated Signal Processing Systems}, author={Davis, W.R.}, year={2001}, month={Dec} } @misc{davis_2001, title={Design Technology for Low Power Radio Systems}, author={Davis, W.R.}, year={2001}, month={Sep} } @inproceedings{brodersen_davis_yee_zhang_2001, title={Wireless systems-on-a-chip design}, DOI={10.1109/VTSA.2001.934479}, abstractNote={There is a fundamental shift that is occurring in the implementation of wireless systems. Not only is the underlying technology shifting to mainstream CMOS technology, but the applications and specifications of the supported links is also rapidly evolving. 
These two trends result in radical shifts in the radio system architectures, which ranges from the implementation issues associated with the analog RF circuitry and the digital baseband processing to the basic techniques for dealing with multi-access and the impairments of the channel. All of these design issues are driven by an ever-widening range of requirements from the high bandwidth needs of multimedia Internet access to the ultra low power needs of sensor data networks. The multiple inter-related technologies required for implementation of such wireless system requires a co-design strategy in communication algorithms, protocols, digital architectures as well the analog and digital circuits required for their implementation. A design infrastructure which achieves this is described, which has a particular emphasis on methods for high level specification and estimation, that provides a fully automated chip design flow.}, booktitle={2001 International Symposium on VLSI Technology, Systems, and Applications. Proceedings of Technical Papers}, author={Brodersen, R.W. and Davis, W.R. and Yee, D. and Zhang, N.}, year={2001}, pages={45–48} }