@comment{
  Conventions used in this file:
  - one field per line, lowercase field names, brace-delimited values;
  - page ranges use an ASCII double hyphen, months use the unquoted three-letter macros;
  - acronyms and proper nouns in titles are brace-protected against style recasing;
  - abstractNote is a non-standard field kept for reference only and is ignored by
    bibliography styles, so percent signs inside it are left unescaped.
}

@article{kim_liu_2010,
  author  = {Kim, T. and Liu, X.},
  title   = {A functional unit and register binding algorithm for interconnect reduction},
  journal = {{IEEE} Transactions on Computer-Aided Design of Integrated Circuits and Systems},
  volume  = {29},
  number  = {4},
  pages   = {641--646},
  year    = {2010},
}

@inproceedings{choi_liu_2010,
  author    = {Choi, W. H. and Liu, X.},
  title     = {Case study: {GPU}-based implementation of sequence pair based floorplanning using {CUDA}},
  booktitle = {2010 {IEEE} International Symposium on Circuits and Systems},
  pages     = {917--920},
  year      = {2010},
}

@article{yu_liu_2009,
  author  = {Yu, Zhengtao and Liu, Xun},
  title   = {Implementing Multiphase Resonant Clocking on a Finite-Impulse Response Filter},
  journal = {{IEEE} Transactions on Very Large Scale Integration ({VLSI}) Systems},
  volume  = {17},
  number  = {11},
  pages   = {1593--1601},
  year    = {2009},
  month   = nov,
  issn    = {1557-9999},
  doi     = {10.1109/TVLSI.2008.2006477},
  abstractNote = {Rotary clock is a resonant clocking technique that delivers on-chip clock signal distribution with very low power dissipation. Since it can only generate clock signals with multiple phases that are spatially distributed, rotary clock is often considered not applicable to industrial very large scale integration (VLSI) designs. This paper presents the first rotary-clock-based nontrivial digital circuit. Our design, a low-power and high-speed finite-impulse response (FIR) filter, is fully digital and generated using CMOS standard cells in 0.18 $\mu$m technology. We have shown that the proposed FIR filter is seamlessly integrated with the rotary clock technique. It uses the spatially distributed multiple clock phases of rotary clock and achieves high power savings. Simulation results demonstrate that our rotary-clock-based FIR filter can operate successfully at 610 MHz, providing a throughput of 39 Gb/s. In comparison with the conventional clock-tree-based design, our design achieves a 34.6% clocking power saving and a 12.8% overall circuit power saving. In addition, the peak current consumed by the rotary-clock-based filter is substantially lower by 40% on the average. Our study makes the crucial step toward the application of rotary clock technique to a broad range of VLSI designs.},
}

@article{yu_liu_2007,
  author  = {Yu, Zhengtao and Liu, Xun},
  title   = {Low-power rotary clock array design},
  journal = {{IEEE} Transactions on Very Large Scale Integration ({VLSI}) Systems},
  volume  = {15},
  number  = {1},
  pages   = {5--12},
  year    = {2007},
  month   = jan,
  issn    = {1557-9999},
  doi     = {10.1109/TVLSI.2006.887804},
  abstractNote = {Rotary clock is a recently proposed clock distribution technique based on wave propagation in transmission lines. In this paper, we investigate the problem of power minimization of rotary clock designs. Specifically, we have developed a software tool based on the method of partial element equivalent circuit that is capable of extracting the SPICE netlist from the layout specification of a rotary clock design. Using our tool, we have performed extensive analysis that links various design parameters of a rotary clock design to its oscillation frequency and power dissipation. Based on the results of our analysis, we then propose a power minimization algorithm. Our algorithm derives a rotary clock structure that dissipates the minimal power while satisfying the clock dimension requirement and oscillating at the target frequency with the given clock load. Experimental results have demonstrated that, for target operating frequencies ranging from 0.5 to 5 GHz, rotary clock designs can achieve power savings of up to 80% in comparison with conventional clock tree implementations.},
}

@article{peng_liu_2006,
  author  = {Peng, Yuantao and Liu, Xun},
  title   = {An efficient low-power repeater-insertion scheme},
  journal = {{IEEE} Transactions on Computer-Aided Design of Integrated Circuits and Systems},
  volume  = {25},
  number  = {12},
  pages   = {2726--2736},
  year    = {2006},
  month   = dec,
  issn    = {1937-4151},
  doi     = {10.1109/TCAD.2006.882601},
  abstractNote = {Previous repeater-insertion algorithms for power minimization of realistic interconnect trees under given timing budgets are often time consuming. In this paper, the problem of runtime reduction for low-power repeater insertion is investigated. Specifically, a power-sensitivity analysis that links the algorithm runtime and the power dissipation result to the selection of repeater library and candidate repeater locations is performed. Based on the analysis, possible repeater locations and potential repeater widths are selected to increase the efficiency of the low-power repeater-insertion algorithm, achieving a judicious tradeoff between runtime and power savings. Moreover, a novel repeater-insertion algorithm based on the Lagrangian relaxation framework is proposed. The proposed algorithm combines a local optimizer based on the dynamic programming (DP) technique and a fast global search engine using the "ellipsoid method." As a result, the proposed approach is capable of producing high-quality solutions at a very fast speed and without manual tuning of the algorithm parameters. A repeater-insertion tool called Freeze, which uses the proposed algorithm, is developed and applied to various interconnect trees with different timing targets. Experimental results demonstrate the high effectiveness of the proposed approach. In comparison with the state-of-the-art low-power repeater-insertion schemes, Freeze requires 5.8 times fewer iterations on the average, achieving a speedup of up to 9.1 times with even better power savings. When compared with a DP-based scheme, which guarantees the optimal solution, the proposed tool delivers a speedup of up to 14.6 times with less than 2% power increase on the average.},
}

@article{liu_papaefthymiou_2005,
  author  = {Liu, Xun and Papaefthymiou, M. C.},
  title   = {{HyPE}: Hybrid power estimation for {IP}-based systems-on-chip},
  journal = {{IEEE} Transactions on Computer-Aided Design of Integrated Circuits and Systems},
  volume  = {24},
  number  = {7},
  pages   = {1089--1103},
  year    = {2005},
  month   = jul,
  issn    = {1937-4151},
  doi     = {10.1109/TCAD.2005.850891},
  abstractNote = {In this paper, we present a novel power estimation scheme for programmable systems consisting of predesigned datapath and memory components. The proposed hybrid methodology yields highly accurate estimates within short runtimes by combining high-level simulation with analytical macromodeling of circuit characteristics. The kernel of our methodology is a simulation-free power estimation scheme for memoryless datapaths comprising several IP blocks connected in fixed topologies. The outer shell of our hybrid scheme is a functional simulation, which is performed only on the interfaces between memoryless components and memory blocks. This simulation accurately captures the control signals that affect the flow of data and, consequently, the utilization and power dissipation of hardware. Experimental results validate the accuracy and efficiency of our methodology. We applied our static power estimation kernel to signal processing and data encryption datapaths. For designs of up to 576 IP blocks, the average error of our power estimates is 7.3% in comparison with switch-level simulation results. We implemented our hybrid scheme into a power estimation tool, called HYPE, and used it to explore various architectural alternatives in the design of a 256-state Viterbi decoder and a Rijndael encryptor. For designs with about 1 million transistors, our estimator terminates within seconds. Compared with commercial state-of-the-art gate-level power estimators, our proposed methodology is up to 1000 times faster with 5.4% deviation on average.},
}

@comment{
  NOTE(review) for liu_peng_papaefthyrniou_2005:
  - The citation key keeps the original misspelling so existing citations still
    resolve; only the author field was corrected to Papaefthymiou.
  - volume 25 is paired with year 2005 here, while other entries in this file
    pair volume 24 with 2005 and volume 25 with 2006. Verify year/volume
    against the publisher record for the DOI before relying on this entry.
}

@article{liu_peng_papaefthyrniou_2005,
  author  = {Liu, Xun and Peng, Yuantao and Papaefthymiou, M. C.},
  title   = {Practical repeater insertion for low power: What repeater library do we need?},
  journal = {{IEEE} Transactions on Computer-Aided Design of Integrated Circuits and Systems},
  volume  = {25},
  number  = {5},
  pages   = {917--924},
  year    = {2005},
  month   = may,
  issn    = {1937-4151},
  doi     = {10.1109/TCAD.2005.855968},
}

@article{liu_papaefthymiou_2004,
  author  = {Liu, Xun and Papaefthymiou, M. C.},
  title   = {A {Markov} chain sequence generator for power macromodeling},
  journal = {{IEEE} Transactions on Computer-Aided Design of Integrated Circuits and Systems},
  volume  = {23},
  number  = {7},
  pages   = {1048--1062},
  year    = {2004},
  month   = jul,
  issn    = {1937-4151},
  doi     = {10.1109/TCAD.2004.829819},
  abstractNote = {In this paper, we present a novel sequence generator based on a Markov chain (MC) model. Specifically, we formulate the problem of generating a sequence of vectors with given average input probability p, average transition density d, and spatial correlation s as a transition matrix computation problem, in which the matrix elements are subject to constraints derived from the specified statistics. We also give a practical heuristic that computes such a matrix and generates a sequence of l n-bit vectors in $O(nl+n^{2})$ time. Derived from a strongly mixing MC, our generator yields binary vector sequences with accurate statistics, high uniformity, and high randomness. Experimental results show that our sequence generator can cover more than 99% of the parameter space. Sequences of 2000 48-bit vectors are generated in less than 0.05 s, with average deviations of the signal statistics p, d, and s equal to 1.6%, 1.8%, and 2.8%, respectively. Our generator enables the detailed study of power macromodeling. Using our tool and the ISCAS'85 benchmark circuits, we have assessed the sensitivity of power dissipation to the three input statistics p, d, and s. Our investigation reveals that power is most sensitive to transition density, while only occasionally exhibiting high sensitivity to signal probability and spatial correlation. Our experiments also show that input signal imbalance can cause estimation errors as high as 100% in extreme cases, although errors are usually within 25%.},
}

@article{liu_papaefthymiou_2003,
  author  = {Liu, Xun and Papaefthymiou, M. C.},
  title   = {Design of a 20-{Mb/s} 256-state {Viterbi} decoder},
  journal = {{IEEE} Transactions on Very Large Scale Integration ({VLSI}) Systems},
  volume  = {11},
  number  = {6},
  pages   = {965--975},
  year    = {2003},
  month   = dec,
  issn    = {1557-9999},
  doi     = {10.1109/TVLSI.2003.817547},
  abstractNote = {The design of high-throughput large-state Viterbi decoders relies on the use of multiple arithmetic units. The global communication channels among these parallel processors often consist of long interconnect wires, resulting in large area and high power consumption. In this paper, we propose a data transfer oriented design methodology to implement a low-power 256-state rate-1/3 Viterbi decoder. Our architectural level scheme uses operation partitioning, packing, and scheduling to analyze and optimize interconnect effects in early design stages. In comparison with other published Viterbi decoders, our approach reduces the global data transfers by up to 75% and decreases the amount of global buses by up to 48%, while enabling the use of deeply pipelined datapaths with no data forwarding. In the register-transfer level (RTL) implementation, we apply precomputation in conjunction with saturation arithmetic to further reduce power dissipation with provably no coding performance degradation. Designed using a 0.25 $\mu$m standard cell library, our decoder achieves a throughput of 20 Mb/s in simulation and dissipates only 0.45 W.},
}