@inbook{bryan_conte_2007, title={Combining cluster sampling with single pass methods for efficient sampling regimen design}, ISBN={9781424412570}, booktitle={2007 IEEE International Conference on Computer Design}, publisher={New York: IEEE}, author={Bryan, P. D. and Conte, T. M.}, year={2007}, pages={472–479} } @article{huiyang_conte_2005, title={Enhancing memory-level parallelism via recovery-free value prediction}, volume={54}, DOI={10.1109/tc.2005.117}, abstractNote={The ever-increasing computational power of contemporary microprocessors reduces the execution time spent on arithmetic computations (i.e., the computations not involving slow memory operations such as cache misses) significantly. Therefore, for memory-intensive workloads, it becomes more important to overlap multiple cache misses than to overlap slow memory operations with other computations. In this paper, we propose a novel technique to parallelize sequential cache misses, thereby increasing memory-level parallelism (MLP). Our idea is based on value prediction, which was proposed originally as an instruction-level parallelism (ILP) optimization to break true data dependencies. In this paper, we advocate value prediction in its capability to enhance MLP instead of ILP. We propose using value prediction and value-speculative execution only for prefetching so that not only the complex prediction validation and misprediction recovery mechanisms are avoided, but better performance can also be achieved for memory-intensive workloads. The minor hardware modifications that are required also enable aggressive memory disambiguation for prefetching. The experimental results show that our technique enhances MLP effectively and achieves significant speedups, even with a simple stride value predictor.}, number={7}, journal={IEEE Transactions on Computers}, author={Zhou, Huiyang and Conte, T. M.}, year={2005}, pages={897–912} } @article{ozer_conte_2005, title={High-performance and low-cost dual-thread VLIW processor using weld architecture paradigm}, volume={16}, ISSN={["1045-9219"]}, DOI={10.1109/TPDS.2005.150}, abstractNote={This paper presents a cost-effective and high-performance dual-thread VLIW processor model. The dual-thread VLIW processor model is a low-cost subset of the Weld architecture paradigm. It supports one main thread and one speculative thread running simultaneously in a VLIW processor with a register file and a fetch unit per thread along with memory disambiguation hardware for speculative load and store operations. This paper analyzes the performance impact of the dual-thread VLIW processor, which includes analysis of migrating disambiguation hardware for speculative load operations to the compiler and of the sensitivity of the model to the variation of branch misprediction, second-level cache miss penalties, and register file copy time. Up to 34 percent improvement in performance can be attained using the dual-thread VLIW processor when compared to a single-threaded VLIW processor model.}, number={12}, journal={IEEE TRANSACTIONS ON PARALLEL AND DISTRIBUTED SYSTEMS}, author={Ozer, E and Conte, TM}, year={2005}, month={Dec}, pages={1132–1142} } @article{mehrotra_rao_conte_franzon_2005, title={Optimal chip-package codesign for high-performance DSP}, volume={28}, ISSN={["1521-3323"]}, DOI={10.1109/TADVP.2005.846937}, abstractNote={In high-performance DSP systems, the memory bandwidth can be improved using high-density interconnect technology and appropriate memory mapping.
High-density MCM and flip-chip solder bump technology is used to achieve a system with an I/O bandwidth of 100 Gb/s/cm² of die. The use of DRAMs usually makes the performance of these systems poor, and some algorithms make it difficult to fully utilize the available memory bandwidth. This paper presents the design of a fast Fourier transform (FFT) engine that gives SRAM-like performance in a DRAM-based system. It uses almost 100% of the available burst-mode memory bandwidth. This FFT engine can compute a million-point FFT in 1.31 ms at a sustained computation rate of 8.64 × 10^10 floating-point operations per second (FLOPS). This is at least an order of magnitude better than conventional systems.}, number={2}, journal={IEEE TRANSACTIONS ON ADVANCED PACKAGING}, author={Mehrotra, P and Rao, V and Conte, TM and Franzon, PD}, year={2005}, month={May}, pages={288–297} } @article{bechini_conte_prete_2004, title={Opportunities and challenges in embedded systems}, volume={24}, number={4}, journal={IEEE Micro}, author={Bechini, A. and Conte, T. M. and Prete, C. A.}, year={2004}, pages={8–9} } @article{huiyang_toburen_rotenberg_conte_2003, title={Adaptive mode control: A static-power-efficient cache design}, volume={2}, DOI={10.1145/860176.860181}, abstractNote={Lower threshold voltages in deep submicron technologies cause more leakage current, increasing static power dissipation. This trend, combined with the trend of larger/more cache memories dominating die area, has prompted circuit designers to develop SRAM cells with low-leakage operating modes (e.g., sleep mode). Sleep mode reduces static power dissipation, but data stored in a sleeping cell is unreliable or lost. So, at the architecture level, there is interest in exploiting sleep mode to reduce static power dissipation while maintaining high performance. Current approaches dynamically control the operating mode of large groups of cache lines or even individual cache lines. However, the performance monitoring mechanism that controls the percentage of sleep-mode lines, and identifies particular lines for sleep mode, is somewhat arbitrary. There is no way to know what the performance could be with all cache lines active, so arbitrary miss rate targets are set (perhaps on a per-benchmark basis using profile information), and the control mechanism tracks these targets. We propose applying sleep mode only to the data store and not the tag store. By keeping the entire tag store active the hardware knows what the hypothetical miss rate would be if all data lines were active, and the actual miss rate can be made to precisely track it. Simulations show that an average of 73% of I-cache lines and 54% of D-cache lines are put in sleep mode with an average IPC impact of only 1.7%, for 64 KB caches.}, number={3}, journal={ACM Transactions on Embedded Computing Systems}, author={Zhou, Huiyang and Toburen, M. C. and Rotenberg, E. and Conte, T. M.}, year={2003}, pages={347–372} } @article{fu_bodine_conte_2003, title={Modeling value speculation: An optimal edge selection problem}, volume={52}, ISSN={["0018-9340"]}, DOI={10.1109/TC.2003.1183944}, abstractNote={Techniques for value speculation have been proposed for dynamically scheduled and statically scheduled machines to increase instruction-level parallelism (ILP) by breaking flow (true) dependences and allowing value-dependent operations to be executed speculatively.
The effectiveness of value speculation depends upon the ability to select and break dependences to shorten overall execution time, while encountering penalties for value misprediction. To understand and improve the techniques for value speculation, we model value speculation as an optimal edge selection problem. The optimal edge selection problem involves finding a minimal set of edges (dependences) to break in a data dependence graph that achieves maximal benefits from value speculation, while taking the penalties for value misprediction into account. Based on three properties observed from the optimal edge selection problem, an efficient optimal edge selection algorithm is designed. From the experimental results of running the optimal edge selection algorithm for the 20 most heavily executed paths selected from each SPECint95 benchmark, several insights are shown. The average critical path reduction is 9.61 percent on average and 25.57 percent at its maximum. Surprisingly, 66 percent of the edges selected by the optimal algorithm have value prediction accuracies over 99 percent. Moreover, most of the selected edges cross the middle of the data dependence graph. The selected producer operations thereby tend to reside in the upper portion of the data dependence graph, while the selected consumer operations appear toward the lower portion.}, number={3}, journal={IEEE TRANSACTIONS ON COMPUTERS}, author={Fu, CY and Bodine, JT and Conte, TM}, year={2003}, month={Mar}, pages={277–292} } @inbook{zhou_jennings_conte_2003, title={Tree Traversal Scheduling: A Global Instruction Scheduling Technique for VLIW/EPIC Processors}, volume={2624}, ISBN={9783540040293 9783540357674}, ISSN={0302-9743}, url={http://dx.doi.org/10.1007/3-540-35767-x_15}, DOI={10.1007/3-540-35767-x_15}, abstractNote={Global scheduling in a treegion framework has been proposed to exploit instruction level parallelism (ILP) at compile time. A treegion is a single-entry/multiple-exit global scheduling scope that consists of basic blocks with control flow forming a tree. Because a treegion scope is nonlinear (includes multiple paths), it is distinguished from linear scopes such as traces or superblocks. Treegion scheduling has the capability of speeding up all possible paths within the scheduling scope. This paper presents a new global scheduling algorithm using treegions called Tree Traversal Scheduling (TTS). Efficient, incremental data-flow analysis in support of TTS is also presented. Performance results are compared to the scheduling of the linear regions that result from the decomposition of treegions. We refer to these resultant linear regions as linear treegions (LT) and consider them analogous to superblocks with the same amount of code expansion as the base treegion. Experimental results for TTS scheduling show a 35% speedup compared to basic block (BB) scheduling and a 4% speedup compared to LT scheduling.}, booktitle={Languages and Compilers for Parallel Computing}, publisher={Springer Berlin Heidelberg}, author={Zhou, Huiyang and Jennings, Matthew D. and Conte, Thomas M.}, year={2003}, pages={223–238} } @article{conte_2002, title={Choosing the brain(s) of an embedded system}, volume={35}, ISSN={["0018-9162"]}, DOI={10.1109/MC.2002.1016908}, abstractNote={Embedded processors are fundamentally different from desktop processors: costs are too tight for fancy chip sets and expensive packaging. So if you're new to embedded processors, the marketplace is foreign. Worse, so are the design decisions.
When you consider purchasing an embedded microprocessor, look carefully at the direct memory access engine.}, number={7}, journal={COMPUTER}, author={Conte, TM}, year={2002}, month={Jul}, pages={106–107} } @inproceedings{huiyang_toburen_rotenberg_conte_2001, title={Adaptive mode control: A static-power-efficient cache design}, ISBN={0769513638}, DOI={10.1109/pact.2001.953288}, abstractNote={Lower threshold voltages in deep sub-micron technologies cause more leakage current, increasing static power dissipation. This trend, combined with the trend of larger/more cache memories dominating die area, has prompted circuit designers to develop SRAM cells with low-leakage operating modes (e.g., sleep mode). Sleep mode reduces static power dissipation but data stored in a sleeping cell is unreliable or lost. So, at the architecture level, there is interest in exploiting sleep mode to reduce static power dissipation while maintaining high performance. Current approaches dynamically control the operating mode of large groups of cache lines or even individual cache lines. However, the performance monitoring mechanism that controls the percentage of sleep-mode lines, and identifies particular lines for sleep mode, is somewhat arbitrary. There is no way to know what the performance could be with all cache lines active, so arbitrary miss rate targets are set (perhaps on a per-benchmark basis using profile information) and the control mechanism tracks these targets. We propose applying sleep mode only to the data store and not the tag store. By keeping the entire tag store active, the hardware knows what the hypothetical miss rate would be if all data lines were active and the actual miss rate can be made to precisely track it. Simulations show an average of 73% of I-cache lines and 54% of D-cache lines are put in sleep mode with an average IPC impact of only 1.7%, for 64KB caches.}, booktitle={2001 International Conference on Parallel Architectures and Compilation Techniques: Proceedings: 8-12 September, 2001, Barcelona, Catalunya, Spain}, publisher={Los Alamitos, CA: IEEE Computer Society}, author={Zhou, Huiyang and Toburen, M. C. and Rotenberg, E. and Conte, T. M.}, year={2001}, pages={61–70} } @article{conte_sathaye_2000, title={Properties of rescheduling size invariance for dynamic rescheduling-based VLIW cross-generation compatibility}, volume={49}, ISSN={["0018-9340"]}, DOI={10.1109/12.868027}, abstractNote={The object-code compatibility problem in VLIW architectures stems from their statically scheduled nature. Dynamic rescheduling (DR) is a technique to solve the compatibility problem in VLIWs. DR reschedules program code pages at first-time page faults, i.e., when the code pages are accessed for the first time during execution. Treating a page of code as the unit of rescheduling makes it susceptible to the hazards of changes in the page size during the process of rescheduling. This paper shows that the changes in the page size are only due to insertion and/or deletion of NOPs in the code. Further, it presents an ISA encoding, called list encoding, which does not require explicit encoding of the NOPs in the code.
Algorithms to perform rescheduling on acyclic code and cyclic code are presented, followed by a discussion of the property of rescheduling-size invariance (RSI) satisfied by list encoding.}, number={8}, journal={IEEE TRANSACTIONS ON COMPUTERS}, author={Conte, TM and Sathaye, S}, year={2000}, month={Aug}, pages={814–825} } @article{conte_menezes_sathaye_toburen_2000, title={System-level power consumption modeling and tradeoff analysis techniques for superscalar processor design}, volume={8}, ISSN={["1063-8210"]}, DOI={10.1109/92.831433}, abstractNote={This paper presents systematic techniques to find low-power high-performance superscalar processors tailored to specific user applications. The model of power is novel because it separates power into architectural and technology components. The architectural component is found via trace-driven simulation, which also produces performance estimates. An example technology model is presented that estimates the technology component, along with critical delay time and real estate usage. This model is based on case studies of actual designs. It is used to solve an important problem: decreasing power consumption in a superscalar processor without greatly impacting performance. Results are presented from runs using simulated annealing to reduce power consumption subject to performance reduction bounds. The major contributions of this paper are the separation of architectural and technology components of dynamic power, the use of trace-driven simulation for architectural power measurement, and the use of a near-optimal search to tailor a processor design to a benchmark.}, number={2}, journal={IEEE TRANSACTIONS ON VERY LARGE SCALE INTEGRATION (VLSI) SYSTEMS}, author={Conte, TM and Menezes, KN and Sathaye, SW and Toburen, MC}, year={2000}, month={Apr}, pages={129–137} } @article{conte_hwu_smotherman_1999a, title={30th Annual ACM/IEEE International Symposium on Microarchitecture, Part II - Editors' Introduction}, volume={27}, ISSN={["0885-7458"]}, DOI={10.1023/A:1018739115760}, number={6}, journal={INTERNATIONAL JOURNAL OF PARALLEL PROGRAMMING}, author={Conte, T and Hwu, WM and Smotherman, M}, year={1999}, month={Dec}, pages={425–426} } @article{bose_conte_austin_1999, title={Challenges in processor modeling and validation}, volume={19}, ISSN={["1937-4143"]}, DOI={10.1109/MM.1999.768495}, abstractNote={The methodology for designing state-of-the-art microprocessors involves modeling at various levels of abstraction. In the pre-synthesis phase, this can range from early-stage (microarchitectural) performance-only models to final-stage, detailed register-transfer-level (RTL) models. Hierarchical modeling requires the use of an elaborate validation methodology to ensure inter- and intra-level model integrity. The RTL model, often coded in a hardware description language (e.g., Verilog or VHDL), captures the logical behavior of the entire chip, both in terms of function and cycle-by-cycle pipeline flow timing. It is this model that is subjected to simulation-based architectural validation prior to actual "tape-out" of the processor.
The validated RTL specification is used as the source reference model for synthesizing the gate- and circuit-level descriptions of the processor.}, number={3}, journal={IEEE MICRO}, author={Bose, P and Conte, TM and Austin, TM}, year={1999}, pages={9–14} } @article{conte_hwu_smotherman_1999b, title={Special issue: 30th Annual ACM/IEEE International Symposium on Microarchitecture, Part I}, volume={27}, ISSN={["0885-7458"]}, DOI={10.1023/A:1018745822603}, number={5}, journal={INTERNATIONAL JOURNAL OF PARALLEL PROGRAMMING}, author={Conte, T and Hwu, WM and Smotherman, M}, year={1999}, month={Oct}, pages={325–326} } @article{conte_hirsch_hwu_1998, title={Combining trace sampling with single pass methods for efficient cache simulation}, volume={47}, ISSN={["0018-9340"]}, DOI={10.1109/12.689650}, abstractNote={The design of the memory hierarchy is crucial to the performance of high performance computer systems. The incorporation of multiple levels of caches into the memory hierarchy is known to increase the performance of high end machines, but the development of architectural prototypes of various memory hierarchy designs is costly and time-consuming. In this paper, we describe a single pass method used in combination with trace sampling techniques to produce a fast and accurate approach for simulating multiple sizes of caches simultaneously.}, number={6}, journal={IEEE TRANSACTIONS ON COMPUTERS}, author={Conte, TM and Hirsch, MA and Hwu, WMW}, year={1998}, month={Jun}, pages={714–720} } @article{banerjia_sathaye_menezes_conte_1998, title={MPS: Miss-path scheduling for multiple-issue processors}, volume={47}, ISSN={["1557-9956"]}, DOI={10.1109/12.737684}, abstractNote={Many contemporary multiple-issue processors employ out-of-order scheduling hardware in the processor pipeline. Such scheduling hardware can yield good performance without relying on compile-time scheduling. The hardware can also schedule around unexpected run-time occurrences such as cache misses. As issue widths increase, however, the complexity of such scheduling hardware increases considerably and can have an impact on the cycle time of the processor. This paper presents the design of a multiple-issue processor that uses an alternative approach called miss-path scheduling, or MPS. Scheduling hardware is removed from the processor pipeline altogether and placed on the path between the instruction cache and the next level of memory. Scheduling is performed at cache miss time as instructions are received from memory. Scheduled blocks of instructions are issued to an aggressively clocked in-order execution core. Details of a hardware scheduler that can perform speculation are outlined and shown to be feasible. Performance results from simulations are presented that highlight the effectiveness of an MPS design.}, number={12}, journal={IEEE TRANSACTIONS ON COMPUTERS}, author={Banerjia, S and Sathaye, SW and Menezes, KN and Conte, TM}, year={1998}, month={Dec}, pages={1382–1397} } @article{bose_conte_1998, title={Performance analysis and its impact on design}, volume={31}, ISSN={["1558-0814"]}, DOI={10.1109/2.675632}, abstractNote={Methods for designing new computer systems have changed rapidly. Consider general purpose microprocessors: gone are the days when one or two expert architects would use hunches, experience, and rules of thumb to determine a processor's features.
Marketplace competition has long since forced companies to replace this ad hoc process with a targeted and highly systematic process that focuses new designs on specific workloads. Although the process differs from company to company, there are common elements. The main advantage of a systematic process is that it produces a finely tuned design targeted at a particular market. At its core are models of the processor's performance and its workloads. Developing and verifying these models is the domain now called performance analysis. We cover some of the advances in dealing with modern problems in performance analysis. Our focus is on architectural performance, typically measured in cycles per instruction.}, number={5}, journal={COMPUTER}, author={Bose, P and Conte, TM}, year={1998}, month={May}, pages={41–49} } @article{jennings_conte_1998, title={Subword extensions for video processing on mobile systems}, volume={6}, ISSN={["1092-3063"]}, DOI={10.1109/4434.708250}, abstractNote={Providing video-over-wireless capability to mobile computing platforms results in several interesting challenges. Wireless networks provide less transmission bandwidth than hard-wired networks. Because today's wireless local area network technology can provide only around 2 Mbps transmission rates, video compression is essential for transmitting to mobile devices. Due to increased user sensitivity to cost and power consumption, mobile computing platforms prefer a host processor-only solution, as opposed to a host processor in conjunction with a digital signal processor. Most general purpose microprocessor architectures have recently extended their instruction set architectures to include parallel instructions for improved performance on multimedia applications, including MPEG (Motion Pictures Expert Group) video. The article highlights the features of several of these extended ISAs for processing MPEG video. Each uses a modified single instruction, multiple data execution model as a technique to enable concurrent execution. In the modified micro SIMD execution model, a single instruction initiates parallel execution on data organized in parallel. The article illustrates the micro SIMD execution of an add instruction. Micro SIMD execution using packed data types (with byte, half word, or word quantities) makes more efficient use of the processor data path for 64 or 128 bit architectures. We refer to this particular form of micro SIMD execution as subword execution.}, number={3}, journal={IEEE CONCURRENCY}, author={Jennings, MD and Conte, TM}, year={1998}, pages={13–16} } @inproceedings{ozer_banerjia_conte_1998, title={Unified assign and schedule: A new approach to scheduling for clustered register file microarchitectures}, booktitle={Proceedings, 31st Annual ACM/IEEE International Symposium on Microarchitecture: November 30-December 2, 1998, Dallas, Texas / co-sponsored by ACM SIGMICRO, IEEE Computer Society Technical Committee on Microprogramming and Microarchitecture}, publisher={Los Alamitos, Calif.: IEEE Computer Society Press}, author={Ozer, E. and Banerjia, S. and Conte, T. M.}, year={1998}, pages={308–315} } @article{fu_jennings_larin_conte_1998, title={Value speculation scheduling for high performance processors}, volume={33}, ISSN={["1558-1160"]}, DOI={10.1145/291006.291058}, abstractNote={Recent research in value prediction shows a surprising amount of predictability for the values produced by register-writing instructions.
Several hardware-based value predictor designs have been proposed to exploit this predictability by eliminating flow dependencies for highly predictable values. This paper proposes a hardware- and software-based scheme for value speculation scheduling (VSS). Static VLIW scheduling techniques are used to speculate value-dependent instructions by scheduling them above the instructions whose results they depend on. Prediction hardware is used to provide value predictions, allowing the execution of speculated instructions to continue. In the case of mispredicted values, control flow is redirected to patch-up code so that execution can proceed with the correct results. In this paper, experiments in VSS for load operations in the SPECint95 benchmarks are performed. Speedups of up to 17% are shown using VSS. Empirical results on the value predictability of loads, based on value profiling data, are also provided.}, number={11}, journal={ACM SIGPLAN NOTICES}, author={Fu, CY and Jennings, MD and Larin, SY and Conte, TM}, year={1998}, month={Nov}, pages={262–271} } @article{schlansker_conte_dehnert_ebcioglu_fang_thompson_1997, title={Compilers for instruction-level parallelism}, volume={30}, ISSN={["0018-9162"]}, DOI={10.1109/2.642817}, abstractNote={Discovering and exploiting instruction-level parallelism in code will be key to future increases in microprocessor performance. What technical challenges must compiler writers meet to better use ILP? Instruction-level parallelism allows a sequence of instructions derived from a sequential program to be parallelized for execution on multiple pipelined functional units. If industry acceptance is a measure of importance, ILP has blossomed. It now profoundly influences the design of almost all leading-edge microprocessors and their compilers. Yet the development of ILP is far from complete, as research continues to find better ways to use more hardware parallelism over a broader class of applications.}, number={12}, journal={COMPUTER}, author={Schlansker, M and Conte, TM and Dehnert, J and Ebcioglu, K and Fang, JZ and Thompson, CL}, year={1997}, month={Dec}, pages={63–69} } @article{conte_sathaye_1997, title={Optimization of VLIW compatibility systems employing dynamic rescheduling}, volume={25}, ISSN={["1573-7640"]}, DOI={10.1007/BF02700048}, number={2}, journal={INTERNATIONAL JOURNAL OF PARALLEL PROGRAMMING}, author={Conte, TM and Sathaye, SW}, year={1997}, month={Apr}, pages={83–112} }