@article{nagabhiru_byrd_2024, title={Achieving Forward Progress Guarantee in Small Hardware Transactions}, volume={23}, ISSN={["1556-6064"]}, url={https://doi.org/10.1109/LCA.2024.3370992}, DOI={10.1109/LCA.2024.3370992}, abstractNote={Hardware-transactional-memory (HTM) manages to pique interest from academia and industry alike because of its potential to ease concurrent-programming without compromising on performance. It offers a simple “all-or-nothing” idea to the programmer, making a piece of code appear atomic in hardware. Despite this and many elegant HTM implementations in research, only best-effort HTM is available commercially. Best-effort HTM lacks forward progress guarantee making it harder for the programmer to create a concurrent scalable fallback path. This has made HTM's adaptability limited. With a scope to support a myriad of applications, HTMs do a trade off between design and verification complexity vs forward progress guarantee. In this letter, we argue that limiting the scope of applications helps HTM attain guaranteed forward progress. We support lock-free programs by using HTM as multi-word-atomics and demonstrate strategic design choices to achieve lock-freedom completely in hardware. We use lfbench, a lock-free micro-benchmark-suite, and Arm's best-effort HTM (ARM_TME) on the gem5 simulator, as our base. We demonstrate the performance tradeoffs between design choices of a deferral-based, NACK-based, and NACK-with-backoff approaches. 
We show that NACK-with-backoff performs better than the others without compromising scalability for both read- and write-intensive applications.}, number={1}, journal={IEEE COMPUTER ARCHITECTURE LETTERS}, author={Nagabhiru, Mahita and Byrd, Gregory T.}, year={2024}, month={Jan}, pages={53–56} } @inproceedings{ibrahim_bronn_byrd_2023, title={Crosstalk-Based Parameterized Quantum Circuit Approximation}, DOI={10.1109/QCE57702.2023.00014}, abstractNote={In this paper, we propose an ansatz approximation approach for variational quantum algorithms (VQAs) that uses one of the hardware's main attributes, its crosstalk behavior, as its main approximation driver. By utilizing crosstalk-adaptive scheduling, we are able to apply a circuit-level approximation/optimization to our ansatz. Our design procedure involves first characterizing the hardware's crosstalk and then approximating the circuit by a desired level of crosstalk mitigation, all while effectively reducing its duration and gate counts. We demonstrate the effect of crosstalk mitigation on expressibility, trainability, and entanglement: key components that drive the utility of parameterized circuits. We tested our approach on real quantum hardware against a base configuration, and our results showed superior performance for the circuit-level optimized ansatz over a base ansatz for two quantum chemistry benchmarks. We take into consideration that applications vary in their response to crosstalk, and we believe that this approximation strategy can be used to create ansatze that are expressive, trainable, and with crosstalk mitigation levels tailored for specific workloads.}, booktitle={IEEE International Conference on Quantum Computing and Engineering}, author={Ibrahim, Mohannad and Bronn, Nicholas T. and Byrd, Gregory T.}, editor={Bronn, Nicholas T. 
and Byrd, GregoryEditors}, year={2023}, pages={39–50} } @article{ahmad_earnest-noble_byrd_2023, title={Exploring Architecture of Qiskit Runtime for Educational Enablement}, DOI={10.1109/QCE57702.2023.20331}, abstractNote={A multitude of programming languages and frame-works exist for quantum computing today (Qiskit, Intel Quantum SDK, Pennylane, Cirq, Microsoft Quantum SDK, Yao.jl, etc.). Simultaneously, a given language or framework plays a vital role in a particular area of STEM education, and focuses on specialised features that can benefit specific domain(s) of computing & science. Accessing quantum computing compute resources from a software framework requires the development of integrations for every hardware platform, which becomes a challenge in the long run. The Qiskit Runtime environment is composed of Primitives: Sampler/Estimator, Sessions, and a rich set of Error mitigation techniques and compiler optimization methods. In this paper, we show how the recent architecture of Qiskit Runtime environment, now accepting QASM strings into REST API calls, starts to bridge the gap between programming languages & quantum computing frameworks. More specifically, we highlight the students' needs of interfacing between languages & frameworks, along with the needed development for an individual to generate accepted inputs which complete a quantum program.}, journal={2023 IEEE INTERNATIONAL CONFERENCE ON QUANTUM COMPUTING AND ENGINEERING, QCE}, author={Ahmad, Syed Farhan and Earnest-Noble, Nate and Byrd, Gregory T.}, editor={Byrd, Gregory and Earnest-Noble, NateEditors}, year={2023}, pages={112–118} } @article{elnawawy_tuck_byrd_2023, title={PreFlush: Lightweight Hardware Prediction Mechanism for Cache Line Flush and Writeback}, ISSN={["1089-795X"]}, DOI={10.1109/PACT58117.2023.00015}, abstractNote={Non-Volatile Main Memory (NVMM) technologies make it possible for applications to permanently store data in memory. 
To do so, they need to make sure that updates to persistent data comply with the crash consistency model, which often involves explicitly flushing a dirty cache line after a store and then waiting for the flush operation to complete using a store fence. While cache line flush and write back instructions can complete in the background, fence instructions expose the latency of flushing to the critical path of the program's execution, incurring significant overheads. If flush operations are started earlier, the penalty of fences can be significantly reduced. We propose PreFlush, a lightweight and transparent hardware mechanism that predicts when a cache line flush or write back is needed and speculatively performs the operation early. Since we speculatively perform the flush, we add hardware to handle flush misspeculation to ensure correct execution of the code without the need for any complex recovery mechanisms. Our PreFlush design is transparent to the programmer (i.e. it requires no modification on existing NVMM-enabled code). Our results show that PreFlush can improve performance by up to 25% (15.7% average) for the WHISPER NVM benchmark suite and loop-based matrix microbenchmarks.}, journal={2023 32ND INTERNATIONAL CONFERENCE ON PARALLEL ARCHITECTURES AND COMPILATION TECHNIQUES, PACT}, author={Elnawawy, Hussein and Tuck, James and Byrd, Gregory T.}, editor={Tuck, James and Byrd, GregoryEditors}, year={2023}, pages={74–85} } @article{byrd_ding_2023, title={Quantum Computing: Progress and Innovation}, volume={56}, ISSN={["1558-0814"]}, url={https://doi.org/10.1109/MC.2022.3217021}, DOI={10.1109/MC.2022.3217021}, abstractNote={The field of quantum computers has reached the point of having real systems available for experimentation to a broad community. 
In this article, we review recent developments in hardware technology, outline challenges and innovations in quantum systems, and discuss efforts to cultivate and educate a quantum-ready community.}, number={1}, journal={COMPUTER}, author={Byrd, Gregory T. and Ding, Yongshan}, year={2023}, month={Jan}, pages={20–29} } @article{nagabhiru_byrd_2023, title={lfbench: a lock-free microbenchmark suite}, DOI={10.1109/ISPASS57527.2023.00040}, abstractNote={In this work, we present lfbench: a microbenchmark suite intended as a one-stop shop representing all the popular lock-free data structures. Lock-free programming is very complex and so hard that there hasn’t been a generalized lockfree algorithm designed; instead, lock-free data structures are individually developed and optimized for the specific use-cases. In spite of this difficulty, lock-free programs are indispensable; OS kernel codes, popular databases, networking buffers, and so forth, all rely on lock-free data structures for the performance and scalability they provide. We attempt for the first time to bring all the popular lock-free data structures under one roof, primarily to enable development of new WW semantics needed for easy lock-free programming and help evaluate the same. 
Additionally, the benchmark suite can be used for:1)Performance analysis of any new S/W algorithms/ libraries developed.2)Building blocks for complex multi-threaded applications.}, journal={2023 IEEE INTERNATIONAL SYMPOSIUM ON PERFORMANCE ANALYSIS OF SYSTEMS AND SOFTWARE, ISPASS}, author={Nagabhiru, Mahita and Byrd, Greg}, year={2023}, pages={322–324} } @inproceedings{mughrabi_byrd_2022, title={CAPI-Precis: Towards a Compute-Centric Interface for Coherent Shared Memory Accelerators}, url={http://dx.doi.org/10.1109/icfpt56656.2022.9974504}, DOI={10.1109/icfpt56656.2022.9974504}, abstractNote={Emerging shared memory accelerator interfaces promote a tighter coupling between traditional general-purpose processing cores and accelerator units through cache-coherence and shared virtual address space capabilities. However, different interface standards solving similar problems often require custom designs and optimizations depending on the adopted interface. This work introduces CAPI-Precis, an abstract layer between CAPI, a cache-coherent interface standard proposed by IBM, and the Accelerator Functional Unit (AFU). CAPI-Precis provides a Compute-Centric FIFO-based paradigm with the shared memory accelerator interface, hiding CAPI complexities and latency requirements in an abstract layer focusing on optimized, efficient, and scalable AFUs. Such a layer adapts to other shared memory interfaces, such as CCIX or CXL, with minimal overhead in area and performance while preserving the algorithm logic design.}, booktitle={2022 International Conference on Field-Programmable Technology (ICFPT)}, publisher={IEEE}, author={Mughrabi, Abdullah T. 
and Byrd, Gregory T.}, year={2022}, month={Dec} } @article{ibrahim_mohammadbagherpoor_rios_bronn_byrd_2022, title={Evaluation of Parameterized Quantum Circuits with Cross-Resonance Pulse-Driven Entanglers}, url={https://doi.org/10.1109/TQE.2022.3231124}, DOI={10.1109/TQE.2022.3231124}, abstractNote={Variational quantum algorithms (VQAs) have emerged as a powerful class of algorithms that is highly suitable for noisy quantum devices. Therefore, investigating their design has become key in quantum computing research. Previous works have shown that choosing an effective parameterized quantum circuit (PQC) or ansatz for a VQA is crucial to its overall performance, especially on near-term devices. In this article, we utilize pulse-level access to quantum machines, our understanding of their two-qubit interactions, and, more importantly, our knowledge of VQAs, to customize the design of two-qubit entanglers. Our analysis shows that utilizing customized pulse gates for ansatze reduces state preparation times by more than half, maintains expressibility relative to standard ansatze, and produces PQCs that are more trainable through local cost function analysis. Our algorithm performance results show that in three cases, our PQC configuration outperforms the base implementation. Experiments using IBM Quantum hardware demonstrate that our pulse-based PQC configurations are more capable of solving MaxCut and chemistry problems compared to a standard configuration.}, journal={IEEE Transactions on Quantum Engineering}, author={Ibrahim, Mohannad M. and Mohammadbagherpoor, Hamed and Rios, Cynthia and Bronn, Nicholas T. and Byrd, Gregory T.}, year={2022} } @book{stancil_byrd_2022, title={Principles of Superconducting Quantum Computers}, journal={Wiley}, publisher={Wiley}, author={Stancil, Daniel D. 
and Byrd, Gregory}, year={2022} } @article{ibrahim_mohammadbagherpoor_rios_bronn_byrd_2022, title={Pulse-Level Optimization of Parameterized Quantum Circuits for Variational Quantum Algorithms}, url={https://arxiv.org/abs/2211.00350}, DOI={10.48550/ARXIV.2211.00350}, abstractNote={Variational Quantum Algorithms (VQAs) have emerged as a powerful class of algorithms that is highly suitable for noisy quantum devices. Therefore, investigating their design has become key in quantum computing research. Previous works have shown that choosing an effective parameterized quantum circuit (PQC) or ansatz for VQAs is crucial to their overall performance, especially on near-term devices. In this paper, we utilize pulse-level access to quantum machines and our understanding of their two-qubit interactions to optimize the design of two-qubit entanglers in a manner suitable for VQAs. Our analysis results show that pulse-optimized ansatze reduce state preparation times by more than half, maintain expressibility relative to standard PQCs, and are more trainable through local cost function analysis. Our algorithm performance results show that in three cases, our PQC configuration outperforms the base implementation. Our algorithm performance results, executed on IBM Quantum hardware, demonstrate that our pulse-optimized PQC configurations are more capable of solving MaxCut and Chemistry problems compared to a standard configuration.}, publisher={arXiv}, author={Ibrahim, Mohannad and Mohammadbagherpoor, Hamed and Rios, Cynthia and Bronn, Nicholas T. and Byrd, Gregory T.}, year={2022}, month={Nov} } @article{mughrabi_ibrahim_byrd_2021, title={QPR: Quantizing PageRank with Coherent Shared Memory Accelerators}, ISSN={["1530-2075"]}, DOI={10.1109/IPDPS49936.2021.00105}, abstractNote={Graph algorithms often require fine-grained, random access across substantially large data structures. 
Previous work on FPGA-based acceleration has required significant preprocessing and restructuring to transform the memory access patterns into a streaming format that is more friendly to of fchip hardware. However, the emergence of cache-coherent shared memory interfaces, such as CAPI, allows designers to more easily work with the natural in-memory organization of the data. This paper introduces a vertex-centric shared-memory accelerator for the PageRank algorithm, optimized for high performance while effectively using coherent caching on the FPGA hardware. The proposed design achieves up to 14.9x speedups by selectively caching graph data for the accelerator while taking into account locality and reuse, compared to naively using the shared address space access and DRAM only. We also introduce PageRank Quantization, an innovative technique to represent page-ranks with 32-bit quantized fixed-point values. This approach is up to 1.5x faster than 64-bit fixed-point while keeping precision within a tolerable error margin. As a result, we maintain both the hardware scalability of fixed-point representation and the cache performance of 32-bit floating-point.}, journal={2021 IEEE 35TH INTERNATIONAL PARALLEL AND DISTRIBUTED PROCESSING SYMPOSIUM (IPDPS)}, author={Mughrabi, Abdullah T. and Ibrahim, Mohannad and Byrd, Gregory T.}, year={2021}, pages={962–972} } @inproceedings{liu_byrd_zhou_2020, place={New York, NY, USA}, title={Quantum Circuits for Dynamic Runtime Assertions in Quantum Computation}, url={http://www.scopus.com/inward/record.url?eid=2-s2.0-85082381669&partnerID=MN8TOARS}, DOI={10.1145/3373376.3378488}, abstractNote={In this paper, we propose quantum circuits for runtime assertions, which can be used for both software debugging and error detection. Runtime assertion is challenging in quantum computing for two key reasons. First, a quantum bit (qubit) cannot be copied, which is known as the non-cloning theorem. 
Second, when a qubit is measured, its superposition state collapses into a classical state, losing the inherent parallel information. In this paper, we overcome these challenges with runtime computation through ancilla qubits, which are used to indirectly collect the information of the qubits of interest. We design quantum circuits to assert classical states, entanglement, and superposition states. Our experimental results show that they are effective in debugging as well as improving the success rate for various quantum algorithms on IBM Q quantum computers.}, booktitle={ASPLOS '20: Proceedings of the Twenty-Fifth International Conference on Architectural Support for Programming Languages and Operating Systems}, publisher={Association for Computing Machinery}, author={Liu, J. and Byrd, G. and Zhou, H.}, year={2020}, pages={1017–1030} } @inproceedings{elnawawy_chowdhury_awad_byrd_2019, title={Diligent TLBs: a mechanism for exploiting heterogeneity in TLB miss behavior}, url={http://dx.doi.org/10.1145/3330345.3330363}, DOI={10.1145/3330345.3330363}, abstractNote={Modern workloads such as graph analytics, sparse matrix multiplication, and in-memory key-value stores use very large datasets and typically have non-uniform memory access patterns which defy traditional concepts of locality. Moreover, many of these algorithms simultaneously use multiple data structures that have very distinct access patterns to the corresponding pages, leading to heterogeneity in TLB behavior. Our intuition suggests that these two factors make it important to architect a heterogeneity-aware TLB hierarchy. Our results confirm the existence of heterogeneity in TLB behavior, where a few pages have high reuse but poor temporal locality. These pages are responsible for a significant percentage of the TLB misses (e.g. over 15% of the TLB misses result from only 17 pages, which is 0.04% of the total number of pages, for Canneal kernel). 
In this paper, we propose Diligent TLBs (Di-TLBs), a novel hardware-software co-design for TLBs that identifies such delinquent page mappings by tracking their reuse behavior and pinning them in the TLBs to reduce misses. We show that Di-TLBs reduce TLB misses by up to 24.93% on average while improving performance by up to 9.13% on average for a collection of memory-intensive workloads.}, booktitle={Proceedings of the ACM International Conference on Supercomputing}, author={Elnawawy, Hussein and Chowdhury, Rangeen Basu Roy and Awad, Amro and Byrd, Gregory T.}, year={2019}, month={Jun}, pages={195–205} } @article{kumar_singh_byrd_2019, title={Hybrid Remote Access Protocol}, volume={18}, url={http://www.scopus.com/inward/record.url?eid=2-s2.0-85061059738&partnerID=MN8TOARS}, DOI={10.1109/lca.2019.2896116}, abstractNote={The invalidation-based cache coherence protocols used in current CMPs result in inefficient utilization of cache hierarchy in the presence of heavy sharing, since a significant percentage of shared cached data is invalidated soon after it is brought into the private cache. This work presents an analysis of a shared memory cache coherence protocol; based on novel insights from the analysis, we advocate direct remote reads/writes at the shared last-level cache for heavily contended data. Evaluation of our proposed protocol with the Splash2x kernels shows 17 percent geometric mean speedup over traditional MESI coherence and 8.5 percent better performance than prior remote-access proposals.}, number={1}, journal={IEEE Computer Architecture Letters}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Kumar, Chanchal and Singh, Sidharth and Byrd, Gregory T.}, year={2019}, month={Jan}, pages={30–33} } @article{mueller_byrd_dreher_2019, title={Programming Quantum Computers: A Primer with IBM Q and D-Wave Exercises}, DOI={10.1145/3293883.3302578}, abstractNote={This tutorial provides a hands-on introduction to quantum computing. 
It will feature the three pillars, architectures, programming, and algorithms/applications of quantum computing. Its focus is on the applicability of problems to quantum computing from a practical point, with only the necessary foundational coverage of the physics and theoretical aspects to understand quantum computing. Simulation software will be utilized complemented by access to actual quantum computers to prototype problem solutions. This should develop a better understanding of how problems are transformed into quantum algorithms and what programming language support is best suited for a given application area. As a first of its kind, to the best of our knowledge, the tutorial includes hands-on programming experience with IBM Q and D-Wave hardware.}, journal={PROCEEDINGS OF THE 24TH SYMPOSIUM ON PRINCIPLES AND PRACTICE OF PARALLEL PROGRAMMING (PPOPP '19)}, author={Mueller, Frank and Byrd, Greg and Dreher, Patrick}, year={2019}, pages={451–451} } @article{zhou_byrd_2019, title={Quantum Circuits for Dynamic Runtime Assertions in Quantum Computation}, volume={18}, ISSN={1556-6056 1556-6064 2473-2575}, url={http://dx.doi.org/10.1109/LCA.2019.2935049}, DOI={10.1109/LCA.2019.2935049}, abstractNote={In this paper, we propose quantum circuits for runtime assertions, which can be used for both software debugging and error detection. Runtime assertion is challenging in quantum computing for two key reasons. First, a quantum bit (qubit) cannot be copied, which is known as the non-cloning theorem. Second, when a qubit is measured, its superposition state collapses into a classical state, losing the inherent parallel information. In this paper, we overcome these challenges with runtime computation through ancilla qubits, which are used to indirectly collect the information of the qubits of interest. 
We design quantum circuits to assert classical states, entanglement, and superposition states.}, number={2}, journal={IEEE Computer Architecture Letters}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Zhou, Huiyang and Byrd, Gregory T.}, year={2019}, month={Jul}, pages={111–114} } @article{willis_byrd_johnson_2017, title={Challenge-Based Learning}, volume={50}, url={http://www.scopus.com/inward/record.url?eid=2-s2.0-85024478317&partnerID=MN8TOARS}, DOI={10.1109/mc.2017.216}, abstractNote={Challenges and competitions offer a compelling platform for engaging students and lifelong learners in new technologies and skill development. This special issue explores a sampling of challenge-based approaches to education and community outreach.}, number={7}, journal={Computer}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Willis, Scooter and Byrd, Greg and Johnson, Brian David}, year={2017}, pages={13–16} } @article{snyder_byrd_2017, title={The Internet of Everything}, volume={50}, ISSN={["1558-0814"]}, url={http://www.scopus.com/inward/record.url?eid=2-s2.0-85020897168&partnerID=MN8TOARS}, DOI={10.1109/mc.2017.179}, abstractNote={(IoE) demands an intelligent network – a distributed, application-centric networking, computing and storage platform that connects people, processes, data and things in ways that just weren't possible, or even imaginable, in healthcare before.}, number={6}, journal={COMPUTER}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Snyder, Tom and Byrd, Greg}, year={2017}, month={Jun}, pages={8–9} } @article{weaver_byrd_bryce_2016, title={Computing Tools and Techniques for Emergency Response}, volume={49}, ISSN={["1558-0814"]}, DOI={10.1109/mc.2016.128}, abstractNote={From social networks to autonomous robotics, computing technologies improve our ability to quickly and effectively respond to emergencies. 
This article includes a sidebar entitled, "Supporting Disaster Volunteers from the Internet," by Dai Sato, which describes the development and use of large-scale online volunteer activities in Japan.}, number={5}, journal={COMPUTER}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Weaver, Alfred C. and Byrd, Greg and Bryce, Renee}, year={2016}, month={May}, pages={16–18} } @article{byrd_2016, title={Home Sweet Mind-Controlled Home}, volume={49}, ISSN={["1558-0814"]}, url={http://www.scopus.com/inward/record.url?eid=2-s2.0-84970024073&partnerID=MN8TOARS}, DOI={10.1109/mc.2016.140}, abstractNote={Students at Colorado State University built a virtual reality prototype for experimenting with cognitive control of connected household devices.}, number={5}, journal={COMPUTER}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Byrd, Greg}, year={2016}, month={May}, pages={98–101} } @article{byrd_2016, title={IEEE/IBM Watson Student Showcase}, volume={49}, ISSN={["1558-0814"]}, url={http://www.scopus.com/inward/record.url?eid=2-s2.0-84962048157&partnerID=MN8TOARS}, DOI={10.1109/mc.2016.13}, abstractNote={The Miface project is using crowdsourcing to build an extensive database of semantically tagged facial expressions. Developed by students at NYU, the Web application uses IBM Watson's Tone Analyzer module to refine user labels for each expression. The end result will be to enrich the ability of computational agents to understand and generate meaningful nonverbal cues for human interaction. 
The Web extra at https://youtu.be/kvdlR41M28c is a video of the Miface project, which is using crowdsourcing to build an extensive database of semantically-tagged facial expressions.}, number={1}, journal={COMPUTER}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Byrd, Greg}, year={2016}, month={Jan}, pages={102–104} } @article{byrd_2016, title={Immortal Bits: Managing Our Digital Legacies}, volume={49}, ISSN={["1558-0814"]}, url={http://www.scopus.com/inward/record.url?eid=2-s2.0-84963877869&partnerID=MN8TOARS}, DOI={10.1109/mc.2016.71}, abstractNote={An Ulster University student designed a Website to help manage and deliver digital assets after death.}, number={3}, journal={COMPUTER}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Byrd, Greg}, year={2016}, month={Mar}, pages={100–103} } @article{byrd_2016, title={Let the Sun Shine}, volume={49}, ISSN={["1558-0814"]}, url={http://www.scopus.com/inward/record.url?eid=2-s2.0-84978712532&partnerID=MN8TOARS}, DOI={10.1109/mc.2016.205}, abstractNote={Clemson University students designed an inexpensive sensor node and cloud computing infrastructure to collect real-time, localized solar irradiation data. 
This data can be used by consumers and utilities to predict the availability of solar-generated electricity and to manage its use.}, number={7}, journal={COMPUTER}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Byrd, Greg}, year={2016}, month={Jul}, pages={94–97} } @article{byrd_2016, title={Seeing Is Understanding}, volume={49}, ISSN={["1558-0814"]}, url={http://www.scopus.com/inward/record.url?eid=2-s2.0-84987630575&partnerID=MN8TOARS}, DOI={10.1109/mc.2016.266}, abstractNote={To help teach object-oriented programming, students at King Abdulaziz University in Saudi Arabia created a self-paced, interactive program that associates code with visual cues to reinforce the concepts of inheritance and polymorphism.}, number={9}, journal={COMPUTER}, author={Byrd, Greg}, year={2016}, month={Sep}, pages={94–97} } @article{byrd_2016, title={Tactile Digital Braille Display}, volume={49}, ISSN={["1558-0814"]}, url={http://www.scopus.com/inward/record.url?eid=2-s2.0-84997079346&partnerID=MN8TOARS}, DOI={10.1109/mc.2016.327}, abstractNote={Students at North Carolina State University enhanced Polymer Braille's multiline braille display by adding new interactive features, additional rows of characters, and a mobile-device interface.}, number={11}, journal={COMPUTER}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Byrd, Greg}, year={2016}, month={Nov}, pages={88–90} } @article{byrd_2015, title={21st Century Pong}, volume={48}, ISSN={["1558-0814"]}, url={http://www.scopus.com/inward/record.url?eid=2-s2.0-84959421337&partnerID=MN8TOARS}, DOI={10.1109/mc.2015.306}, abstractNote={Cornell University students built a system that tracks a Ping-Pong ball in real time and keeps score. The Web extra at https://youtu.be/r7VtgzPPYy4 is a video demonstration in which Cornell students Pol Rosello (CS), Taylor Pritchard (ECE), and Frank Xie (ECE), describe and demonstrate their Table Tennis Tracker system. 
The system analyzes a video stream to track the location of the ball and to automatically keep score. Video provided by Dr. Bruce Land.}, number={10}, journal={COMPUTER}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Byrd, Greg}, year={2015}, month={Oct}, pages={80–84} } @article{byrd_2015, title={A Little Ingenuity Solves an Elephant-Sized Problem}, volume={48}, ISSN={["1558-0814"]}, url={http://www.scopus.com/inward/record.url?eid=2-s2.0-84928143216&partnerID=MN8TOARS}, DOI={10.1109/mc.2015.94}, abstractNote={A student team from NC State has designed a collar to help control wild elephants that threaten human property and life in Africa and Asia. The Web extra at https://youtu.be/aQV_BkOL4vA is a video showing how students from North Carolina State University designed a collar to keep elephants away from farms and villages. In this field test in South Africa, a buzzing sound from the collar causes an elephant to turn away from the protected area. Video by Emma Besaw. The second Web extra at http://youtu.be/ap1dSwCc6fY is a video in which editor Greg Byrd introduces the new Student Design Showcase column, dedicated to innovative, interesting student projects from computer science and engineering.}, number={4}, journal={COMPUTER}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Byrd, Greg}, year={2015}, month={Apr}, pages={74–77} } @article{byrd_2015, title={Cycling Through Cyberspace}, volume={48}, ISSN={["1558-0814"]}, url={http://www.scopus.com/inward/record.url?eid=2-s2.0-84939488306&partnerID=MN8TOARS}, DOI={10.1109/mc.2015.220}, abstractNote={Students at the University of Brasília set out to build a better exercise bike. 
The Web extra at http://youtu.be/NCSb_sDJL7c is a video demonstration of the Bike-X simulator, a virtual cycling experience designed by engineering students at the University of Brasília's Gama Campus.}, number={8}, journal={COMPUTER}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Byrd, Greg}, year={2015}, month={Aug}, pages={72–75} } @article{byrd_2015, title={Spotlighting Student Innovation}, volume={48}, ISSN={["1558-0814"]}, url={http://www.scopus.com/inward/record.url?eid=2-s2.0-84923869694&partnerID=MN8TOARS}, DOI={10.1109/mc.2015.54}, abstractNote={This new column provides a space for undergraduates in computer engineering and science to share their capstone project designs.}, number={2}, journal={COMPUTER}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Byrd, Greg}, year={2015}, month={Feb}, pages={75–76} } @article{byrd_2015, title={Tracking Cows Wirelessly}, volume={48}, ISSN={["1558-0814"]}, url={http://www.scopus.com/inward/record.url?eid=2-s2.0-84933557206&partnerID=MN8TOARS}, DOI={10.1109/mc.2015.154}, abstractNote={A student team from NC State designed and built a prototype wireless network to monitor the milking and weighing of cows.}, number={6}, journal={COMPUTER}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Byrd, Greg}, year={2015}, month={Jun}, pages={60–63} } @article{reza_byrd_2013, title={Reducing Migration-Induced Misses In An Over-Subscribed Multiprocessor System}, volume={23}, url={http://www.scopus.com/inward/record.url?eid=2-s2.0-84875530850&partnerID=MN8TOARS}, DOI={10.1142/s0129626413500060}, abstractNote={In a large multiprocessor server platform using multicore chips, the scheduler often migrates a thread or process, in order to achieve better load balancing or ensure fairness among competing scheduling entities. 
Each migration incurs a severe performance impact from the loss of cache and Translation Lookaside Buffer (TLB) footprints and subsequent higher cache misses and page walks. Such impact is likely to be more severe in virtualized environments, where high over-subscription of CPUs is very common for server consolidation workloads or virtual desktop infrastructure deployment, causing frequent migrations and context switches. We demonstrate the performance benefit of preserving a portion of L2 cache—in particular, MRU cache lines—and warming the destination L2 cache by prefetching those cache lines under different migration scenarios. We observed a 1.5-27% reduction in CPI (cycles per instruction) following a migration. We also study the effectiveness of preserving TLB entries over a context switch or migration.}, number={01}, journal={Parallel Processing Letters}, publisher={World Scientific Pub Co Pte Lt}, author={REZA, SAJJID and BYRD, GREGORY T.}, year={2013}, month={Mar}, pages={1350006} } @inproceedings{byrd_schneider_chang_ozev_2013, title={Welcome to ICCD 2013!}, url={http://www.scopus.com/inward/record.url?eid=2-s2.0-84892493679&partnerID=MN8TOARS}, DOI={10.1109/ICCD.2013.6657012}, abstractNote={On behalf of the organizing and program committees, we welcome you to the 31st IEEE International Conference on Computer Design. ICCD is co-sponsored by the IEEE Computer Society and the IEEE Circuits and Systems Society. It covers a broad range of topics at all levels of computer system design, from logic and circuits to architecture and applications, including the tools and processes used for design, test, verification, and security. This makes it a fertile environment for cross-layer discussions and interactions.}, booktitle={2013 IEEE 31st International Conference on Computer Design, ICCD 2013}, author={Byrd, G. and Schneider, K. and Chang, N. 
and Ozev, S.},
  year = {2013}
}

@inproceedings{reza_byrd_2012,
  title = {Reducing Migration-induced Cache Misses},
  url = {http://www.scopus.com/inward/record.url?eid=2-s2.0-84867430979&partnerID=MN8TOARS},
  doi = {10.1109/ipdpsw.2012.215},
  abstractNote = {In a large multiprocessor server platform, using multicore chips, the scheduler often migrates a scheduling entity, i.e. a thread or process or virtual machine, in order to achieve better load balancing or ensure fairness. The migration impact is likely to be more severe in virtualized environments, where high over-subscription of logical CPUs is very common for server consolidation workloads or virtual desktop infrastructure deployment. We demonstrate the performance benefit of saving and restoring cached data during migration. In particular, we measure the efficiency (benefit per cache block) of saving various subsets of the cached data, in order to balance implementation cost and complexity with improvements in cycle time. We also describe an implementation that moves cached data when a thread migrates, and we show the benefits in terms of reduced misses and reduced processor cycles.},
  booktitle = {2012 IEEE 26th International Parallel and Distributed Processing Symposium Workshops \& PhD Forum},
  publisher = {IEEE},
  author = {Reza, Sajjid and Byrd, Gregory T.},
  year = {2012},
  month = may,
  pages = {1732--1741}
}

@inproceedings{tahar_byrd_schneider_bose_2012,
  title = {Welcome to {ICCD} 2012!},
  url = {http://www.scopus.com/inward/record.url?eid=2-s2.0-84872063468&partnerID=MN8TOARS},
  doi = {10.1109/ICCD.2012.6378605},
  booktitle = {Proceedings - IEEE International Conference on Computer Design: VLSI in Computers and Processors},
  author = {Tahar, S. and Byrd, G. and Schneider, K. 
and Bose, P.},
  year = {2012},
  pages = {11--20}
}

@inproceedings{grover_dhanotia_byrd_2011,
  title = {A Canonical Multicore Architecture for Network Routers},
  url = {http://www.scopus.com/inward/record.url?eid=2-s2.0-81255150500&partnerID=MN8TOARS},
  doi = {10.1109/ancs.2011.30},
  abstractNote = {There has been a significant increase in the Internet dynamics in the past decade. This has put tremendous pressure on the performance of routing protocols as they need to keep updating their routing information with every network change across the globe. With the growth of Internet, Border Gateway Protocol (BGP) has become a critical routing application. Good performance of BGP on network processors directly translates to better convergence time for route changes on the Internet, leading to reduced data loss on the network. BGP is the ubiquitous routing protocol on the Internet core, and hence analyzing its performance and exploring avenues for speeding it up can greatly help in improving the responsiveness and reliability of the Internet. In this paper, we investigate the use of multicore as the compute platform for routing protocols using BGP as a representative application. We discuss two different schemes for parallelizing BGP and analyze the performance of both serial and parallel BGP implementations on a fully configurable multicore simulation environment. Subsequently, we analyze the architectural bottlenecks in the conventional multicore systems which limit the speedup that can be achieved by software parallelism alone, and propose a canonical multicore architecture for routing protocols, which can be used for future routing processor designs. 
The analysis and proposed schemes in this paper would greatly help in understanding the behavior of BGP, thereby assisting in design and development of next generation network processors.},
  booktitle = {2011 ACM/IEEE Seventh Symposium on Architectures for Networking and Communications Systems},
  publisher = {IEEE},
  author = {Grover, Sabina and Dhanotia, Abhishek and Byrd, Gregory T.},
  year = {2011},
  month = oct,
  pages = {134--144}
}

@comment{Conference front matter (welcome message); the source record carries no page numbers.}
@inproceedings{gaydadjiev_tahar_byrd_schneider_2011,
  title = {Welcome to {ICCD} 2011!},
  url = {http://www.scopus.com/inward/record.url?eid=2-s2.0-83455217613&partnerID=MN8TOARS},
  doi = {10.1109/ICCD.2011.6081366},
  abstractNote = {On behalf of the organizing and program committee, we would like to welcome you to the 29th IEEE International Conference on Computer Design 2011. The International Conference on Computer Design (ICCD) encompasses a wide range of technical topics and provides an ideal environment to discuss practical and theoretical work that enables cross-pollination. The ICCD venue and program reflect this goal. This year the conference is being held at the beautiful campus of the University of Massachusetts at Amherst, United States.},
  booktitle = {Proceedings - IEEE International Conference on Computer Design: VLSI in Computers and Processors},
  author = {Gaydadjiev, G. and Tahar, S. and Byrd, G. and Schneider, K.},
  year = {2011}
}

@inproceedings{dhanotia_grover_byrd_2010,
  title = {Analyzing and scaling parallelism for network routing protocols},
  isbn = {9781424492978},
  url = {http://dx.doi.org/10.1109/iiswc.2010.5650317},
  doi = {10.1109/iiswc.2010.5650317},
  abstractNote = {The serial nature of legacy code in routing protocol implementations has inhibited a shift to multicore processing in the control plane, even though there is much inherent parallelism. In this paper, we investigate the use of multicore as the compute platform for routing applications using BGP, the ubiquitous protocol for routing in the Internet backbone, as a representative application. 
We develop a scalable multithreaded implementation for BGP and evaluate its performance on several multicore configurations using a fully configurable multicore simulation environment. We implement several optimizations at the software and architecture levels, achieving a speedup of 6.5 times over the sequential implementation, which translates to a throughput of ∼170K updates per second. Subsequently, we propose a generic architecture and parallelization methodology which can be applied to all routing protocol implementations to achieve significant performance improvement.},
  booktitle = {IEEE International Symposium on Workload Characterization (IISWC'10)},
  publisher = {IEEE},
  author = {Dhanotia, Abhishek and Grover, Sabina and Byrd, Greg},
  year = {2010},
  month = dec
}

@article{chiang_byrd_2009,
  title = {Adaptive aggregation tree transformation for energy-efficient query processing in sensor networks},
  volume = {6},
  issn = {1748-1287},
  url = {http://www.scopus.com/inward/record.url?eid=2-s2.0-77955749470&partnerID=MN8TOARS},
  doi = {10.1504/ijsnet.2009.028025},
  abstractNote = {Data aggregation reduces energy consumption by reducing the number of message transmissions in sensor networks. Effective aggregation requires that event messages be routed along common paths. While existing routing protocols provide many ways to construct the aggregation tree, this opportunistic style of aggregation is usually not optimal. The Minimal Steiner Tree (MST) maximises the possible degree of aggregation, but finding such a tree requires global knowledge of the network, which is not practical in sensor networks. In this paper, we propose the Adaptive Aggregation Tree (AAT) to dynamically transform the structure of the routing tree to improve the efficiency of data aggregation. It adapts to changes in the set of source nodes automatically, and approaches the cost savings of MST without explicit maintenance of an infrastructure. 
The evaluation results show that AAT reduces the communication energy consumption by 23%, compared to shortest-path tree, and by 31%, compared to GPSR.},
  number = {1},
  journal = {International Journal of Sensor Networks},
  publisher = {Inderscience Publishers},
  author = {Chiang, Mu-Huan and Byrd, Gregory T.},
  year = {2009},
  pages = {51--64}
}

@inproceedings{pant_byrd_2009,
  title = {Extending concurrency of transactional memory programs by using value prediction},
  url = {http://www.scopus.com/inward/record.url?eid=2-s2.0-84885764504&partnerID=MN8TOARS},
  doi = {10.1145/1531743.1531748},
  abstractNote = {Transactional Memory (TM) is an optimistic speculative synchronization scheme that provides atomic execution for a region of code marked as a transaction by the programmer. TM avoids many of the problems associated with lock-based synchronization and can make writing parallel programs relatively easier. Programs with critical sections that are not heavily contended benefit from the optimistic nature of TM systems. However, for heavily contended critical sections, performance can degrade due to conflicts leading to stalls and expensive rollbacks. In this paper, we look into the nature of the shared data involved in conflicts for TM systems. We find that most transactions have conflicts around a few shared addresses, and shared-conflicting data is often updated in a predictable manner by different transactions. We propose using a memory-level value predictor to capture this predictability for such data structures and increase overall concurrency by satisfying loads from conflicting transactions with predicted values, instead of stalling. In this paper, we present one possible design and implementation of a TM system with a value predictor. 
Our benchmark results show that the value predictor can capture this predictable behavior for most benchmarks and can improve performance of TM programs by improving concurrency and minimizing stalls and rollbacks.},
  booktitle = {Proceedings of the 6th ACM conference on Computing frontiers - CF '09},
  publisher = {ACM Press},
  author = {Pant, Salil Mohan and Byrd, Gregory T.},
  year = {2009},
  pages = {11--20}
}

@comment{The next entry originally reused the key pant_byrd_2009 of the entry closed above. BibTeX and Biber drop an entry whose key repeats an earlier one, so it gets a distinct key; the first occurrence keeps pant_byrd_2009 so existing citations still resolve.}
@inproceedings{pant_byrd_2009_limited_early_value,
  title = {Limited early value communication to improve performance of transactional memory},
  url = {http://www.scopus.com/inward/record.url?eid=2-s2.0-70449713893&partnerID=MN8TOARS},
  doi = {10.1145/1542275.1542334},
  abstractNote = {Parallel programming is receiving renewed attention with the advent of multi-core CPU architectures. The Transactional Memory (TM) paradigm has the potential to provide good speedup and make parallel programming easier to adopt. Under low contention, it has been shown that TM programs can outperform standard lock-based programs. However, under high contention, performance of TM programs can degrade. Previous work has shown that we can use either data forwarding or value prediction to improve performance under high contention. Both these techniques demand significant changes to the architecture and coherence protocol above and beyond those required by TM. In this work, we analyze and compare these approaches. Our objective is to find a solution that improves performance without needing significant hardware additions or changes to the coherence protocol. We observe that for most transactions conflicts are limited to only a few threads at a time. We design a system that uses this knowledge to reduce the hardware for a TM system that tries to avoid conflicts using early value communication. 
Our results show that we can get comparable performance of the proposed techniques with minimal extra hardware.},
  booktitle = {Proceedings of the 23rd International Conference on Supercomputing - ICS '09},
  publisher = {ACM Press},
  author = {Pant, Salil M. and Byrd, Gregory T.},
  year = {2009},
  pages = {421--429}
}

@inproceedings{altunay_byrd_brown_dean_2008,
  title = {An interaction-based access control model ({IBAC}) for collaborative services},
  url = {http://www.scopus.com/inward/record.url?eid=2-s2.0-50949123932&partnerID=MN8TOARS},
  doi = {10.1109/cts.2008.4543977},
  abstractNote = {A collaboration is a collection of services that work together to achieve a common goal. Although collaborations help when tackling difficult problems, they lead to security issues. First, a collaboration is often performed by services that are drawn from different security domains. Second, a service interacts with multiple peer services during the collaboration. These interactions are not isolated from one another - e.g., data may flow through a sequence of different services. As a result, a service is exposed to multiple peer services in varying degrees, leading to different security threats. We identify the types of interactions that can be present in collaborations, and discuss the security threats due to each type. We propose a model for representing the collaboration context so that a service can be made aware of the existing interactions. We provide an access control model for a service participating in a collaboration. We couple our access control model with a policy model, so that the access requirements from collaborations can be expressed and evaluated.},
  booktitle = {2008 International Symposium on Collaborative Technologies and Systems},
  publisher = {IEEE},
  author = {Altunay, Mine and Byrd, Gregory T. and Brown, Doug E. 
and Dean, Ralph A.},
  year = {2008},
  month = may,
  pages = {547--554}
}

@inproceedings{lim_byrd_2008,
  title = {Exploiting producer patterns and {L2} cache for timely dependence-based prefetching},
  doi = {10.1109/iccd.2008.4751935},
  abstractNote = {This paper proposes an architecture that efficiently prefetches for loads whose effective addresses are directly dependent on previously-loaded values. This dependence-based prefetching scheme covers most frequently missed loads in programs that contain linked data structures (LDS). For timely prefetches, memory access patterns of producing loads are dynamically learned. These patterns (such as strides) are used to prefetch well ahead of the consumer load. The proposed prefetcher is placed near the processor core and targets L1 cache misses, because removing L1 cache misses has greater performance potential than removing L2 cache misses. We also examine how to capture pointers in LDS with pure hardware implementation. We find that the space requirement can be reduced, compared to previous work, if we selectively record patterns. Still, to make the prefetching scheme generally applicable, a large table is required for storing pointers. We show that storing the prefetch table in a partition of the L2 cache outperforms using the L2 cache conventionally.},
  booktitle = {2008 IEEE International Conference on Computer Design},
  publisher = {IEEE},
  author = {Lim, Chungsoo and Byrd, Gregory T.},
  year = {2008},
  month = oct
}

@inproceedings{chiang_byrd_2008,
  title = {Neighborhood-Aware Density Control in Wireless Sensor Networks},
  url = {http://www.scopus.com/inward/record.url?eid=2-s2.0-50949085022&partnerID=MN8TOARS},
  doi = {10.1109/sutc.2008.44},
  abstractNote = {In dense wireless sensor networks, density control is an important technique for prolonging network's lifetime. However, due to the intrinsic many-to-one communication pattern of sensor networks, nodes close to the sink tend to deplete their energy faster than other nodes. 
This unbalanced energy usage among nodes significantly reduces the network lifetime. In this paper, we propose neighborhood-aware density control (NADC) to alleviate this undesired effect by reducing unnecessary overhearing along routing paths. In NADC, nodes observe their neighborhoods and dynamically adapt their participation in the multihop network topology. Since the neighborhood information can be easily observed through the overheard information, the density in different regions can be adaptively adjusted in a totally distributed manner. Simulation experiments demonstrate that NADC alleviates the extremely unbalanced workload and extends the effective network lifetime without significant increase in data delivery latency.},
  booktitle = {2008 IEEE International Conference on Sensor Networks, Ubiquitous, and Trustworthy Computing (sutc 2008)},
  publisher = {IEEE},
  author = {Chiang, Mu-Huan and Byrd, Gregory T.},
  year = {2008},
  month = jun,
  pages = {122--129}
}

@article{chiang_byrd_2007,
  title = {Zone Repartitioning: A Load-Balancing Mechanism for Data-Centric Storage Systems},
  volume = {2},
  url = {http://www.scopus.com/inward/record.url?eid=2-s2.0-84893362518&partnerID=MN8TOARS},
  doi = {10.1108/17427370780000161},
  abstractNote = {Data‐centric storage is an efficient scheme to store and retrieve event data in sensor networks, but with the multi‐hop routing nature of sensor networks, the communication cost of the home nodes and their neighboring nodes tends to be much higher than the other nodes. These hot‐spots can adversely impact system lifetime by draining off their limited energy rapidly. In this paper, we present Zone‐Repartitioning, a load‐balancing mechanism that reduces the energy consumption of the hot‐spots by distributing their communication load while event frequency is high. The trade‐off between event storage cost and query cost makes Zone Repartitioning a competitive approach in different kinds of applications. 
We compare the performance of Zone Repartitioning against GHT and show that Zone Repartitioning provides better adaptability in various sensor network scenarios.},
  number = {4},
  journal = {International Journal of Pervasive Computing and Communications},
  publisher = {Emerald},
  author = {Chiang, Mu-Huan and Byrd, Gregory T.},
  year = {2007},
  month = sep,
  pages = {312--320}
}

@article{altunay_brown_byrd_dean_2006,
  title = {Collaboration Policies: Access Control Management in Decentralized Heterogeneous Workflows},
  volume = {1},
  url = {http://www.scopus.com/inward/record.url?eid=2-s2.0-79960274089&partnerID=MN8TOARS},
  doi = {10.4304/jsw.1.1.11-22},
  abstractNote = {Service-oriented computing promotes collaboration by defining the standards layer that allows compatibility between disparate domains. Workflows, by taking advantage of the service oriented framework, provide the necessary tools to harness services in order to tackle complicated problems. As a result, a service is no longer exposed to a small pre-determined homogeneous pool of users; instead it has a large, undefined, and heterogeneous pool of users. This paradigm shift in computing results in increased service exposure. The interactions among the services of a workflow must be carefully evaluated against the security risks associated with them. Classical security problems, such as delegation of rights, conflict of interest, and access control in general, become more complicated due to multiple autonomous security domains and the absence of pre- established trust relationships among the domains. Our work tackles these problems in two aspects: it provides a service owner with the necessary means to express and evaluate its trust requirements from a workflow (collaboration policies), and it incorporates these trust requirements into the workflow-planning framework (workflow authorization framework). 
Our policy-based framework allows bilateral peer-level trust evaluations that are based on each peer’s collaboration policies, and incorporates the outcome of these evaluations into the workflow planning logic. As a result, our work provides the necessary tools for promoting multi-party ad-hoc collaborations, and aims to reduce the reluctance and hesitation towards these collaborations by attacking the security risks associated with them.},
  number = {1},
  journal = {Journal of Software},
  publisher = {International Academy Publishing (IAP)},
  author = {Altunay, Mine and Brown, Douglas E. and Byrd, Gregory T. and Dean, Ralph A.},
  year = {2006},
  month = jul,
  pages = {11--22}
}

@inproceedings{lai_byrd_2006,
  title = {High-throughput sketch update on a low-power stream processor},
  url = {http://www.scopus.com/inward/record.url?eid=2-s2.0-34547662261&partnerID=MN8TOARS},
  doi = {10.1145/1185347.1185364},
  abstractNote = {Sketch algorithms are widely used for many networking applications, such as identifying frequent items, top-k flows, and traffic anomalies. This paper explores the implementation of the Count-Min sketch update using Indexed SRF accesses on a SIMD stream processor (Imagine). Both the sketch data structure and the packet stream are modeled as streams, and in-lane accesses to the stream register file (SRF) support concurrent updates without explicit synchronization. The 500-MHz stream processor is capable of supporting sketch update at 10 Gbps throughput for minimum- sized IP packets. This is nearly the same performance as the 1.4-GHz Intel IXP2800 (13 Gbps), using significantly less power (2.89 W vs. 
21 W).},
  booktitle = {Proceedings of the 2006 ACM/IEEE symposium on Architecture for networking and communications systems - ANCS '06},
  publisher = {ACM Press},
  author = {Lai, Yu-Kuen and Byrd, Gregory T.},
  year = {2006},
  pages = {123--132}
}

@comment{The next entry originally reused the key lai_byrd_2006 of the entry closed above. BibTeX and Biber drop an entry whose key repeats an earlier one, so it gets a distinct key; the first occurrence keeps lai_byrd_2006 so existing citations still resolve.}
@inproceedings{lai_byrd_2006_stream_hash,
  title = {Stream-Based Implementation of Hash Functions for Multi-Gigabit Message Authentication Codes},
  url = {http://www.scopus.com/inward/record.url?eid=2-s2.0-38949196729&partnerID=MN8TOARS},
  doi = {10.1109/pdcat.2006.104},
  abstractNote = {Stream processing architectures have been proposed as efficient and flexible platforms for network packet processing. As part of an investigation into stream-based network processors, we have implemented MMH, a family of almost-universal hash functions for message authentication, on a SIMD stream processor (Imagine). The hash computation over an entire packet is a good fit for the stream programming model, with an abundance of producer-consumer locality: hash values are computed and stored in the stream register file (SRF), then used for calculating new hash values repeatedly. By using eight VLIW clusters, the construction is performed in a multi-SIMD fashion, achieving multi-gigabit-per-second throughput with a collision probability on the order of 2~120},
  booktitle = {2006 Seventh International Conference on Parallel and Distributed Computing, Applications and Technologies (PDCAT'06)},
  publisher = {IEEE},
  author = {Lai, Yu-Kuen and Byrd, Gregory},
  year = {2006},
  pages = {150--155}
}

@inproceedings{altunay_brown_byrd_dean_2005,
  title = {Evaluation of Mutual Trust during Matchmaking},
  volume = {2005},
  isbn = {0769523765},
  url = {http://dx.doi.org/10.1109/p2p.2005.9},
  doi = {10.1109/p2p.2005.9},
  abstractNote = {The authors introduced a new service discovery and matchmaking architecture, layered on top of Globus MDS3, that integrates mutual trust evaluations into the matchmaking process. 
The architecture adopts a symmetric approach, and checks trust policies of both grid users and resources without requiring policy disclosures. This approach eliminates run-time security failures arising from incompatible user/resource pairs, seamlessly integrates user-side authorization tools with the matchmaking process, and protects naive grid users by allowing a security principal to define policies that control the list of discoverable resources.},
  booktitle = {Fifth IEEE International Conference on Peer-to-Peer Computing (P2P'05)},
  publisher = {IEEE},
  author = {Altunay, M. and Brown, D. and Byrd, G. and Dean, R. A.},
  year = {2005},
  month = dec,
  pages = {133--140}
}

@comment{Book chapter (the abstract calls it a chapter and the record names a book publisher), so it is typed as incollection with the book title in booktitle rather than as an article with the book title in journal.}
@incollection{rai_lai_byrd_2005,
  title = {Packet processing on a {SIMD} stream processor},
  url = {http://www.scopus.com/inward/record.url?eid=2-s2.0-84882762165&partnerID=MN8TOARS},
  doi = {10.1016/b978-012088476-6/50008-3},
  abstractNote = {This chapter explores the application of stream architectures to packet processing tasks, IPv4 Forwarding and AES encryption in particular. Both applications were run on generic stream architecture (Imagine), and experiments were conducted to characterize the performance of both applications for different configurations of this architecture. For a system clock of 500 MHz, the throughput of the AES encryption in ECB mode varies from 2.02 Gb/s (96-block packets) to 0.8 Gb/s (8-block packets). The IPv4 Forwarding application, with a configuration of one Imagine with 8 clusters, delivered a worst-case performance of around 67 cycles per packet, for a packet trace constructed from the MAE-WEST routing table. Hence, the forwarding engine was able to support packet traffic coming at a rate of OC-48 assuming a clock frequency of 500 MHz. The LRF and memory bandwidth characteristics of the two packet processing applications confirm that they are at two different ends of the application spectrum, with one being memory-intensive and the other being computation intensive. The low SRF characteristics for the two packet processing applications, compared to the Depth media application, is due to the fact that the processing of the packets for both applications is done primarily in one kernel. This results in reduced trips to the SRF between kernels, decreasing the SRF bandwidth utilization. The packet processing applications have comparable metrics to that of media applications indicating that this architecture could be as useful for network applications as it is for media applications.},
  booktitle = {Network Processor Design},
  publisher = {Elsevier},
  author = {Rai, Jathin S. and Lai, Yu-Kuen and Byrd, Gregory T.},
  year = {2005},
  pages = {119--144}
}

@comment{The next entry originally reused the key altunay_brown_byrd_dean_2005 of an earlier entry; it gets a distinct key so BibTeX and Biber do not drop it, and the first occurrence keeps the original key. It is typed incollection because it is a chapter with its own title inside an edited volume named by booktitle. The isbn and issn fields each hold two identifiers as exported.}
@incollection{altunay_brown_byrd_dean_2005_trust_workflow,
  series = {Lecture Notes in Computer Science},
  title = {Trust-Based Secure Workflow Path Construction},
  volume = {3826},
  isbn = {9783540749738 9783540749745},
  issn = {0302-9743 1611-3349},
  url = {http://dx.doi.org/10.1007/11596141_29},
  doi = {10.1007/11596141_29},
  abstractNote = {Security and trust relationships between services significantly govern their willingness to collaborate and participate in a workflow. Existing workflow tools do not consider such relationships as an integral part of their planning logic: rather, they approach security as a run-time issue. We present a workflow management framework that fully integrates trust and security into the workflow planning logic. It considers not only trust relationships between the workflow requestor and individual services, but also trust relationships among the services themselves. It allows each service owner to define an upper layer of collaboration policies (rules that specify the terms under which participation in a workflow is allowed) and integrates them into the planning logic. Services that are unfit for collaboration due to security violations are replaced at the planning stage. 
This approach increases the services owners’ control over the workflow path, their willingness for collaboration, and avoids run-time security failures.},
  booktitle = {Service-Oriented Computing -- {ICSOC} 2007},
  publisher = {Springer Berlin Heidelberg},
  author = {Altunay, M. and Brown, D. and Byrd, G. and Dean, R.},
  editor = {Benatallah, B. and Casati, F. and Traverso, P.},
  year = {2005},
  pages = {382--395}
}

@comment{NOTE(review): the entry closed above gives year 2005 while its booktitle names ICSOC 2007; confirm which proceedings volume is meant.}

@comment{The source record for the next entry has no volume or pages; the special-issue name was stored in the number field and is kept as a note.}
@article{lai_byrd_2004,
  title = {{AES} Packet Encryption on a {SIMD} Stream Architecture},
  note = {Special Issue on Cryptographic Hardware and Embedded Systems},
  journal = {International Journal of Computer Research},
  author = {Lai, Y. and Byrd, G. T.},
  year = {2004}
}

@inproceedings{smith_byrd_wu_xin_thangavelu_wang_shah_2004,
  title = {Dynamic {PKI} and secure tuplespaces for distributed coalitions},
  isbn = {0769518974},
  url = {http://dx.doi.org/10.1109/discex.2003.1194884},
  doi = {10.1109/discex.2003.1194884},
  abstractNote = {The Yalta project has developed a scalable, reliable application platform for distributed coalitions. The key components of this infrastructure are a reliable, secure tuplespace service, an intrusion-tolerant, threshold-based certification authority, and a scalable certificate revocation and event notification service. These components combine to provide a highly transparent security infrastructure for distributed, dynamic coalition applications.},
  booktitle = {Proceedings DARPA Information Survivability Conference and Exposition},
  publisher = {IEEE Comput. Soc},
  author = {Smith, T. J. and Byrd, G. T. and Wu, Xiaoyong and Xin, Hongjie and Thangavelu, K. and Wang, R. and Shah, A.},
  year = {2004},
  month = mar
}

@inproceedings{ibrahim_byrd_2004,
  title = {Extending {OpenMP} to support slipstream execution mode},
  isbn = {0769519261},
  url = {http://dx.doi.org/10.1109/ipdps.2003.1213119},
  doi = {10.1109/ipdps.2003.1213119},
  abstractNote = {OpenMP has emerged as a widely accepted standard for writing shared memory programs. 
Hardware-specific extensions such as data placement are usually needed to improve the scalability of applications based on this standard. This paper investigates the implementation of an OpenMP compiler that supports slipstream execution mode, a new optimization mechanism for CMP-based distributed shared memory multiprocessors. Slipstream mode uses additional processors to reduce communication overhead, rather than to increase parallelism. We discuss how each OpenMP construct can be implemented to take advantage of slipstream mode, and we present a minor extension that allows runtime or compile-time control of slipstream execution. We also investigate the interaction between slipstream mechanisms and OpenMP scheduling. Our implementation supports both static and dynamic scheduling in slipstream mode. We extended the Omni OpenMP compiler to generate binaries that support slipstream mode, and we show the performance of slipstream-enabled codes using OpenMP codes from the NAS Parallel Benchmark suite, running on the SimOS simulator. Our extension to OpenMP allowed the benchmarks to achieve an average performance improvement of 14% with static scheduling. For dynamic scheduling the performance improvement is 12% on average.},
  booktitle = {Proceedings International Parallel and Distributed Processing Symposium},
  publisher = {IEEE Comput. Soc},
  author = {Ibrahim, K. Z. and Byrd, G. T.},
  year = {2004},
  month = mar,
  pages = {10}
}

@inproceedings{smith_byrd_2004,
  title = {{Yalta}: a dynamic {PKI} and secure tuplespaces for distributed coalitions},
  isbn = {0769518974},
  url = {http://dx.doi.org/10.1109/discex.2003.1194913},
  doi = {10.1109/discex.2003.1194913},
  abstractNote = {The Yalta project has developed a scalable, reliable application platform for distributed coalitions. The key components of this infrastructure are a reliable, secure tuplespace service, an intrusion-tolerant, threshold-based certification authority, and a scalable certificate revocation and event notification service. 
These components combine to provide a highly transparent security infrastructure for distributed, dynamic coalition applications.},
  booktitle = {Proceedings DARPA Information Survivability Conference and Exposition},
  publisher = {IEEE Comput. Soc},
  author = {Smith, T. J. and Byrd, G. T.},
  year = {2004},
  month = mar
}

@comment{Book chapter (the abstract calls it a chapter and the record names a book publisher), so it is typed as incollection with the book title in booktitle rather than as an article with the book title in journal.}
@incollection{suryanarayanan_marshall_byrd_2003,
  title = {A Methodology and Simulator for the Study of Network Processors},
  doi = {10.1016/b978-155860875-7.50021-1},
  abstractNote = {Network processors (NPs) are emerging new class of processors that combine programmable ASICs and microprocessors to implement adaptive network services. NPs influence the flexibility of software solutions with the high performance of custom hardware. The development of such sophisticated hardware requires a holistic methodology that can facilitate the study of network processors and their performance with different networking applications and traffic conditions. It is noted that this combination of study techniques is essentially accomplished in the component network simulator (ComNetSim). The simulator includes both a traffic-modeling component and a detailed architectural framework that allows the study of complete networking applications under varying network traffic conditions. The chapter illustrates a weighted round robin scheduling algorithm, adapted to the Toaster architecture. It describes high-level simulator design and details the Toaster network processor and the implementation of the simulator including the cycle-accurate model of the Toaster architecture. The chapter also briefly presents the simulator organization along with performance results and analysis.},
  booktitle = {Network Processor Design},
  publisher = {Elsevier},
  author = {Suryanarayanan, Deepak and Marshall, John and Byrd, Gregory T.},
  year = {2003},
  pages = {27--54}
}

@comment{The citation key of the next entry is left exactly as exported, non-ASCII characters included, so existing citations keep resolving. NOTE(review): non-ASCII keys are not portable to classic BibTeX toolchains; consider adding an ASCII alias.}
@inproceedings{öztürk_trussell_townsend_byrd_mortazavi_baran_conte_o’neal_bilbro_brickley_2003,
  title = {A new introductory laboratory course for electrical and computer engineering},
  url = {http://www.scopus.com/inward/record.url?eid=2-s2.0-8744285585&partnerID=MN8TOARS},
  booktitle = {ASEE Annual Conference Proceedings},
  author = {{\"O}zt{\"u}rk, M. C. and Trussell, J. and Townsend, C. and Byrd, G. and Mortazavi, A. and Baran, M. and Conte, T. and O'Neal, B. and Bilbro, G. and Brickley, J.},
  year = {2003},
  pages = {11378--11391}
}

@article{wang_wang_byrd_2003,
  title = {Design and implementation of Acceptance Monitor for building intrusion tolerant systems},
  volume = {33},
  issn = {0038-0644},
  url = {http://www.scopus.com/inward/record.url?eid=2-s2.0-0242408350&partnerID=MN8TOARS},
  doi = {10.1002/spe.554},
  number = {14},
  journal = {Software: Practice and Experience},
  publisher = {Wiley},
  author = {Wang, R. and Wang, F. Y. and Byrd, G. T.},
  year = {2003},
  month = nov,
  pages = {1399--1417}
}

@comment{Conference paper: the record named a symposium proceedings in the journal field, so it is typed as inproceedings with that name in booktitle. NOTE(review): volume 12 is carried over from the source record unverified.}
@inproceedings{ibrahim_byrd_rotenberg_2003,
  title = {Slipstream execution mode for {CMP}-based multiprocessors},
  volume = {12},
  isbn = {0-7695-1871-0},
  url = {http://www.scopus.com/inward/record.url?eid=2-s2.0-84955465003&partnerID=MN8TOARS},
  doi = {10.1109/hpca.2003.1183536},
  abstractNote = {Scalability of applications on distributed shared-memory (DSM) multiprocessors is limited by communication overheads. At some point, using more processors to increase parallelism yields diminishing returns or even degrades performance. When increasing concurrency is futile, we propose an additional mode of execution, called slipstream mode, that instead enlists extra processors to assist parallel tasks by reducing perceived overheads. We consider DSM multiprocessors built from dual-processor chip multiprocessor (CMP) nodes with shared L2 cache. A task is allocated on one processor of each CMP node. The other processor of each node executes a reduced version of the same task. The reduced version skips shared-memory stores and synchronization, running ahead of the true task. Even with the skipped operations, the reduced task makes accurate forward progress and generates an accurate reference stream, because branches and addresses depend primarily on private data. Slipstream execution mode yields two benefits. First, the reduced task prefetches data on behalf of the true task. Second, reduced tasks provide a detailed picture of future reference behavior, enabling a number of optimizations aimed at accelerating coherence events, e.g., self-invalidation. For multiprocessor systems with up to 16 CMP nodes, slipstream mode outperforms running one or two conventional tasks per CMP in 7 out of 9 parallel scientific benchmarks. Slipstream mode is 12-19% faster with prefetching only and up to 29% faster with self-invalidation enabled.},
  booktitle = {Proceedings of the Ninth International Symposium on High-Performance Computer Architecture},
  publisher = {IEEE Comput. Soc},
  author = {Ibrahim, K. Z. and Byrd, G. T. and Rotenberg, E.},
  year = {2003},
  pages = {179--190}
}

@comment{The exported record for the next entry carried the value 2001-January in its volume field; that is a date stamp rather than a volume number, so the field is omitted.}
@inproceedings{wang_wang_byrd_2002,
  title = {Design and implementation of acceptance monitor for building scalable intrusion tolerant system},
  isbn = {0780371283},
  url = {http://dx.doi.org/10.1109/icccn.2001.956241},
  doi = {10.1109/icccn.2001.956241},
  abstractNote = {Intrusion detection research has so far mostly concentrated on techniques that effectively identify malicious behavior. No assurance can be assumed once the system is compromised. Intrusion tolerance, on the other hand, focuses on providing minimal level of services even when some components have been partially compromised. 
The challenges here are how to take advantage of fault tolerant techniques in the intrusion tolerant system context and how to deal with possible unknown attacks and compromised components so as to continue providing the service. This paper presents our work on applying one important fault tolerance technique, acceptance testing, for building scalable intrusion tolerant systems. First, we propose a general methodology for designing acceptance tests. An acceptance monitor architecture is proposed to apply various tests for detecting compromises based on the impact of the attacks. Second, we make a comprehensive vulnerability analysis on typical commercial-off-the-shelf (COTS) Web servers. Various acceptance testing modules are implemented to show the effectiveness of the proposed approach. By utilizing the fault tolerance techniques on intrusion tolerance system, we provide a mechanism for building reliable distributed services that are more resistant to both known and unknown attacks.}, booktitle={Proceedings Tenth International Conference on Computer Communications and Networks (Cat. No.01EX495)}, publisher={IEEE}, author={Wang, Rong and Wang, Feiyi and Byrd, G.T.}, year={2001}, month={Oct}, pages={200–205} } @inproceedings{stevenson_hillery_byrd_gong_winkelstein_2002, title={Design of a key agile cryptographic system for OC-12c rate ATM}, ISBN={0818670274}, url={http://dx.doi.org/10.1109/ndss.1995.390648}, DOI={10.1109/ndss.1995.390648}, abstractNote={The paper describes an experimental key agile cryptographic system under design at MCNC. The system is compatible with ATM local- and wide-area networks. The system establishes and manages secure connections between hosts in a manner which is transparent to the end users and compatible with existing public network standards. A Cryptographic Unit supports hardware encryption and decryption at the ATM protocol layer. The system is SONET compatible and operates full duplex at the OC-12c rate (622 Mbps). 
Separate encryption keys are negotiated for each secure connection. Each Cryptographic Unit can manage more than 65,000 active secure connections. The Cryptographic Unit can be connected either in a security gateway mode referred to as a 'bump-in-the-fiber' or as a direct ATM host interface. Authentication and access control are implemented through a certificate-based system. The current status of the system is that hardware and software detail designs have been completed. An early version of the key management software has been completed and demonstrated. Hardware fabrication and systems integration are expected to take place over the next several months. Once completed the proof-of concept system will be used to explore issues of privacy, access control and authentication in relation to communications over emerging public networks.}, booktitle={Proceedings of the Symposium on Network and Distributed System Security}, publisher={IEEE Comput. Soc. Press}, author={Stevenson, D. and Hillery, N. and Byrd, G. and Gong, Fengmin and Winkelstein, D.}, year={1995}, month={Feb}, pages={17–30} } @inproceedings{byrd_flynn_2002, title={Effectiveness of producer-initiated communication}, ISBN={0818682558}, url={http://dx.doi.org/10.1109/hicss.1998.649281}, DOI={10.1109/hicss.1998.649281}, abstractNote={Producer-initiated communication mechanisms have been proposed to reduce communication latency in distributed shared memory systems. These mechanisms aim to move data close to its consumers, as soon as it is produced. The data is then available locally when needed by the consumer, avoiding the latency of retrieving it from global memory or from the producer's cache. Studies have shown that these sorts of mechanisms are effective, in that they reduce latency and improve execution time, compared to plain invalidate-based cache coherence. 
It is not clear, however, whether producer-initiated mechanisms provide a significant advantage over prefetch or other consumer-oriented mechanisms designed to hide or reduce latency. The authors look at the published evidence and draw some conclusions.}, booktitle={Proceedings of the Thirty-First Hawaii International Conference on System Sciences}, publisher={IEEE Comput. Soc}, author={Byrd, G.T. and Flynn, M.J.}, year={1998}, month={Jan} } @inproceedings{ibrahim_byrd_2002, title={On the exploitation of value prediction and producer identification to reduce barrier synchronization time}, ISBN={0769509908}, url={http://dx.doi.org/10.1109/ipdps.2001.924981}, DOI={10.1109/ipdps.2001.924981}, abstractNote={Barrier synchronization is a source of inefficiency in many parallel programs, due to the association of many producer-consumer relations in with one synchronization variable. This inefficiency may consume a significant percentage of total execution time, especially as we increase the degree of parallelism while maintaining the problem size. Barrier synchronization wait time can be hidden by speculatively executing instructions after the barrier. The speculative execution must not violate the dependencies imposed by the program. Dependency violation causes rollback, incurring a penalty that may exceed the benefit of speculation. In this work, we investigate how to reduce the probability of rollback through the use of two different techniques: value prediction and producer identification. The first technique tries to break the dependency between the running processes. The second technique tries to respect only true dependencies by transforming the barrier synchronization into per-variable flags. 
Simulation results using scientific benchmarks mostly SPLASH-2, indicate that producer identification promises a greater potential reduction in synchronization time, close to actual dependency, and maintains rollback percentage below 10% for most benchmarks.}, booktitle={Proceedings 15th International Parallel and Distributed Processing Symposium. IPDPS 2001}, publisher={IEEE Comput. Soc}, author={Ibrahim, K.Z. and Byrd, G.T.}, year={2001}, month={Apr} } @inproceedings{byrd_hillery_symon_2001, title={Practical Experiences with ATM Encryption}, annote={bibtex: tc_18002 annote: CV Header: Conference annote: G. T. Byrd, N. Hillery, and J. Symon. “Practical Experiences with ATM Encryption.” Network and Distributed System Security Symposium, pp. 23-32, February 2001}, booktitle={Network and Distributed System Security Symposium}, author={Byrd, G.T. and Hillery, N. and Symon, J.}, year={2001}, month={Feb}, pages={23–32} } @article{byrd_flynn_1999, title={Producer-consumer communication in distributed shared memory multiprocessors}, volume={87}, ISSN={0018-9219}, url={http://dx.doi.org/10.1109/5.747866}, DOI={10.1109/5.747866}, abstractNote={The shared memory abstraction supported by hardware based distributed shared memory (DSM) multiprocessors is an inherently consumer driven means of communication. When a process requires data, it retrieves them from the global shared memory. In distributed cache coherent systems, the data may reside in a remote memory module or in the producer's cache. Producer initiated mechanisms reduce communication latency by sending data to the consumer as soon as they are produced. We classify producer initiated mechanisms as implicit or explicit, according to whether the producer must know the identity of the consumer when data are transmitted. Explicit schemes include data forwarding and message passing. Implicit schemes include update based coherence, selective updates, and cache based locks. 
Several of these mechanisms are evaluated for performance and sensitivity to network parameters, using a common simulated architecture and a set of application kernel benchmarks. StreamLine, a cache based message passing mechanism, provides the best performance on the benchmarks with regular communication patterns. Forwarding write and cache based locks are also among the best performing producer initiated mechanisms. Consumer initiated prefetch, however, has good average performance and is the least expensive to implement.}, number={3}, journal={Proceedings of the IEEE}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Byrd, G.T. and Flynn, M.J.}, year={1999}, month={Mar}, pages={456–466} } @inproceedings{byrd_flynn_1998, title={Effectiveness of producer-initiated communication}, volume={7}, url={http://www.scopus.com/inward/record.url?eid=2-s2.0-0031606155&partnerID=MN8TOARS}, booktitle={Proceedings of the Hawaii International Conference on System Sciences}, author={Byrd, Gregory T. and Flynn, Michael J.}, year={1998}, pages={770–771} } @incollection{byrd_flynn_1998b, title={Evaluation of Communication Mechanisms in Invalidate-based Shared Memory Multiprocessors}, volume={1417}, url={http://www.scopus.com/inward/record.url?eid=2-s2.0-84957398828&partnerID=MN8TOARS}, DOI={10.1007/3-540-69352-1_14}, abstractNote={Producer-initiated mechanisms are added to invalidate-based systems to reduce communication latencies by transferring data as soon as it is produced. This paper compares the performance of three producer-initiated mechanisms: lock, deliver, and StreamLine. All three approaches out-perform invalidate with prefetch in most cases. Cached-based locks offer 10–20% speedup over prefetch for two of the three benchmarks studies. StreamLine performs well in low-bandwidth environments, but does not improve with increased bandwidth. 
Deliver is generally competitive with prefetch, but does not offer a significant performance advantage overall.}, booktitle={Parallel Computer Routing and Communication}, publisher={Springer Berlin Heidelberg}, author={Byrd, Gregory T. and Flynn, Michael J.}, year={1998}, pages={159–170} } @article{byrd_holliday_1995, title={Multithreaded processor architectures}, volume={32}, ISSN={0018-9235}, url={http://dx.doi.org/10.1109/6.402166}, DOI={10.1109/6.402166}, abstractNote={The authors describe how independent streams of instructions, interwoven on a single processor, fill its otherwise idle cycles and so boost its performance. They detail how such multithreaded architectures take the tack of hiding latency by supporting multiple concurrent streams of execution. When a long-latency operation occurs in one of the threads, another begins execution. In this way, useful work is performed while the time-consuming operation is completed.}, number={8}, journal={IEEE Spectrum}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Byrd, G.T. and Holliday, M.A.}, year={1995}, pages={38–46} } @article{stevenson_hillery_byrd_1995, title={Secure communications in ATM networks}, volume={38}, ISSN={0001-0782}, url={http://dx.doi.org/10.1145/204826.204844}, DOI={10.1145/204826.204844}, abstractNote={High-speed networking technology and standards have progressed dramatically in the past few years and much attention is now focused on deployment efforts, such as the North Carolina Information Highway (NCIH) [7], and applications. With this shift in emphasis, concerns have been raised about information security. Examples of abuse of the Internet abound and unfortunately ATM networks are subject to many of these same abuses. 
This is of substantial concern when thinking about extending the reach of public data networking to broad segments of society.}, number={2}, journal={Communications of the ACM}, publisher={Association for Computing Machinery (ACM)}, author={Stevenson, Daniel and Hillery, Nathan and Byrd, Greg}, year={1995}, month={Feb}, pages={45–52} } @inproceedings{byrd_delagi_1991, title={StreamLine: Cache-Based Message Passing in Scalable Multiprocessors}, volume={I}, booktitle={20th International Conference on Parallel Processing}, author={Byrd, G.T. and Delagi, B.A.}, year={1991}, month={Aug}, pages={251–254} } @inproceedings{byrd_saraiya_delagi_1989, title={Multicast Communication in Multiprocessor Systems}, volume={I}, url={http://www.scopus.com/inward/record.url?eid=2-s2.0-0024875963&partnerID=MN8TOARS}, booktitle={18th International Conference on Parallel Processing}, author={Byrd, G.T. and Saraiya, N. and Delagi, B.A.}, year={1989}, month={Aug}, pages={196–200} } @inproceedings{byrd_delagi_1988, place={Boston, MA}, title={A Performance Comparison of Shared Variable vs. Message Passing}, booktitle={Third International Conference on Supercomputing (ICS88)}, author={Byrd, G.T. and Delagi, B.A.}, year={1988} } @inproceedings{delagi_saraiya_nishimura_byrd_1988, place={San Diego, CA}, title={An Instrumented Architectural Simulation System}, annote={bibtex: tc_18074 annote: CV Header: Conference annote: B. A. Delagi, N. Saraiya, S. Nishimura, and G. Byrd. “An Instrumented Architectural Simulation System.” SCS Multiconference on Artificial Intelligence and Simulation, San Diego, CA, pp. 111-120, Feb. 1988}, booktitle={SCS Multiconference on Artificial Intelligence and Simulation}, author={Delagi, B.A. and Saraiya, N. and Nishimura, S. 
and Byrd, G.}, year={1988}, month={Feb}, pages={111–120} } @inproceedings{delagi_saraiya_nishimura_byrd_1988b, place={Boston, MA}, title={Instrumented Architectural Simulation}, booktitle={Third International Conference on Supercomputing (ICS88)}, author={Delagi, B.A. and Saraiya, N. and Nishimura, S. and Byrd, G.}, year={1988} } @inproceedings{delagi_saraiya_byrd_1988, place={Boston, MA}, title={LAMINA: CARE Applications Interface}, booktitle={Third International Conference on Supercomputing (ICS88)}, author={Delagi, B.A. and Saraiya, N.P. and Byrd, G.T.}, year={1988} }