@article{nagabhiru_byrd_2024, title={Achieving Forward Progress Guarantee in Small Hardware Transactions}, volume={23}, ISSN={["1556-6064"]}, url={https://doi.org/10.1109/LCA.2024.3370992}, DOI={10.1109/LCA.2024.3370992}, abstractNote={Hardware-transactional-memory (HTM) manages to pique interest from academia and industry alike because of its potential to ease concurrent-programming without compromising on performance. It offers a simple “all-or-nothing” idea to the programmer, making a piece of code appear atomic in hardware. Despite this and many elegant HTM implementations in research, only best-effort HTM is available commercially. Best-effort HTM lacks forward progress guarantee making it harder for the programmer to create a concurrent scalable fallback path. This has made HTM's adaptability limited. With a scope to support a myriad of applications, HTMs do a trade off between design and verification complexity vs forward progress guarantee. In this letter, we argue that limiting the scope of applications helps HTM attain guaranteed forward progress. We support lock-free programs by using HTM as multi-word-atomics and demonstrate strategic design choices to achieve lock-freedom completely in hardware. We use lfbench, a lock-free micro-benchmark-suite, and Arm's best-effort HTM (ARM_TME) on the gem5 simulator, as our base. We demonstrate the performance tradeoffs between design choices of a deferral-based, NACK-based, and NACK-with-backoff approaches. We show that NACK-with-backoff performs better than the others without compromising scalability for both read- and write-intensive applications.}, number={1}, journal={IEEE COMPUTER ARCHITECTURE LETTERS}, author={Nagabhiru, Mahita and Byrd, Gregory T.}, year={2024}, month={Jan}, pages={53–56} } @article{nagabhiru_byrd_2023, title={lfbench: a lock-free microbenchmark suite}, DOI={10.1109/ISPASS57527.2023.00040}, abstractNote={In this work, we present lfbench: a microbenchmark suite intended as a one-stop shop representing all the popular lock-free data structures. Lock-free programming is very complex and so hard that there hasn’t been a generalized lockfree algorithm designed; instead, lock-free data structures are individually developed and optimized for the specific use-cases. In spite of this difficulty, lock-free programs are indispensable; OS kernel codes, popular databases, networking buffers, and so forth, all rely on lock-free data structures for the performance and scalability they provide. We attempt for the first time to bring all the popular lock-free data structures under one roof, primarily to enable development of new WW semantics needed for easy lock-free programming and help evaluate the same. Additionally, the benchmark suite can be used for:1)Performance analysis of any new S/W algorithms/ libraries developed.2)Building blocks for complex multi-threaded applications.}, journal={2023 IEEE INTERNATIONAL SYMPOSIUM ON PERFORMANCE ANALYSIS OF SYSTEMS AND SOFTWARE, ISPASS}, author={Nagabhiru, Mahita and Byrd, Greg}, year={2023}, pages={322–324} } @article{regan_eastwood_nagabhiru_mueller_2019, title={Automatically Translating Quantum Programs from a Subset of Common Gates to an Adiabatic Representation}, volume={11497}, ISBN={["978-3-030-21499-9"]}, ISSN={["1611-3349"]}, DOI={10.1007/978-3-030-21500-2_9}, abstractNote={Adiabatic computing with two degrees of freedom of 2-local Hamiltonians has been theoretically shown to be equivalent to the gate model of universal quantum computing. But today’s quantum annealers, namely D-Wave’s 2000Q platform, only provide a 2-local Ising Hamiltonian abstraction with a single degree of freedom. This raises the question what subset of gate programs can be expressed as quadratic unconstrained binary problems (QUBOs) on the D-Wave. The problem is of interest because gate-based quantum platforms are currently limited to 20 qubits while D-Wave provides 2,000 qubits. However, when transforming entire gate circuits into QUBOs, additional qubits will be required. The objective of this work is to determine a subset of quantum gates suitable for transformation into single-degree 2-local Ising Hamiltonians under a common qubit base representation such that they comprise a compound circuit suitable for pure quantum computation, i.e., without having to switch between classical and quantum computing for different bases. To this end, this work contributes, for the first time, a fully automated method to translate quantum gate circuits comprised of a subset of common gates expressed as an IBM Qiskit program to single-degree 2-local Ising Hamiltonians, which are subsequently embedded in the D-Wave 2000Q chimera graph. These gate elements are placed in the chimera graph and augmented by constraints that enforce inter-gate logical relationships, resulting in an annealer embedding that completely characterizes the overall gate circuit. Annealer embeddings for several example quantum gate circuits are then evaluated on D-Wave 2000Q hardware.}, journal={REVERSIBLE COMPUTATION (RC 2019)}, author={Regan, Malcolm and Eastwood, Brody and Nagabhiru, Mahita and Mueller, Frank}, year={2019}, pages={146–161} }