@article{khetawat_jain_bhatele_mueller_2024, title={Predicting GPUDirect Benefits for HPC Workloads}, ISBN={["979-8-3503-6308-1"]}, ISSN={["1066-6192"]}, DOI={10.1109/PDP62718.2024.00020}, journal={2024 32ND EUROMICRO INTERNATIONAL CONFERENCE ON PARALLEL, DISTRIBUTED AND NETWORK-BASED PROCESSING, PDP 2024}, author={Khetawat, Harsh and Jain, Nikhil and Bhatele, Abhinav and Mueller, Frank}, year={2024}, pages={88–97} } @article{zhang_mueller_2022, title={CLAIRE: Enabling Continual Learning for Real-time Autonomous Driving with a Dual-head Architecture}, ISSN={["2770-162X"]}, DOI={10.1109/ISORC52572.2022.9812816}, abstractNote={Autonomous vehicles rely on a pre-trained object detector to perceive surroundings. However, when never seen before scenarios are encountered, late decisions may result in hard braking due to perceived threats. Image sequences leading to such a situation provide the potential to learn and improve over time. Yet instant re-training on board with all prior training data is infeasible given computational, storage and power constraints. What’s more, exposure of a pre-trained CNN to only images of the new scenario is known to result in “catastrophic forgetting” for already learned features. This work makes several contributions: A novel lightweight dual-head detection network architecture is proposed to overcome forgetting and to support fast on-board continual learning on small sets of new images and assesses the feasibility of continual learning methods for autonomous driving. A sensitivity study on the quality and quantity of continually learned images for our dual-head technique is performed, including an assessment of its real-time suitability. Experiments show that our method’s accuracy is improved by up to 13% and performance increases by 5.8X over a state-of-the-art continual learning framework. This makes it suitable for autonomous driving scenarios with real-time constraints. Source code is made available via Github.}, journal={2022 IEEE 25TH INTERNATIONAL SYMPOSIUM ON REAL-TIME DISTRIBUTED COMPUTING (ISORC 2022)}, author={Zhang, Hao and Mueller, Frank}, year={2022}, pages={51–60} } @article{wilson_mueller_pakin_2022, title={Combining Hard and Soft Constraints in Quantum Constraint-Satisfaction Systems}, ISSN={["2167-4329"]}, DOI={10.1109/SC41404.2022.00018}, abstractNote={This work presents a generalization of NchooseK, a constraint satisfaction system designed to target both quantum circuit devices and quantum annealing devices. Previously, NchooseK supported only hard constraints, which made it suitable for expressing problems in NP (e.g., 3-SAT) but not NP-hard problems (e.g., minimum vertex cover). In this paper we show how support for soft constraints can be added to the model and implementation, broadening the classes of problems that can be expressed elegantly in NchooseK without sacrificing portability across different quantum devices. Through a set of examples, we argue that this enhanced version of NchooseK enables problems to be expressed in a more concise, less error-prone manner than if these problems were encoded manually for quantum execution.
We include an empirical evaluation of performance, scalability, and fidelity on both a large IBM Q system and a large D-Wave system.}, journal={SC22: INTERNATIONAL CONFERENCE FOR HIGH PERFORMANCE COMPUTING, NETWORKING, STORAGE AND ANALYSIS}, author={Wilson, Ellis and Mueller, Frank and Pakin, Scott}, year={2022} } @article{cucinotta_mueller_simmhan_2022, title={Guest editorial: Special issue on the 2020 IEEE symposium on real-time distributed computing (ISORC)}, volume={124}, ISSN={["1873-6165"]}, DOI={10.1016/j.sysarc.2022.102437}, journal={JOURNAL OF SYSTEMS ARCHITECTURE}, author={Cucinotta, Tommaso and Mueller, Frank and Simmhan, Yogesh}, year={2022}, month={Mar} } @article{behera_wan_mueller_wolf_klasky_2022, title={P-ckpt: Coordinated Prioritized Checkpointing}, ISSN={["1530-2075"]}, DOI={10.1109/IPDPS53621.2022.00049}, abstractNote={Good prediction accuracy and adequate lead time to failure are key to the success of failure-aware Checkpoint/Restart (C/R) models on current and future large-scale High-Performance Computing (HPC) systems. This paper develops a novel checkpointing technique, called p-ckpt, that aims to maintain the performance efficiency of failure-aware C/R models even when failures are predicted with a small lead time. The p-ckpt technique is developed for HPC systems with multi-level memory systems to prioritize checkpoints from vulnerable nodes (nodes with predicted failure) in the event of failure prediction. It applies coordination among the nodes within an application so that vulnerable nodes' checkpoint data is stored to the Parallel File System (PFS) first by assigning priorities based on the lead time to failure. Vulnerable nodes thus have low-latency access on the critical path to the PFS before any failure happens. Further, we create the hybrid p-ckpt model by integrating Live Migration (LM) because of its cost-effectiveness and to reduce checkpoint frequency. Our hybrid p-ckpt C/R model considers prediction lead time and checkpoint latency to the PFS to decide on a feasible proactive action such as p-ckpt and LM. Simulations of six real-world applications for the Summit supercomputer show a ≈53-65% reduction in overhead due to the hybrid p-ckpt model compared to a ≈31-61% reduction in a state-of-the-art solution. We assess our C/R models against multiple failure distributions and consider lead time variability and failure prediction accuracy. Based on this evaluation and assessment, we discuss the trade-offs of using these models and their impact on application overhead.}, journal={2022 IEEE 36TH INTERNATIONAL PARALLEL AND DISTRIBUTED PROCESSING SYMPOSIUM (IPDPS 2022)}, author={Behera, Subhendu and Wan, Lipeng and Mueller, Frank and Wolf, Matthew and Klasky, Scott}, year={2022}, pages={436–446} } @article{mcdonald_mueller_2022, title={T-SYS: Timed-Based System Security for Real-Time Kernels}, ISSN={["2375-8317"]}, DOI={10.1109/ICCPS54341.2022.00029}, abstractNote={The increasing proliferation of cyber-physical systems in a multitude of applications presents a pressing need for effective methods of securing such devices. Many such systems are subject to tight timing constraints, which are poorly suited to traditional security methods due to the large runtime overhead and execution time variation introduced. However, the regular (and well documented) timing specifications of real-time systems open up new avenues with which such systems can be secured.
This paper contributes T-SYS, a timed-system method of detecting intrusions into real-time systems via timing anomalies. A prototype implementation of T-SYS is integrated into a commercial real-time operating system (RTOS) in order to demonstrate its feasibility. Further, a compiler-based tool is developed to realize a T-SYS implementation with elastic timing bounds. This tool supports integration of T-SYS protection into applications as well as the RTOS kernel itself. Results on an ARM hardware platform with benchmark tasks including those drawn from an open-source UAV code base compare T-SYS with another method of timing-based intrusion detection and assess its effectiveness in terms of detecting attacks as they intrude a system.}, journal={2022 13TH ACM/IEEE INTERNATIONAL CONFERENCE ON CYBER-PHYSICAL SYSTEMS (ICCPS 2022)}, author={McDonald, Brayden and Mueller, Frank}, year={2022}, pages={247–258} } @article{bahmani_xing_krishnan_ray_mueller_alavi_tsao_snyder_pan_2021, title={Hummingbird: efficient performance prediction for executing genomic applications in the cloud}, volume={37}, ISSN={["1460-2059"]}, DOI={10.1093/bioinformatics/btab161}, number={17}, journal={BIOINFORMATICS}, author={Bahmani, Amir and Xing, Ziye and Krishnan, Vandhana and Ray, Utsab and Mueller, Frank and Alavi, Amir and Tsao, Philip S. and Snyder, Michael P. and Pan, Cuiping}, year={2021}, month={Sep}, pages={2537–2543} } @article{wilson_mueller_pakin_2021, title={Mapping Constraint Problems onto Quantum Gate and Annealing Devices}, DOI={10.1109/QCS54837.2021.00016}, abstractNote={This work presents NchooseK, a unified programming model for constraint satisfaction problems that can be mapped to both quantum circuit and annealing devices through Quadratic Unconstrained Binary Operators (QUBOs). Our mapping provides an approachable and effective way to program both types of quantum computers. We provide examples of NchooseK being used.}, journal={PROCEEDINGS OF SECOND INTERNATIONAL WORKSHOP ON QUANTUM COMPUTING SOFTWARE (QCS 2021)}, author={Wilson, Ellis and Mueller, Frank and Pakin, Scott}, year={2021}, pages={110–117} } @article{pan_mueller_2021, title={NUMA-aware memory coloring for multicore real-time systems}, volume={118}, ISSN={["1873-6165"]}, DOI={10.1016/j.sysarc.2021.102188}, abstractNote={Non-uniform memory access (NUMA) systems are characterized by varying memory latencies so that execution times may become unpredictable in a multicore real-time system. This results in overly conservative scheduling with low utilization due to loose bounds on a task's worst-case execution time (WCET). This work contributes a controller/node-aware memory coloring (CAMC) allocator inside the Linux kernel for the entire address space to reduce access conflicts and latencies by isolating tasks from one another. CAMC improves timing predictability and performance over Linux' buddy allocator and prior coloring methods. It provides core isolation with respect to banks and memory controllers for real-time systems.
This work is the first to consider multiple memory controllers in real-time systems, combine them with bank coloring, and assess its performance on a NUMA architecture, to the best of our knowledge.}, journal={JOURNAL OF SYSTEMS ARCHITECTURE}, author={Pan, Xing and Mueller, Frank}, year={2021}, month={Sep} } @article{fustero_palmtag_mueller_2021, title={Quantum Annealing Stencils with Applications to Fuel Loading of a Nuclear Reactor}, DOI={10.1109/QCE52317.2021.00044}, abstractNote={A method for mapping quadratic unconstrained binary optimizations expressed as nearest neighbor stencils onto contemporary quantum annealing machines is developed. The method is shown to be scalable in providing higher utilization of annealing hardware resources than prior work. Applying the technique to the problem of determining an effective fuel loading pattern for nuclear reactors shows that densely mapped quantum stencils result in higher fidelity solutions of optimization problems than the sparser default solutions. These results are likely to generalize to quadratic unconstrained binary optimizations that can be expressed as dense quantum stencils, thereby improving optimization results obtained from noisy quantum devices.}, journal={2021 IEEE INTERNATIONAL CONFERENCE ON QUANTUM COMPUTING AND ENGINEERING (QCE 2021) / QUANTUM WEEK 2021}, author={Fustero, Joseph and Palmtag, Scott and Mueller, Frank}, year={2021}, pages={265–275} } @article{das_mueller_rountree_2021, title={Systemic Assessment of Node Failures in HPC Production Platforms}, ISSN={["1530-2075"]}, DOI={10.1109/IPDPS49936.2021.00035}, abstractNote={Production HPC clusters endure failures reducing computational capability and resource availability. Despite the presence of various failure prediction schemes for large-scale computing systems, a comprehensive understanding of how nodes fail considering various components and layers of the system is required for sustained resilience. This work performs a holistic diagnosis of node failures using a measurement-driven approach on contemporary system logs that can help vendors and system administrators support exascale resilience. Our work shows that external environmental influence is not strongly correlated with node failures in terms of the root cause. Though hardware and software faults trigger failures, the underlying root cause often lies in the application malfunctioning causing the system to fail. Furthermore, lead time enhancements are feasible for nodes showing fail slow characteristics. This study excavates such helpful empirical observations, which could facilitate better failure handling in production systems.}, journal={2021 IEEE 35TH INTERNATIONAL PARALLEL AND DISTRIBUTED PROCESSING SYMPOSIUM (IPDPS)}, author={Das, Anwesha and Mueller, Frank and Rountree, Barry}, year={2021}, pages={267–276} } @article{mittal_mueller_2021, title={T-Pack: Timed Network Security for Real Time Systems}, ISSN={["2375-5261"]}, DOI={10.1109/ISORC52013.2021.00014}, abstractNote={Network communication between real-time control systems raises system vulnerability to malware attacks over the network. Such attacks not only result in alteration of system behavior but also incur timing dilation due to executing injected code or, in case of network attacks, to dropped, added, rerouted, or modified packets. This work proposes to detect intrusion based on time dilation induced by time delays within the network potentially resulting in system malfunctioning due to missed deadlines.
A new method of timed packet protection, T-Pack, analyzes end-to-end transmission times of packets and detects a compromised system or network based on deviation of observed time from the expected time on end nodes, well in advance of a task's deadline. First, the Linux network stack is extended with timing information maintained within the kernel and further embedded within packets for TCP and UDP communication. Second, real-time application scenarios are analyzed in terms of their susceptibility to malware attacks. Results are evaluated on a distributed system of embedded platforms running a Preempt RT Linux kernel to demonstrate its real-time capabilities.}, journal={2021 IEEE 24TH INTERNATIONAL SYMPOSIUM ON REAL-TIME DISTRIBUTED COMPUTING (ISORC 2021)}, author={Mittal, Swastik and Mueller, Frank}, year={2021}, pages={20–28} } @article{das_mueller_rountree_2020, title={Aarohi: Making Real-Time Node Failure Prediction Feasible}, ISSN={["1530-2075"]}, DOI={10.1109/IPDPS47924.2020.00115}, abstractNote={Large-scale production systems are well known to encounter node failures, which affect compute capacity and energy. Both in HPC systems and enterprise data centers, combating failures is becoming challenging with increasing hardware and software complexity. Several data mining solutions of logs have been investigated in the context of anomaly detection in such systems. However, with subsequent proactive failure mitigation, the existing log mining solutions are not sufficiently fast for real-time anomaly detection. Machine learning (ML)-based training can produce high accuracy but the inference scheme needs to be enhanced with rapid parsers to assess anomalies in real-time. This work tackles online anomaly prediction in computing systems by exploiting context free grammar-based rapid event analysis. We present our framework Aarohi, which describes an effective way to predict failures online. Aarohi is designed to be generic and scalable making it suitable as a real-time predictor. Aarohi obtains more than 3 minutes lead times to node failures with an average of 0.31 msecs prediction time for a chain length of 18. The overall improvement obtained w.r.t. the existing state-of-the-art is over a factor of 27.4×. Our compiler-based approach provides new research directions for lead time optimization with a significant prediction speedup required for the deployment of proactive fault tolerant solutions in practice.}, journal={2020 IEEE 34TH INTERNATIONAL PARALLEL AND DISTRIBUTED PROCESSING SYMPOSIUM IPDPS 2020}, author={Das, Anwesha and Mueller, Frank and Rountree, Barry}, year={2020}, pages={1092–1101} } @article{wang_yu_qiu_jin_mueller_2020, title={BarrierFinder: recognizing ad hoc barriers}, volume={25}, ISSN={["1573-7616"]}, DOI={10.1007/s10664-020-09862-3}, number={6}, journal={EMPIRICAL SOFTWARE ENGINEERING}, author={Wang, Tao and Yu, Xiao and Qiu, Zhengyi and Jin, Guoliang and Mueller, Frank}, year={2020}, month={Nov}, pages={4676–4706} } @article{wilson_singh_mueller_2020, title={Just-in-time Quantum Circuit Transpilation Reduces Noise}, DOI={10.1109/QCE49297.2020.00050}, abstractNote={Running quantum programs is fraught with challenges on today's noisy intermediate scale quantum (NISQ) devices. Many of these challenges originate from the error characteristics that stem from rapid decoherence and noise during measurement, qubit connections, crosstalk, the qubits themselves, and transformations of qubit state via gates.
Not only are qubits not “created equal”, but their noise level also changes over time. IBM is said to calibrate their quantum systems once per day and reports noise levels (errors) at the time of such calibration. This information is subsequently used to map circuits to higher quality qubits and connections up to the next calibration point. This work provides evidence that there is room for improvement over this daily calibration cycle. It contributes a technique to measure noise levels (errors) related to qubits immediately before executing one or more sensitive circuits and shows that just-in-time noise measurements can benefit late physical qubit mappings. With this just-in-time recalibrated transpilation, the fidelity of results is improved over IBM's default mappings, which only use their daily calibrations. The framework assesses two major sources of noise, namely readout errors (measurement errors) and two-qubit gate/connection errors. Experiments indicate that the accuracy of circuit results improves by 3–304% on average and up to 400% with on-the-fly circuit mappings based on error measurements just prior to application execution.}, journal={IEEE INTERNATIONAL CONFERENCE ON QUANTUM COMPUTING AND ENGINEERING (QCE20)}, author={Wilson, Ellis and Singh, Sudhakar and Mueller, Frank}, year={2020}, pages={345–355} } @inproceedings{patil_mueller_ionkov_lee_lang_2020, place={New York}, title={Symbiotic HW Cache and SW DTLB Prefetching for DRAM/NVM Hybrid Memory}, ISBN={9781728192383}, url={http://dx.doi.org/10.1109/MASCOTS50786.2020.9285963}, DOI={10.1109/MASCOTS50786.2020.9285963}, abstractNote={The introduction of NVDIMM memory devices has encouraged the use of DRAM/NVM based hybrid memory systems to increase the memory-per-core ratio in compute nodes and obtain possible energy and cost benefits. However, Non-Volatile Memory (NVM) is slower than DRAM in terms of read/write latency. This difference in performance will adversely affect memory-bound applications. Traditionally, data prefetching at the hardware level has been used to increase the number of cache hits to mitigate performance degradation. However, software (SW) prefetching has not been used effectively to reduce the effects of high memory access latencies. Also, the current cache hierarchy and hardware (HW) prefetching are not optimized for a hybrid memory system. We hypothesize that HW and SW prefetching can complement each other in placing data in caches and the Data Translation Look-aside Buffer (DTLB) prior to their references, and by doing so adaptively, highly varying access latencies in a DRAM/NVM hybrid memory system are taken into account. This work contributes an adaptive SW prefetch method based on the characterization of read/write/unroll prefetch distances for NVM and DRAM. Prefetch performance is characterized via custom benchmarks based on STREAM2 specifications in a multicore MPI runtime environment and compared to the performance of the standard SW prefetch pass in GCC. Furthermore, the effects of HW prefetching on kernels executing on hybrid memory system are evaluated. Experimental results indicate that SW prefetching targeted to populate the DTLB results in up to 26% performance improvement when symbiotically used in conjunction with HW prefetching, as opposed to only HW prefetching.
Based on our findings, changes to GCC's prefetch-loop-arrays compiler pass are proposed to take advantage of DTLB prefetching in a hybrid memory system for kernels that are frequently used in HPC applications.}, booktitle={2020 28th International Symposium on Modeling, Analysis, and Simulation of Computer and Telecommunication Systems (MASCOTS)}, publisher={IEEE}, author={Patil, Onkar and Mueller, Frank and Ionkov, Latchesar and Lee, Jason and Lang, Michael}, year={2020}, month={Nov}, pages={1–8} } @article{ferriter_mueller_bahmani_pan_2020, title={VCFC: Structural and Semantic Compression and Indexing of Genetic Variant Data}, ISSN={["2156-1133"]}, DOI={10.1109/BIBM49941.2020.9313221}, abstractNote={Personalized genomic datasets are growing in size at an accelerating pace presenting a dilemma between the need for fast retrieval requiring “near data” and cost of storage, which decreases for “distant media” with larger capacity but longer access time. Instead of database technology, the bioinformatics community has developed an industry standard for compressing and indexing of genetic variant files that store the difference between a person's genome and a human reference genome. These standardizations rely on generic data compression schemes. This work contributes novel domain-specific compression and indexing algorithms that retain the structure and semantics of genetic variation data while supporting common query patterns. A line-based run-length partial compression technique for variant genotype data using a novel indexing strategy is developed and shown to perform well on large sample sets compared to the industry standard. The evaluation over genomic datasets indicates compression at a comparable size for our data representation while resulting in speedup of ≈2X in indexed queries compared to the industry standard. This underlines that our representation could replace existing standards resulting in reduced computational cost at equivalent storage size.}, journal={2020 IEEE INTERNATIONAL CONFERENCE ON BIOINFORMATICS AND BIOMEDICINE}, author={Ferriter, Kyle and Mueller, Frank and Bahmani, Amir and Pan, Cuiping}, year={2020}, pages={200–203} } @article{regan_eastwood_nagabhiru_mueller_2019, title={Automatically Translating Quantum Programs from a Subset of Common Gates to an Adiabatic Representation}, volume={11497}, ISBN={["978-3-030-21499-9"]}, ISSN={["1611-3349"]}, DOI={10.1007/978-3-030-21500-2_9}, abstractNote={Adiabatic computing with two degrees of freedom of 2-local Hamiltonians has been theoretically shown to be equivalent to the gate model of universal quantum computing. But today's quantum annealers, namely D-Wave's 2000Q platform, only provide a 2-local Ising Hamiltonian abstraction with a single degree of freedom. This raises the question what subset of gate programs can be expressed as quadratic unconstrained binary problems (QUBOs) on the D-Wave. The problem is of interest because gate-based quantum platforms are currently limited to 20 qubits while D-Wave provides 2,000 qubits. However, when transforming entire gate circuits into QUBOs, additional qubits will be required. The objective of this work is to determine a subset of quantum gates suitable for transformation into single-degree 2-local Ising Hamiltonians under a common qubit base representation such that they comprise a compound circuit suitable for pure quantum computation, i.e., without having to switch between classical and quantum computing for different bases.
To this end, this work contributes, for the first time, a fully automated method to translate quantum gate circuits comprised of a subset of common gates expressed as an IBM Qiskit program to single-degree 2-local Ising Hamiltonians, which are subsequently embedded in the D-Wave 2000Q chimera graph. These gate elements are placed in the chimera graph and augmented by constraints that enforce inter-gate logical relationships, resulting in an annealer embedding that completely characterizes the overall gate circuit. Annealer embeddings for several example quantum gate circuits are then evaluated on D-Wave 2000Q hardware.}, journal={REVERSIBLE COMPUTATION (RC 2019)}, author={Regan, Malcolm and Eastwood, Brody and Nagabhiru, Mahita and Mueller, Frank}, year={2019}, pages={146–161} } @article{wang_yu_qiu_jin_mueller_2019, title={BARRIERFINDER: Recognizing Ad Hoc Barriers}, ISSN={["1063-6773"]}, DOI={10.1109/ICSME.2019.00049}, abstractNote={Ad hoc synchronizations are pervasive in multi-threaded programs. Due to their diversity and complexity, understanding the enforced synchronization relationships of ad hoc synchronizations is challenging but crucial to multi-threaded program development and maintenance. Existing techniques can partially detect primitive ad hoc synchronizations, but they cannot recognize complete implementations or infer the enforced synchronization relationships. In this paper, we propose a framework to automatically identify complex ad hoc synchronizations in full and infer their synchronization relationships. We instantiate the framework with a tool called BarrierFinder, which features various techniques, including program slicing and bounded symbolic execution, to efficiently explore the interleaving space of ad hoc synchronizations within multi-threaded programs and collect execution traces. BarrierFinder then uses these traces to characterize ad hoc synchronizations into different types with a focus on recognizing barriers. Our evaluation shows that BarrierFinder is both effective and efficient in doing this, and BarrierFinder is also helpful for programmers to understand the correctness of their implemented ad hoc synchronizations.}, journal={2019 IEEE INTERNATIONAL CONFERENCE ON SOFTWARE MAINTENANCE AND EVOLUTION (ICSME 2019)}, author={Wang, Tao and Yu, Xiao and Qiu, Zhengyi and Jin, Guoliang and Mueller, Frank}, year={2019}, pages={323–327} } @article{rezaei_khetawat_patil_mueller_hargrove_roman_2019, title={End-to-End Resilience for HPC Applications}, volume={11501}, ISBN={["978-3-030-20655-0"]}, ISSN={["1611-3349"]}, DOI={10.1007/978-3-030-20656-7_14}, abstractNote={A plethora of resilience techniques have been investigated to protect application kernels. If, however, such techniques are combined and they interact across kernels, new vulnerability windows are created. This work contributes the idea of end-to-end resilience by protecting windows of vulnerability between kernels guarded by different resilience techniques. It introduces the live vulnerability factor (LVF), a new metric that quantifies any lack of end-to-end protection for a given data structure. The work further promotes end-to-end application protection across kernels via a pragma-based specification for diverse resilience schemes with minimal programming effort. 
This lifts the data protection burden from application programmers, allowing them to focus solely on algorithms and performance while resilience is specified and subsequently embedded into the code through the compiler/library and supported by the runtime system. In experiments with case studies and benchmarks, end-to-end resilience has an overhead over kernel-specific resilience of less than 3% on average and increases protection against bit flips by a factor of three to four.}, journal={HIGH PERFORMANCE COMPUTING, ISC HIGH PERFORMANCE 2019}, author={Rezaei, Arash and Khetawat, Harsh and Patil, Onkar and Mueller, Frank and Hargrove, Paul and Roman, Eric}, year={2019}, pages={271–290} } @article{wang_jain_beckingsale_boehme_mueller_gamblin_2019, title={FuncyTuner: Auto-tuning Scientific Applications With Per-loop Compilation}, ISSN={["0190-3918"]}, DOI={10.1145/3337821.3337842}, abstractNote={The de facto compilation model for production software compiles all modules of a target program with a single set of compilation flags, typically O2 or O3. Such a per-program compilation strategy may yield sub-optimal executables since programs often have multiple hot loops with diverse code structures and may be better optimized with a per-region compilation model that assembles an optimized executable by combining the best per-region code variants. In this paper, we demonstrate that a naïve greedy approach to per-region compilation often degrades performance in comparison to the O3 baseline. To overcome this problem, we contribute a novel per-loop compilation framework, FuncyTuner, which employs lightweight profiling to collect per-loop timing information, and then utilizes a space-focusing technique to construct a performant executable. Experimental results show that FuncyTuner can reliably improve performance of modern scientific applications on several multi-core architectures by 9.2% to 12.3% and 4.5% to 10.7% (geometric mean, up to 22% on certain programs) in comparison to the O3 baseline and prior work, respectively.}, journal={PROCEEDINGS OF THE 48TH INTERNATIONAL CONFERENCE ON PARALLEL PROCESSING (ICPP 2019)}, author={Wang, Tao and Jain, Nikhil and Beckingsale, David and Boehme, David and Mueller, Frank and Gamblin, Todd}, year={2019} } @article{khetawat_atrey_li_mueller_pakin_2019, title={Implementing NChooseK on IBM Q Quantum Computer Systems}, volume={11497}, ISBN={["978-3-030-21499-9"]}, ISSN={["1611-3349"]}, DOI={10.1007/978-3-030-21500-2_13}, abstractNote={This work contributes a generalized model for quantum computation called NChooseK. NChooseK is based on a single parametrized primitive suitable to express a variety of problems that cannot be solved efficiently using classical computers but may admit an efficient quantum solution. We implement a code generator that, given arbitrary parameters for N and K, generates code suitable for execution on IBM Q quantum hardware. We assess the performance of the code generator, limitations in the size of circuit depth and number of gates, and propose optimizations.
We identify future work to improve efficiency and applicability of the NChooseK model.}, journal={REVERSIBLE COMPUTATION (RC 2019)}, author={Khetawat, Harsh and Atrey, Ashlesha and Li, George and Mueller, Frank and Pakin, Scott}, year={2019}, pages={209–223} } @inproceedings{patil_ionkov_lee_mueller_lang_2019, title={Performance characterization of a DRAM-NVM hybrid memory architecture for HPC applications using Intel Optane DC persistent memory modules}, ISBN={9781450372060}, url={http://dx.doi.org/10.1145/3357526.3357541}, DOI={10.1145/3357526.3357541}, abstractNote={Non-volatile, byte-addressable memory (NVM) has been introduced by Intel in the form of NVDIMMs named Intel® Optane™ DC PMM. This memory module has the ability to persist the data stored in it without the need for power. This expands the memory hierarchy into a hybrid memory system due to the differences in access latency and memory bandwidth from DRAM, which has been the predominant byte-addressable main memory technology. The Optane DC memory modules have up to 8x the capacity of DDR4 DRAM modules which can expand the byte-address space up to 6 TB per node. Many applications can now scale up their problem size given such a memory system. We evaluate the capabilities of this DRAM-NVM hybrid memory system and its impact on High Performance Computing (HPC) applications. We characterize the Optane DC in comparison to DDR4 DRAM with a STREAM-like custom benchmark and measure the performance for HPC mini-apps like VPIC, SNAP, LULESH and AMG under different configurations of Optane DC PMMs. We find that Optane-only executions are slower in terms of execution time than DRAM-only and Memory-mode executions by a minimum of 2 to 16% for VPIC and maximum of 6x for LULESH.}, booktitle={Proceedings of the International Symposium on Memory Systems - MEMSYS '19}, publisher={ACM Press}, author={Patil, Onkar and Ionkov, Latchesar and Lee, Jason and Mueller, Frank and Lang, Michael}, year={2019} } @article{mueller_byrd_dreher_2019, title={Programming Quantum Computers: A Primer with IBM Q and D-Wave Exercises}, DOI={10.1145/3293883.3302578}, abstractNote={This tutorial provides a hands-on introduction to quantum computing. It will feature the three pillars: architectures, programming, and algorithms/applications of quantum computing. Its focus is on the applicability of problems to quantum computing from a practical point, with only the necessary foundational coverage of the physics and theoretical aspects to understand quantum computing. Simulation software will be utilized, complemented by access to actual quantum computers to prototype problem solutions. This should develop a better understanding of how problems are transformed into quantum algorithms and what programming language support is best suited for a given application area. As a first of its kind, to the best of our knowledge, the tutorial includes hands-on programming experience with IBM Q and D-Wave hardware.}, journal={PROCEEDINGS OF THE 24TH SYMPOSIUM ON PRINCIPLES AND PRACTICE OF PARALLEL PROGRAMMING (PPOPP '19)}, author={Mueller, Frank and Byrd, Greg and Dreher, Patrick}, year={2019}, pages={451–451} } @article{pan_mueller_2019, title={The Colored Refresh Server for DRAM}, ISSN={["1555-0885"]}, DOI={10.1109/ISORC.2019.00015}, abstractNote={Bounding each task's worst-case execution time (WCET) accurately is essential for real-time systems to determine if all deadlines can be met.
Yet, access latencies to Dynamic Random Access Memory (DRAM) vary significantly due to DRAM refresh, which blocks access to memory cells. Variations further increase as DRAM density grows. This work contributes the “Colored Refresh Server” (CRS), a uniprocessor scheduling paradigm that partitions DRAM in two distinctly colored groups such that refreshes of one color occur in parallel to the execution of real-time tasks of the other color. By executing tasks in phase with periodic DRAM refreshes with opposing colors, memory requests no longer suffer from refresh interference. Experimental results confirm that refresh overhead is completely hidden and memory throughput enhanced.}, journal={2019 IEEE 22ND INTERNATIONAL SYMPOSIUM ON REAL-TIME DISTRIBUTED COMPUTING (ISORC 2019)}, author={Pan, Xing and Mueller, Frank}, year={2019}, pages={27–34} } @article{pan_mueller_2019, title={The Colored Refresh Server for DRAM}, ISSN={["1052-8725"]}, DOI={10.1109/RTSS46320.2019.00023}, abstractNote={Bounding each task’s worst-case execution time (WCET) accurately is essential for real-time systems to determine if all deadlines can be met. Yet, access latencies to Dynamic Random Access Memory (DRAM) vary significantly due to DRAM refresh, which blocks access to memory cells. Variations further increase as DRAM density grows. This work contributes the "Colored Refresh Server" (CRS), a uniprocessor scheduling paradigm that partitions DRAM in two distinctly colored groups such that refreshes of one color occur in parallel to the execution of real-time tasks of the other color. By executing tasks in phase with periodic DRAM refreshes with opposing colors, memory requests no longer suffer from refresh interference. Experimental results confirm that refresh overhead is completely hidden and memory throughput enhanced.}, journal={2019 IEEE 40TH REAL-TIME SYSTEMS SYMPOSIUM (RTSS 2019)}, author={Pan, Xing and Mueller, Frank}, year={2019}, pages={146–153} } @article{gholkar_mueller_rountree_2019, title={Uncore Power Scavenger: A Runtime for Uncore Power Conservation on HPC Systems}, DOI={10.1145/3295500.3356150}, abstractNote={The US Department of Energy (DOE) has set a power target of 20-30MW on the first exascale machines. To achieve one exaflop under this power constraint, it is necessary to minimize wasteful consumption of power while striving to improve performance. Toward this end, we investigate uncore frequency scaling (UFS) as a potential knob for reducing the power footprint of HPC jobs. We propose Uncore Power Scavenger (UPSCavenger), a runtime system that dynamically detects phase changes and automatically sets the best uncore frequency for every phase to save power without significant impact on performance. Our experimental evaluations on a cluster show that UPSCavenger achieves up to 10% energy savings with under 1% slowdown. It achieves 14% energy savings with the worst case slowdown of 5.5%. 
We also show that UPSCavenger achieves up to 20% speedup and proportional energy savings compared to Intel's RAPL with equivalent power usage making it a viable solution even for power-constrained computing.}, journal={PROCEEDINGS OF SC19: THE INTERNATIONAL CONFERENCE FOR HIGH PERFORMANCE COMPUTING, NETWORKING, STORAGE AND ANALYSIS}, author={Gholkar, Neha and Mueller, Frank and Rountree, Barry}, year={2019} } @article{qian_mueller_2018, title={A Failure Recovery Protocol for Software-Defined Real-Time Networks}, volume={37}, ISSN={["1937-4151"]}, DOI={10.1109/TCAD.2018.2857299}, abstractNote={In a distributed computing environment, real-time tasks communicate via a network infrastructure whose stability significantly impacts timing predictability. Network stability includes two aspects. First, the network has to guarantee the deadline requirements of real-time message transmissions in the absence of network failures. Second, the network needs to support dynamic recovery when network failures occur. This paper generalizes previous static routing approaches, which address the first aspect of the network stability, by developing a dynamic failure recovery policy and a protocol to address the second aspect of the network stability. We derive new real-time forwarding paths without compromising the capability of network devices to guarantee deadlines of concurrent real-time transmissions. We implement this mechanism on a network simulation platform and evaluate it on real hardware in a local cluster to demonstrate its feasibility and effectiveness. Experiments confirm the ability to bound recovery delays based on the network parameters.}, number={11}, journal={IEEE TRANSACTIONS ON COMPUTER-AIDED DESIGN OF INTEGRATED CIRCUITS AND SYSTEMS}, author={Qian, Tao and Mueller, Frank}, year={2018}, month={Nov}, pages={2222–2232} } @article{bahmani_mueller_2018, title={Chameleon: Online Clustering of MPI Program Traces}, ISSN={["1530-2075"]}, DOI={10.1109/IPDPS.2018.00119}, abstractNote={The data explosion in scientific computing applications continues to fuel increasing demand for computational power. Understanding application behavior in this context becomes essential to determine shortcomings, e.g., by collecting detailed information with tracing toolsets. This work considers parallel applications using the SPMD (single program multiple data) paradigm that relies on iterative kernels. This characteristic provides an opportunity to empower tracing toolsets with effective machine learning algorithms. One solution is to cluster processes with the same behavior into a group. Instead of collecting performance information from each individual process, this information can be collected from just a set of lead processes, i.e., one lead process per group. This work, called Chameleon, contributes an online, fast, and scalable signature-based clustering algorithm. Unlike related work, namely ScalaTrace V2, that generates the compressed global trace within MPI_Finalize, Chameleon creates this trace incrementally during the execution of applications and only for lead processes. Chameleon also identifies different program phases, clusters processes exhibiting different execution behavior, and creates a compressed global trace file on-the-fly, all incrementally at interim execution points of applications. 
The resulting system combines low overhead at the clustering level with a lower time complexity of O(log P) than prior work.}, journal={2018 32ND IEEE INTERNATIONAL PARALLEL AND DISTRIBUTED PROCESSING SYMPOSIUM (IPDPS)}, author={Bahmani, Amir and Mueller, Frank}, year={2018}, pages={1102–1112} } @article{kukreti_mueller_2018, title={CloneHadoop: Process Cloning to Reduce Hadoop's Long Tail}, DOI={10.1109/BDCAT.2018.00011}, abstractNote={Recent advances in distributed computing have enabled large-scale data processing on high volumes of data with MapReduce. However, the overall runtime of such applications is dependent on the slowest subtask at the tail end of the work queue. Current approaches mitigate this effect by scheduling redundant speculative attempts of straggler tasks. However, speculative attempts have to recompute work already done by the original task. This often prevents late speculations from completing before the straggling task. This work promotes a novel speculation approach via process cloning to avoid redundant computations transparent to users. But speculation requires consistency between task attempts and results in divergent execution of subtasks. The work contributes (1) a process cloning approach for speculative execution, (2) mechanisms to maintain consistency and recover complex components, and (3) an integration of cloning and recovery into Apache Hadoop with optimizations to alleviate resource bottlenecks. In experiments, cloning benefits straggling tasks with runtime reductions of up to 25% on jobs with larger tasks. Our solution scales with different task and problem sizes and is robust across different runtime distributions of straggler tasks.}, journal={2018 IEEE/ACM 5TH INTERNATIONAL CONFERENCE ON BIG DATA COMPUTING APPLICATIONS AND TECHNOLOGIES (BDCAT)}, author={Kukreti, Sarthak and Mueller, Frank}, year={2018}, pages={11–20} } @article{damschen_mueller_henkel_2018, title={Co-Scheduling on Fused CPU-GPU Architectures With Shared Last Level Caches}, volume={37}, ISSN={["1937-4151"]}, DOI={10.1109/TCAD.2018.2857042}, abstractNote={Fused CPU-GPU architectures integrate a CPU and general-purpose GPU on a single die. Recent fused architectures even share the last level cache (LLC) between CPU and GPU. This enables hardware-supported byte-level coherency. Thus, CPU and GPU can execute computational kernels collaboratively, but novel methods to co-schedule work are required. This paper contributes three dynamic co-scheduling methods. Two of our methods implement workers that autonomously acquire work from a common set of independent work items (similar to bag-of-tasks scheduling). The third method, host-side profiling, uses a fraction of the total work of a kernel to determine a ratio of how to distribute work to CPU and GPU based on profiling. The resulting ratio is used for the following executions of the same kernel. Our methods are realized using OpenCL 2.0, which introduces fine-grained shared virtual memory (SVM) to allocate coherent memory between CPU and GPU. We port the Rodinia Benchmark Suite, a standard suite for heterogeneous computing, to fine-grained SVM and fused CPU-GPU architectures (Rodinia-SVM). We evaluate the overhead of fine-grained SVM and analyze the suitability of OpenCL 2.0’s new features for co-scheduling. Our host-side profiling method performs competitively to the optimal choice of executing kernels either on CPU or GPU (hypothetical xor-Oracle).
On average, it achieves 97% of xor-Oracle's performance and a 1.43× speedup over using the GPU alone (standard in Rodinia). We show, however, that in most cases it is not beneficial to split the work of a kernel between CPU and GPU compared to exclusively running it on the most suitable single compute device. For a fixed amount of work per device, cache-related stalls can increase by up to 1.75× when both devices are used in parallel instead of exclusively while cache misses remain the same. Thus, not the cost of cache conflicts, but inefficient cache coherence is a major performance bottleneck for current fused CPU-GPU Intel architectures with shared LLC.}, number={11}, journal={IEEE TRANSACTIONS ON COMPUTER-AIDED DESIGN OF INTEGRATED CIRCUITS AND SYSTEMS}, author={Damschen, Marvin and Mueller, Frank and Henkel, Joerg}, year={2018}, month={Nov}, pages={2337–2347} } @article{pan_mueller_2018, title={Controller-Aware Memory Coloring for Multicore Real-Time Systems}, DOI={10.1145/3167132.3167196}, abstractNote={Memory latencies vary in non-uniform memory access (NUMA) systems so that execution times may become unpredictable in a multicore real-time system. This results in overly conservative scheduling with low utilization due to loose bounds on the worst-case execution time (WCET) of tasks. This work contributes a controller/node-aware memory coloring (CAMC) allocator inside the Linux kernel for the entire address space to reduce access conflicts and latencies by isolating tasks from one another. CAMC improves timing predictability and performance over Linux' buddy allocator and prior coloring methods. It provides core isolation with respect to banks and memory controllers for real-time systems. To our knowledge, this work is first to consider multiple memory controllers in real-time systems, combine them with bank coloring, and assess its performance on a NUMA architecture.}, journal={33RD ANNUAL ACM SYMPOSIUM ON APPLIED COMPUTING}, author={Pan, Xing and Mueller, Frank}, year={2018}, pages={584–592} } @article{das_mueller_siegel_vishnu_2018, title={Desh: Deep Learning for System Health Prediction of Lead Times to Failure in HPC}, DOI={10.1145/3208040.3208051}, abstractNote={Today's large-scale supercomputers encounter faults on a daily basis. Exascale systems are likely to experience even higher fault rates due to increased component count and density. Triggering resilience-mitigating techniques remains a challenge due to the absence of well defined failure indicators. System logs consist of unstructured text that obscures essential system health information contained within. In this context, efficient failure prediction via log mining can enable proactive recovery mechanisms to increase reliability. This work aims to predict node failures that occur in supercomputing systems via long short-term memory (LSTM) networks that exploit recurrent neural networks (RNNs). Our framework, Desh (Deep Learning for System Health), diagnoses and predicts failures with short lead times. Desh identifies failure indicators with enhanced training and classification for generic applicability to logs from operating systems and software components without the need to modify any of them.
Desh uses a novel three-phase deep learning approach to (1) train to recognize chains of log events leading to a failure, (2) re-train chain recognition of events augmented with expected lead times to failure, and (3) predict lead times during testing/inference deployment to predict which specific node fails in how many minutes. Desh obtains as high as 3 minutes average lead time with no less than 85% recall and 83% accuracy to take proactive actions on the failing nodes, which could be used to migrate computation to healthy nodes.}, journal={HPDC '18: PROCEEDINGS OF THE 27TH INTERNATIONAL SYMPOSIUM ON HIGH-PERFORMANCE PARALLEL AND DISTRIBUTED COMPUTING}, author={Das, Anwesha and Mueller, Frank and Siegel, Charles and Vishnu, Abhinav}, year={2018}, pages={40–51} } @article{das_iyengar_mueller_2018, title={KeyValueServe: Design and performance analysis of a multi-tenant data grid as a cloud service}, volume={30}, ISSN={["1532-0634"]}, DOI={10.1002/cpe.4424}, number={14}, journal={CONCURRENCY AND COMPUTATION-PRACTICE & EXPERIENCE}, author={Das, Anwesha and Iyengar, Arun and Mueller, Frank}, year={2018}, month={Jul} } @article{gholkar_mueller_rountree_marathe_2018, title={PShifter: Feedback-based Dynamic Power Shifting within HPC Jobs for Performance}, DOI={10.1145/3208040.3208047}, abstractNote={The US Department of Energy (DOE) has set a power target of 20-30MW on the first exascale machines. To achieve one exaFLOPS under this power constraint, it is necessary to manage power intelligently while maximizing performance. Most production-level parallel applications suffer from computational load imbalance across distributed processes due to non-uniform work decomposition. Other factors like manufacturing variation and thermal variation in the machine room may amplify this imbalance. As a result of this imbalance, some processes of a job reach the blocking calls, collectives or barriers earlier and wait for others to reach the same point. This waiting results in a wastage of energy and CPU cycles which degrades application efficiency and performance. We address this problem for power-limited jobs via Power Shifter (PShifter), a dual-level, feedback-based mechanism that intelligently and automatically detects such imbalance and reduces it by dynamically re-distributing a job's power budget across processors to improve the overall performance of the job compared to a naïve uniform power distribution across nodes. In contrast to prior work, PShifter ensures that a given power budget is not violated. At the bottom level of PShifter, local agents monitor and control the performance of processors by actuating different power levels. They reduce power from the processors that incur substantial wait times. At the top level, the cluster agent that has the global view of the system, monitors the job's power consumption and provides feedback on the unused power, which is then distributed across the processors of the same job. Our evaluation on an Intel cluster shows that PShifter achieves performance improvement of up to 21% and energy savings of up to 23% compared to uniform power allocation, outperforms static approaches by up to 40% and 22% for codes with and without phase changes, respectively, and outperforms dynamic schemes by up to 19%.
To the best of our knowledge, PShifter is the first approach to transparently and automatically apply power capping non-uniformly across processors of a job in a dynamic manner adapting to phase changes.}, journal={HPDC '18: PROCEEDINGS OF THE 27TH INTERNATIONAL SYMPOSIUM ON HIGH-PERFORMANCE PARALLEL AND DISTRIBUTED COMPUTING}, author={Gholkar, Neha and Mueller, Frank and Rountree, Barry and Marathe, Aniruddha}, year={2018}, pages={106–117} } @article{xu_mueller_2018, title={Work-In-Progress: Making Machine Learning Real-Time Predictable}, ISSN={["1052-8725"]}, DOI={10.1109/RTSS.2018.00029}, abstractNote={Machine learning (ML) on edge computing devices is becoming popular in the industry as a means to make control systems more intelligent and autonomous. The new trend is to utilize embedded edge devices, as they boast higher computational power and larger memories than before, to perform ML tasks that had previously been limited to cloud-hosted deployments. In this work, we assess the real-time predictability and consider data privacy concerns by comparing traditional cloud services with edge-based ones for certain data analytics tasks. We identify the subset of ML problems appropriate for edge devices by investigating if they result in real-time predictable services for a set of widely used ML libraries. We specifically enhance the Caffe library to make it more suitable for real-time predictability. We then deploy ML models with high accuracy scores on an embedded system, exposing it to industry sensor data from the field, to demonstrate its efficacy and suitability for real-time processing.}, journal={2018 39TH IEEE REAL-TIME SYSTEMS SYMPOSIUM (RTSS 2018)}, author={Xu, Hang and Mueller, Frank}, year={2018}, pages={157–160} } @article{rezaei_mueller_hargrove_roman_2017, title={DINO: Divergent node cloning for sustained redundancy in HPC}, volume={109}, ISSN={["1096-0848"]}, DOI={10.1016/j.jpdc.2017.06.010}, abstractNote={Complexity and scale of next generation HPC systems pose significant challenges in fault resilience methods such that contemporary checkpoint/restart (C/R) methods that address fail-stop behavior may be insufficient. Redundant computing has been proposed as an alternative at extreme scale. Triple redundancy has an advantage over C/R in that it can also detect silent data corruption (SDC) and then correct results via voting. However, current redundant computing approaches do not repair failed or corrupted replicas. Consequently, SDCs can no longer be detected after a replica failure since the system has been degraded to dual redundancy without voting capability. Hence, a job may have to be aborted if voting uncovers mismatching results between the remaining two replicas. And while replicas are logically equivalent, they may have divergent runtime states during job execution, which presents a challenge to simply creating new replicas dynamically. This problem is addressed by DIvergent NOde cloning (DINO), a redundant execution environment that quickly recovers from hard failures. DINO consists of a novel node cloning service integrated into the MPI runtime system that solves the problem of consolidating divergent states among replicas on-the-fly. With DINO, after degradation to dual redundancy, a good replica can be quickly cloned so that triple redundancy is restored. We present experimental results over 9 NAS Parallel Benchmarks (NPB), Sweep3D and LULESH.
Results confirm the applicability of the approach and the correctness of the recovery process and indicate that DINO can recover from failures nearly instantly. The cloning overhead depends on the process image size that needs to be transferred between source and destination of the clone operation and varies between 5.60 and 90.48 s. Simulation results with our model show that dual redundancy with DINO recovery always outperforms 2x and surpasses 3x redundancy on up to 1 million nodes. To the best of our knowledge, the design and implementation for repairing failed replicas in redundant MPI computing is unprecedented.}, journal={JOURNAL OF PARALLEL AND DISTRIBUTED COMPUTING}, author={Rezaei, Arash and Mueller, Frank and Hargrove, Paul and Roman, Eric}, year={2017}, month={Nov}, pages={350–362} } @article{luo_mueller_carns_jenkins_latham_ross_snyder_2017, title={ScalaIOExtrap: Elastic I/O Tracing and Extrapolation}, ISSN={["1530-2075"]}, DOI={10.1109/ipdps.2017.45}, abstractNote={Today's rapid development of supercomputers has caused I/O performance to become a major performance bottleneck for many scientific applications. Trace analysis tools have thus become vital for diagnosing root causes of I/O problems. This work contributes an I/O tracing framework with (a) techniques to gather a set of lossless, elastic I/O trace files for a small number of nodes, (b) a mathematical model to analyze trace data and extrapolate it to a larger number of nodes, and (c) a replay engine for the extrapolated trace file to verify its accuracy. The traces can in principle be extrapolated even beyond the scale of present-day systems and provide a test of whether applications scale in terms of I/O. We conducted our experiments on three platforms: a commodity Linux cluster, an IBM BG/Q system, and a discrete event simulation of an IBM BG/P system. We investigate a combination of synthetic benchmarks on all platforms as well as a production scientific application on the BG/Q system. The extrapolated I/O trace replays closely resemble the I/O behavior of equivalent applications in all cases.}, journal={2017 31ST IEEE INTERNATIONAL PARALLEL AND DISTRIBUTED PROCESSING SYMPOSIUM (IPDPS)}, author={Luo, Xiaoqing and Mueller, Frank and Carns, Philip and Jenkins, Jonathan and Latham, Robert and Ross, Robert and Snyder, Shane}, year={2017}, pages={585–594} } @article{bahmani_mueller_2017, title={Scalable communication event tracing via clustering}, volume={109}, ISSN={["1096-0848"]}, DOI={10.1016/j.jpdc.2017.06.008}, abstractNote={Communication traces help developers of high-performance computing (HPC) applications understand and improve their codes. When run on large-scale HPC facilities, the scalability of tracing tools becomes a challenge. To address this problem, traces can be clustered into groups of processes that exhibit similar behavior. Instead of collecting trace information of each individual node, it then suffices to collect a trace of a small set of representative nodes, namely one per cluster. However, clustering algorithms themselves need to have low overhead, be scalable, and adapt to application characteristics. We devised an adaptive clustering algorithm for large-scale applications called ACURDION that traces the MPI communication of code with O(log P) time complexity. First, ACURDION identifies the parameters that differ across processes by using a logarithmic algorithm called Adaptive Signature Building. Second, it clusters the processes based on those parameters.
Experiments show that collecting traces of just nine nodes/clusters suffices to capture the communication behavior of all nodes for a wide set of HPC benchmark codes while retaining sufficient accuracy of trace events and parameters. In summary, ACURDION improves trace scalability and automation over prior approaches.}, journal={JOURNAL OF PARALLEL AND DISTRIBUTED COMPUTING}, author={Bahmani, Amir and Mueller, Frank}, year={2017}, month={Nov}, pages={230–244} } @article{gholkar_mueller_rountree_2016, title={A Power-aware Cost Model for HPC Procurement}, ISSN={["2164-7062"]}, DOI={10.1109/ipdpsw.2016.35}, abstractNote={With the supercomputing community headed toward the era of exascale computing, power has become one of the foremost concerns. Today's fastest supercomputer, Tianhe-2, already consumes 17.8MW to achieve a peak performance of 33.86PFlops [1]. At least an order of magnitude improvement in performance while maintaining the power envelope is required for exascale. Yet, manufacturing variations are increasingly creating a heterogeneous computing environment, even when identical processing components are deployed, particularly when operating under a controlled power ceiling. This work contributes a procurement model to aid in the design of a capability system that achieves maximum performance while considering manufacturing variations. It appropriately partitions a single, compound system budget into the CAPEX (infrastructure cost) and the OPEX (operating power cost). Early results indicate that aggressive infrastructure procurement disregarding such operational needs can lead to severe performance degradation, or significant hidden operating cost will be incurred after procurement.}, journal={2016 IEEE 30TH INTERNATIONAL PARALLEL AND DISTRIBUTED PROCESSING SYMPOSIUM WORKSHOPS (IPDPSW)}, author={Gholkar, Neha and Mueller, Frank and Rountree, Barry}, year={2016}, pages={1110–1113} } @inproceedings{qian_xu_zhang_chakrabortty_mueller_xin_2016, title={A resilient software infrastructure for wide-area measurement systems}, DOI={10.1109/pesgm.2016.7741949}, abstractNote={To support the scalability and resilience requirements of distributed Wide-Area Measurement System (WAMS) architectures, we design and implement a software infrastructure to estimate power grid oscillation modes based on real-time data collected from Phasor Measurement Units (PMUs). This estimation algorithm can be deployed on a hierarchical structure of Phasor Data Concentrators (PDCs), which calculate local estimates and communicate with each other to calculate the global estimate. This work contributes a resilient system to WAMS with guarantees for (1) Quality of Service in network delay, (2) network failure tolerance, and (3) self-recoverability. The core component of the infrastructure is a distributed storage system. Externally, the storage system provides a cloud data lookup service with bounded response times and resilience, which decouples the data communication between PMUs, PDCs, and power-grid monitor/control applications. Internally, the storage system organizes PDCs as storage nodes and employs a real-time task scheduler to order data lookup requests so that urgent requests can be served earlier.
To demonstrate the resilience of our distributed system, we deploy the system on a (1) virtual platform and (2) bare-metal machines, where we run a distributed algorithm on the basis of the Prony algorithm and the Alternating Directions Method of Multipliers (ADMM) to estimate the electro-mechanical oscillation modes. We inject different failures into the system to study their impact on the estimation algorithm. Our experiments show that temporary failures of a PDC or a network link do not affect the estimation result since the historical PMU data are cached in the storage system and PDCs can obtain the data on demand.}, booktitle={2016 ieee power and energy society general meeting (pesgm)}, author={Qian, T. and Xu, H. and Zhang, J. H. and Chakrabortty, Aranya and Mueller, F. and Xin, Y. F.}, year={2016} } @article{lagadapati_mueller_engelmann_2016, title={Benchmark Generation and Simulation at Extreme Scale}, ISSN={["1550-6525"]}, DOI={10.1109/ds-rt.2016.18}, abstractNote={The path to extreme scale high-performance computing (HPC) poses several challenges related to power, performance, resilience, productivity, programmability, data movement, and data management. Investigating the performance of parallel applications at scale on future architectures and the performance impact of different architectural choices is an important component of HPC hardware/software co-design. Simulations using models of future HPC systems and communication traces from applications running on existing HPC systems can offer an insight into the performance of future architectures. This work targets technology developed for scalable application tracing of communication events. It focuses on extreme-scale simulation of HPC applications and their communication behavior via lightweight parallel discrete event simulation for performance estimation and evaluation. Instead of simply replaying a trace within a simulator, this work promotes the generation of a benchmark from traces. This benchmark is subsequently exposed to simulation using models to reflect the performance characteristics of future-generation HPC systems. This technique provides a number of benefits, such as eliminating the data intensive trace replay and enabling simulations at different scales. The presented work features novel software co-design aspects, combining the ScalaTrace tool to generate scalable trace files, the ScalaBenchGen tool to generate the benchmark, and the xSim tool to assess the benchmark characteristics within a simulator.}, journal={2016 IEEE/ACM 20TH INTERNATIONAL SYMPOSIUM ON DISTRIBUTED SIMULATION AND REAL TIME APPLICATIONS (DS-RT)}, author={Lagadapati, Mahesh and Mueller, Frank and Engelmann, Christian}, year={2016}, pages={9–18} } @inbook{ramachandran_mueller_2016, title={Distributed Job Allocation for Large-Scale Manycores}, ISBN={9783319413204 9783319413211}, ISSN={0302-9743 1611-3349}, url={http://dx.doi.org/10.1007/978-3-319-41321-1_21}, DOI={10.1007/978-3-319-41321-1_21}, abstractNote={Contemporary operating systems heavily rely on single system images with shared memory constructs that may not scale well to large core counts. We consider the challenge of distributed job allocation, where each job is comprised of a set of tasks to be mapped to disjoint cores. A naive solution performing fragmented allocations may quickly escalate to deadlocks, where jobs hold and wait for cores in circular dependencies. To tackle these challenges, we propose a deadlock free distributed job allocation protocol. 
We have devised two policies for avoiding deadlocks, namely active cancellation and sequencer-based atomic broadcast. The protocol and the two policies have been implemented and evaluated on a Tilera TilePro64 processor with 64 cores on a single socket. Results show sparse job allocations to incur lower overhead for active cancellation while sequencer-based atomic broadcast has less overhead for denser allocations.}, booktitle={Lecture Notes in Computer Science}, publisher={Springer International Publishing}, author={Ramachandran, Subramanian and Mueller, Frank}, year={2016}, pages={404–425} } @inproceedings{ramachandran_mueller_2016, title={Distributed job allocation for large-scale manycores}, volume={9697}, booktitle={High performance computing}, author={Ramachandran, S. and Mueller, F.}, year={2016}, pages={404–425} } @inbook{yagna_patil_mueller_2016, title={Efficient and Predictable Group Communication for Manycore NoCs}, ISBN={9783319413204 9783319413211}, ISSN={0302-9743 1611-3349}, url={http://dx.doi.org/10.1007/978-3-319-41321-1_20}, DOI={10.1007/978-3-319-41321-1_20}, abstractNote={Massive manycore embedded processors with network-on-chip (NoC) architectures are becoming common. These architectures provide higher processing capability due to an abundance of cores. They provide native core-to-core communication that can be exploited via message passing to provide system scalability. Despite these advantages, manycores pose predictability challenges that can affect both performance and real-time capabilities. In this work, we develop efficient and predictable group communication using message passing specifically designed for large core counts in 2D mesh NoC architectures. We have implemented the most commonly used collectives in such a way that they incur low latency and high timing predictability making them suitable for balanced parallelization of scalable high-performance and embedded/real-time systems alike. Experimental results on a single-die 64 core hardware platform show that our collectives can significantly reduce communication times by up to 95 % for single packet messages and up to 98 % for longer messages with superior performance for sometimes all message sizes and sometimes only small message sizes depending on the group primitive. In addition, our communication primitives have significantly lower variance than prior approaches, thereby providing more balanced parallel execution progress and better real-time predictability.}, booktitle={Lecture Notes in Computer Science}, publisher={Springer International Publishing}, author={Yagna, Karthik and Patil, Onkar and Mueller, Frank}, year={2016}, pages={383–403} } @inproceedings{yagna_patil_mueller_2016, title={Efficient and predictable group communication for manycore NoCs}, volume={9697}, booktitle={High performance computing}, author={Yagna, K. and Patil, O. and Mueller, F.}, year={2016}, pages={383–403} } @article{bahmani_mueller_2016, title={Efficient clustering for ultra-scale application tracing}, volume={98}, ISSN={["1096-0848"]}, DOI={10.1016/j.jpdc.2016.08.001}, abstractNote={Extreme-scale computing poses a number of challenges to application performance. Developers need to study application behavior by collecting detailed information with the help of tracing toolsets to determine shortcomings. 
But not only applications are "scalability challenged", current tracing toolsets also fall short of exascale requirements for low background overhead since trace collection for each execution entity is becoming infeasible. One effective solution is to cluster processes with the same behavior into groups. Instead of collecting performance information from each individual node, this information can be collected from just a set of representative nodes. This work contributes a fast, scalable, signature-based clustering algorithm that clusters processes exhibiting similar execution behavior. Instead of prior work based on statistical clustering, our approach produces precise results nearly without loss of events or accuracy. The proposed algorithm combines low overhead at the clustering level with log(P) time complexity, and it splits the merge process to make tracing suitable for extreme-scale computing. Overall, this multi-level precise clustering based on signatures further generalizes to a novel multi-metric clustering technique with unprecedented low overhead.}, journal={JOURNAL OF PARALLEL AND DISTRIBUTED COMPUTING}, author={Bahmani, Amir and Mueller, Frank}, year={2016}, month={Dec}, pages={25–39} } @article{elliott_hoemmen_mueller_2016, title={Exploiting data representation for fault tolerance}, volume={14}, ISSN={1877-7503}, url={http://dx.doi.org/10.1016/J.JOCS.2015.12.002}, DOI={10.1016/J.JOCS.2015.12.002}, abstractNote={Incorrect computer hardware behavior may corrupt intermediate computations in numerical algorithms, possibly resulting in incorrect answers. Prior work models misbehaving hardware by randomly flipping bits in memory. We start by accepting this premise, and present an analytic model for the error introduced by a bit flip in an IEEE 754 floating-point number. We then relate this finding to the linear algebra concepts of normalization and matrix equilibration. In particular, we present a case study illustrating that normalizing both vector inputs of a dot product minimizes the probability of a single bit flip causing a large error in the dot product's result. Furthermore, the absolute error is either less than one or very large, which allows detection of large errors. Then, we apply this to the GMRES iterative solver. We count all possible errors that can be introduced through faults in arithmetic in the computationally intensive orthogonalization phase of GMRES, and show that when the matrix is equilibrated, the absolute error is bounded above by one.}, journal={Journal of Computational Science}, publisher={Elsevier BV}, author={Elliott, J. and Hoemmen, M. and Mueller, F.}, year={2016}, month={May}, pages={51–60} } @article{fiala_mueller_ferreira_2016, title={FlipSphere: A Software-based DRAM Error Detection and Correction Library for HPC}, ISSN={["1550-6525"]}, DOI={10.1109/ds-rt.2016.27}, abstractNote={Proposed exascale systems will present considerable challenges. In particular, DRAM soft-errors, or bit-flips, are expected to greatly increase due to higher memory densities and near-threshold voltages. To address this challenge, we introduce FlipSphere, a tunable, transparent silent data corruption detection and correction library for HPC applications that is first in its class to use hardware accelerators, such as the Intel Xeon Phi MIC, to increase application resilience. 
FlipSphere provides comprehensive silent data corruption protection for application memory by implementing on-demand page integrity verification coupled with a software-based error correcting code that allows for automatic error recovery. Using this framework, we demonstrate the trade-offs of dedicating hardware resources for resilience, showing that up to 90% of memory may be protected with a 40% slowdown.}, journal={2016 IEEE/ACM 20TH INTERNATIONAL SYMPOSIUM ON DISTRIBUTED SIMULATION AND REAL TIME APPLICATIONS (DS-RT)}, author={Fiala, David and Mueller, Frank and Ferreira, Kurt B.}, year={2016}, pages={19–28} } @inproceedings{chandru_mueller_2016, title={Hybrid MPI/OpenMP programming on the Tilera manycore architecture}, DOI={10.1109/hpcsim.2016.7568353}, abstractNote={This work assesses the viability of different programming models for large-scale manycores using an MPI-like abstraction, the vendor's OpenMP, and a combination (hybrid) of both. Experiments with Tilera's TilePro64 demonstrate that MPI and OpenMP both scale while the hybrid model performs worse than the others. Further, network-on-chip contention significantly affects performance and its variance, especially if the number of utilized cores is high. These findings provide an early insight into trends for single die/chip solutions with large numbers of cores, which will become mainstream in HPC in the coming years. Prior work lacks such a study on large manycores with efficient native on-chip message passing on shared memory on the same hardware platform.}, booktitle={2016 International Conference on High Performance Computing & Simulation (HPCS 2016)}, author={Chandru, V. and Mueller, F.}, year={2016}, pages={326–333} } @inproceedings{das_mueller_gu_iyengar_2016, title={Performance analysis of a multi-tenant in-memory data grid}, DOI={10.1109/cloud.2016.0144}, abstractNote={Distributed key-value stores have become indispensable for large-scale, low-latency applications. Many cloud services have deployed in-memory data grids for their enterprise infrastructures and support multi-tenancy services. But it is still difficult to provide consistent performance to all tenants for fluctuating workloads that need to scale out. Many popular key-value stores suffer from performance problems at scale and under differing tenant requirements. To this end, we present our study with Hazelcast, a popular open source data grid, and provide insights into contention and performance bottlenecks. Through experimental analysis, this paper uncovers scenarios of performance degradation followed by optimized performance via end-point multiplexing. Our study suggests that processing an increasing number of client requests while spawning fewer threads helps improve performance.}, booktitle={Proceedings of 2016 ieee 9th international conference on cloud computing (cloud)}, author={Das, A. and Mueller, F. and Gu, X. H. and Iyengar, A.}, year={2016}, pages={956–959} } @inbook{chandru_mueller_2016, title={Reducing NoC and Memory Contention for Manycores}, ISBN={9783319306940 9783319306957}, ISSN={0302-9743 1611-3349}, url={http://dx.doi.org/10.1007/978-3-319-30695-7_22}, DOI={10.1007/978-3-319-30695-7_22}, abstractNote={Platforms consisting of many computing cores have become the mainstream in high performance computing, general-purpose computing and, lately, embedded systems.
Such systems provide increased processing power and system availability, but often impose latencies and contention for memory accesses as multiple cores try to reference data at the same time. This may result in sub-optimal performance unless special allocation policies are employed. On a multi-processor board with 50 or more processing cores, the NoC (Network On Chip) adds to this challenge. This work evaluates the impact of bank-aware and controller-aware allocation on NoC contention. Experiments show that targeted memory allocation results in reduced execution times and NoC contention, the latter of which has not been studied before at this scale.}, booktitle={Architecture of Computing Systems – ARCS 2016}, publisher={Springer International Publishing}, author={Chandru, Vishwanathan and Mueller, Frank}, year={2016}, pages={293–305} } @inproceedings{leon_smith_oates_miles_2016, title={Sensitivity analysis for a quantum informed ferroelectric energy model}, DOI={10.1115/smasis2016-9035}, abstractNote={We perform global sensitivity analysis for parameters in a continuum energy model for ferroelectric materials, which is informed by density functional theory (DFT). Specifically, we use global sensitivity analysis to rank the sensitivity of phenomenological parameters governing the Landau and electrostriction energy for single-domain ferroelectric lead titanate. These techniques include Pearson correlations constructed directly from input and output relations, Sobol sensitivity indices, and Morris indices.}, booktitle={Proceedings of the asme conference on smart materials adaptive}, author={Leon, L. S. and Smith, R. C. and Oates, W. S. and Miles, P.}, year={2016} } @article{bahmani_sibley_parsian_owzar_mueller_2016, title={SparkScore: Leveraging Apache Spark for Distributed Genomic Inference}, ISSN={["2164-7062"]}, DOI={10.1109/ipdpsw.2016.6}, abstractNote={The method of the efficient score statistic is used extensively to conduct inference for high throughput genomic data due to its computational efficiency and ability to accommodate simple and complex phenotypes. Inference based on these statistics can readily incorporate a priori knowledge from a vast collection of bioinformatics databases to further refine the analyses. The sampling distribution of the efficient score statistic is typically approximated using asymptotics. As this may be inappropriate in the context of small study size, or uncommon or rare variants, resampling methods are often used to approximate the exact sampling distribution. We propose SparkScore, a set of distributed computational algorithms implemented in Apache Spark, to leverage the embarrassingly parallel nature of genomic resampling inference on the basis of the efficient score statistics. We illustrate the application of this computational approach for the analysis of data from genome-wide analysis studies (GWAS). This computational approach also harnesses the fault-tolerant features of Spark and can be readily extended to analysis of DNA and RNA sequencing data, including expression quantitative trait loci (eQTL) and phenotype association studies.}, journal={2016 IEEE 30TH INTERNATIONAL PARALLEL AND DISTRIBUTED PROCESSING SYMPOSIUM WORKSHOPS (IPDPSW)}, author={Bahmani, Amir and Sibley, Alexander B.
and Parsian, Mahmoud and Owzar, Kouros and Mueller, Frank}, year={2016}, pages={435–442} } @article{pan_gownivaripalli_mueller_2016, title={TintMalloc: Reducing Memory Access Divergence via Controller-Aware Coloring}, ISSN={["1530-2075"]}, DOI={10.1109/ipdps.2016.26}, abstractNote={DRAM memory of modern multicores is partitioned into sets, each with its own memory controller governing multiple banks. Accesses can be served in parallel to controllers and banks, but sharing of either between threads results in contention that increases latency, and so do accesses to remote controllers due to the non-uniform memory access (NUMA) design. Above DRAM, a last-level cache (LLC), typically at level 3 (L3), is shared by all cores while L1 and L2 caches tend to be core private. This NUMA design inflicts significant variations in execution time for applications with large datasets due to different latencies incurred by remote memory node accesses or contention in LLC and at memory banks/controllers. As a result, single program multiple data (SPMD) applications tend to experience computational imbalance at barriers, which inflicts idle (wait) time for threads that arrive early at barriers and thus impairs effective processor utilization and ultimately performance. This work contributes a novel memory allocator called TintMalloc that colors memory at the LLC, bank, and controller level to ensure locality to the local memory node while reducing contention at the LLC/bank levels in software. After adding one line of code during initialization in each thread, existing applications automatically obtain colored heap space through regular malloc calls. Experimental results with the SPEC and Parsec benchmarks show that by choosing disjoint colors per thread, locality is increased, contention is decreased, and overall SPMD execution becomes more balanced at barriers than with default memory allocation under Linux as well as prior coloring approaches.}, journal={2016 IEEE 30TH INTERNATIONAL PARALLEL AND DISTRIBUTED PROCESSING SYMPOSIUM (IPDPS 2016)}, author={Pan, Xing and Gownivaripalli, Yasaswini Jyothi and Mueller, Frank}, year={2016}, pages={363–372} } @article{luo_edwards_luo_mueller_2015, title={A fine-grained block ILU scheme on regular structures for GPGPUs}, volume={119}, ISSN={["1879-0747"]}, DOI={10.1016/j.compfluid.2015.07.005}, abstractNote={Iterative methods based on block incomplete LU (BILU) factorization are considered highly effective for solving large-scale block-sparse linear systems resulting from coupled PDE systems with n equations. However, efforts on porting implicit PDE solvers to massively parallel shared-memory heterogeneous architectures, such as general-purpose graphics processing units (GPGPUs), have largely avoided BILU, leaving their enormous performance potential unfulfilled in many applications where the use of implicit schemes and BILU-type preconditioners/solvers is highly preferred. Indeed, strong inherent data dependency and high memory bandwidth demanded by block matrix operations render naive adoptions of existing sequential BILU algorithms extremely inefficient on GPGPUs. In this study, we present a fine-grained BILU (FGBILU) scheme which is particularly effective on GPGPUs. A straightforward one-sweep wavefront ordering is employed to resolve data dependency. Granularity is substantially refined as block matrix operations are carried out in a true element-wise approach.
Particularly, the inversion of diagonal blocks, a well-known bottleneck, is accomplished by a parallel in-place Gauss–Jordan elimination. As a result, FGBILU is able to offer low-overhead concurrent computation at O(n^2 N^2) scale on a 3D PDE domain with a linear scale of N. FGBILU has been implemented with both OpenACC and CUDA and tested as a block-sparse linear solver on a structured 3D grid. While FGBILU remains mathematically identical to sequential global BILU, numerical experiments confirm its exceptional performance on an Nvidia GPGPU.}, journal={COMPUTERS & FLUIDS}, author={Luo, Lixiang and Edwards, Jack R. and Luo, Hong and Mueller, Frank}, year={2015}, month={Sep}, pages={149–161} } @article{shekhar_ramaprasad_sarkar_mueller_2015, title={Architecture aware semi partitioned real-time scheduling on multicore platforms}, volume={51}, ISSN={["1573-1383"]}, DOI={10.1007/s11241-015-9221-4}, number={3}, journal={REAL-TIME SYSTEMS}, author={Shekhar, Mayank and Ramaprasad, Harini and Sarkar, Abhik and Mueller, Frank}, year={2015}, month={Jun}, pages={274–313} } @article{rezaei_mueller_2015, title={DINO: Divergent Node Cloning for Sustained Redundancy in HPC}, ISSN={["1552-5244"]}, DOI={10.1109/cluster.2015.36}, abstractNote={Soft faults like silent data corruption and hard faults like hardware failures may cause a high performance computing (HPC) job of thousands of processes to nearly cease to make progress due to recovery overheads. Redundant computing has been proposed as a solution at extreme scale by allocating two or more processes to perform the same task. However, current redundant computing approaches do not repair failed replicas. Thus, SDC-free execution is not guaranteed after a replica failure and the job may finish with incorrect results. Replicas are logically equivalent, yet may have divergent runtime states during job execution, which complicates on-the-fly repairs for forward recovery. In this work, we present a redundant execution environment that quickly repairs hard failures via Divergent Node cloning (DINO) at the MPI task level. DINO contributes a novel task cloning service integrated into the MPI runtime system that solves the problem of consolidating divergent states among replicas on-the-fly. Experimental results indicate that DINO can recover from failures nearly instantaneously, thus retaining the redundancy level throughout job execution. The cloning overhead, depending on the process image size and its transfer rate, ranges from 5.60 to 90.48 seconds. To the best of our knowledge, the design and implementation for repairing failed replicas in redundant MPI computing is unprecedented.}, journal={2015 IEEE INTERNATIONAL CONFERENCE ON CLUSTER COMPUTING - CLUSTER 2015}, author={Rezaei, Arash and Mueller, Frank}, year={2015}, pages={180–183} } @article{shekhar_ramaprasad_mueller_2015, title={Evaluation of Memory Access Arbitration Algorithm on Tilera's TILEPro64 platform}, ISSN={["2576-3512"]}, DOI={10.1109/hpcc-css-icess.2015.245}, abstractNote={As real-time embedded systems demand more and more computing power under reasonable energy budgets, multi-core platforms are a viable option. However, deploying real-time applications on multi-core platforms introduces several predictability challenges. One of these challenges is bounding the latency of memory accesses issued by real-time tasks. This challenge is exacerbated as the number of cores and, hence, the degree of resource sharing increases.
Over the last several years, researchers have proposed techniques to overcome this challenge. In prior work, we proposed an arbitration policy for memory access requests over a Network-on-Chip. In this paper, we implement and evaluate variants of our arbitration policy on a real hardware platform, namely Tilera's TilePro64 platform.}, journal={2015 IEEE 17TH INTERNATIONAL CONFERENCE ON HIGH PERFORMANCE COMPUTING AND COMMUNICATIONS, 2015 IEEE 7TH INTERNATIONAL SYMPOSIUM ON CYBERSPACE SAFETY AND SECURITY, AND 2015 IEEE 12TH INTERNATIONAL CONFERENCE ON EMBEDDED SOFTWARE AND SYSTEMS (ICESS)}, author={Shekhar, Mayank and Ramaprasad, Harini and Mueller, Frank}, year={2015}, pages={1154–1159} } @article{qian_mueller_xin_2015, title={Hybrid EDF Packet Scheduling for Real-Time Distributed Systems}, ISSN={["1068-3070"]}, DOI={10.1109/ecrts.2015.11}, abstractNote={When multiple computational resource elements collaborate to handle events in a cyber-physical system, scheduling algorithms on these resource elements and the communication delay between them contribute to the overall system utilization and schedulability. Employing earliest deadline first (EDF) scheduling in real-time cyber-physical systems has many challenges. First, the network layer of a resource has to interrupt and notify the scheduler about the deadlines of arrived messages. The randomness of interruption makes context switch costs unpredictable. Second, lack of globally synchronized clocks across resources renders event deadlines derived from local clocks and piggybacked in messages meaningless. Third, communication delay variances in a network increase the unpredictability of the system, e.g., when multiple resources transmit message bursts simultaneously. We address these challenges in this work. First, we combine EDF scheduling with periodic message transmission tasks. Second, we implement an EDF-based packet scheduler, which transmits packets considering event deadlines. Third, we employ bandwidth limitations on the transmission links of resources to decrease network contention and network delay variance. We have implemented our hybrid EDF scheduler in a real-time distributed storage system. We evaluate it on a cluster of nodes in a switched network environment resembling a distributed cyber-physical system to demonstrate the real-time capability of our scheduler.}, journal={PROCEEDINGS OF THE 2015 27TH EUROMICRO CONFERENCE ON REAL-TIME SYSTEMS (ECRTS 2015)}, author={Qian, Tao and Mueller, Frank and Xin, Yufeng}, year={2015}, pages={37–46} } @article{zimmer_bhat_mueller_mohan_2015, title={Intrusion Detection for CPS Real-Time Controllers}, ISBN={["978-3-662-45927-0"]}, ISSN={["1860-4676"]}, DOI={10.1007/978-3-662-45928-7_12}, abstractNote={Security in CPS-based real-time embedded systems controlling the power grid has been an afterthought, but it is becoming a critical issue as CPS systems are networked and inter-dependent. This work presents a set of mechanisms for time-based intrusion detection, i.e., detecting the execution of unauthorized instructions in real-time CPS environments. The novelty is the utilization of information obtained by static timing analysis for intrusion detection. Real-time CPS systems are unique in that timing bounds on code sections are readily available since they are required for schedulability analysis. We demonstrate how micro-timings can be exploited for multiple granularity levels of application code to track execution progress.
Through bounds checking of these micro-timings, we develop techniques to detect intrusions (1) in a self-checking manner by the application and (2) through the operating system scheduler, which are novel contributions to the real-time/embedded systems domain to the best of our knowledge.}, journal={CYBER PHYSICAL SYSTEMS APPROACH TO SMART ELECTRIC POWER GRID}, author={Zimmer, Christopher and Bhat, Balasubramany and Mueller, Frank and Mohan, Sibin}, year={2015}, pages={329–358} } @article{zimmer_mueller_2015, title={NoCMsg: A Scalable Message-Passing Abstraction for Network-on-Chips}, volume={12}, ISSN={["1544-3973"]}, DOI={10.1145/2701426}, abstractNote={The number of cores of contemporary processors is constantly increasing and thus continues to deliver ever higher peak performance (following Moore’s transistor law). Yet high core counts present a challenge to hardware and software alike. Following this trend, the network-on-chip (NoC) topology has changed from buses over rings and fully connected meshes to 2D meshes.}, number={1}, journal={ACM TRANSACTIONS ON ARCHITECTURE AND CODE OPTIMIZATION}, author={Zimmer, Christopher and Mueller, Frank}, year={2015}, month={Apr} } @article{xia_lou_luo_edwards_mueller_2015, title={OpenACC acceleration of an unstructured CFD solver based on a reconstructed discontinuous Galerkin method for compressible flows}, volume={78}, ISSN={0271-2091}, url={http://dx.doi.org/10.1002/FLD.4009}, DOI={10.1002/fld.4009}, abstractNote={Summary}, number={3}, journal={International Journal for Numerical Methods in Fluids}, publisher={Wiley}, author={Xia, Yidong and Lou, Jialin and Luo, Hong and Edwards, Jack and Mueller, Frank}, year={2015}, month={Feb}, pages={123–139} } @inproceedings{panchamukhi_mueller_2015, title={Providing task isolation via TLB coloring}, DOI={10.1109/rtas.2015.7108391}, abstractNote={The translation look aside buffer (TLB) improves the performance of systems by caching the virtual page to physical frame mapping. But TLBs present a source of unpredictability for real-time systems. Standard heap allocated regions do not provide guarantees on the TLB set that will hold a particular page translation. This unpredictability can lead to TLB misses with a penalty of up to thousands of cycles and consequently intra- and inter-task interference resulting in loose bounds on the worst case execution time (WCET) and TLB-related preemption delay. In this work, we design and implement a new heap allocator that guarantees the TLB set, which will hold a particular page translation on a uniprocessor of a contemporary architecture. The allocator is based on the concept of page coloring, a software TLB partitioning method. Virtual pages are colored such that two pages of different color cannot map to the same TLB set. Our experimental evaluations confirm the unpredictability associated with the standard heap allocation. Using a set of synthetic and standard benchmarks, we show that our allocator provides task isolation for real-time tasks. To the best of our knowledge, such TLB isolation without special hardware support is unprecedented, increases TLB predictability and can facilitate WCET analysis.}, booktitle={21st IEEE Real-Time and Embedded Technology and Applications Symposium (RTAS 2015)}, author={Panchamukhi, S. A. 
and Mueller, F.}, year={2015}, pages={3–13} } @article{zimmer_mueller_2015, title={Reliable and scalable communication for the power grid}, DOI={10.1007/978-3-662-45928-7_8}, abstractNote={Future smart power grids require constant data availability for actuation of control decisions. The job of ensuring the timely arrival of data falls onto the network that connects these intelligent devices. This network needs to be fault tolerant. When nodes, devices or communication links fail along a default route of a message from A to B, the underlying hardware and software layers should ensure that this message will actually be delivered as long as alternative routes exist. Existence and discovery of multi-route pathways is essential in ensuring delivery of critical data. In this work, we present methods of developing network topologies of smart devices that enable multi-route discovery in an intelligent power grid. This is accomplished through the utilization of software overlays that (1) maintain a digital structure for the physical network and (2) identify new routes in the case of faults. The resulting cyber network structure is scalable, reliable and inexpensive to build by extending existing infrastructure.}, journal={Cyber physical systems approach to smart electric power grid}, author={Zimmer, C. and Mueller, F.}, year={2015}, pages={195–217} } @article{sarkar_mueller_ramaprasad_2015, title={Static Task Partitioning for Locked Caches in Multicore Real-Time Systems}, volume={14}, ISSN={["1558-3465"]}, DOI={10.1145/2638557}, abstractNote={ Growing processing demand on multitasking real-time systems can be met by employing scalable multicore architectures. For such environments, locking cache lines for hard real-time systems ensures timing predictability of data references and may lower worst-case execution time. This work studies the benefits of cache locking on massive multicore architectures with private caches in the context of hard real-time systems. In shared cache architectures, the cache is a single resource shared among all of the tasks. However, in scalable cache architectures with private caches, conflicts exist only among the tasks scheduled on one core. This calls for a cache-aware allocation of tasks onto cores. }, number={1}, journal={ACM TRANSACTIONS ON EMBEDDED COMPUTING SYSTEMS}, author={Sarkar, Abhik and Mueller, Frank and Ramaprasad, Harini}, year={2015}, month={Jan} } @inproceedings{qian_mueller_xin_2014, title={A real-time distributed hash table}, DOI={10.1109/rtcsa.2014.6910537}, abstractNote={Currently, the North American power grid uses a centralized system to monitor and control wide-area power grid states. This centralized architecture is becoming a bottleneck as large numbers of wind and photo-voltaic (PV) generation sources require real-time monitoring and actuation to ensure sustained reliability. We have designed and implemented a distributed storage system, a real-time distributed hash table (DHT), to store and retrieve this monitoring data as a real-time service to an upper layer decentralized control system. Our real-time DHT utilizes the DHT algorithm Chord in a cyclic executive to schedule data-lookup jobs on distributed storage nodes. We formally define the pattern of the workload on our real-time DHT and use queuing theory to stochastically derive the time bound for response times of these lookup requests. We also define the quality of service (QoS) metrics of our real-time DHT as the probability that deadlines of requests can be met. 
We use the stochastic model to derive the QoS. An experimental evaluation on distributed nodes shows that our model is well suited to provide time bounds for requests following typical workload patterns and that a prioritized extension can increase the probability of meeting deadlines for subsequent requests.}, booktitle={2014 IEEE 20th International Conference on Embedded and Real-Time Computing Systems and Applications (RTCSA)}, author={Qian, T. and Mueller, F. and Xin, Y. F.}, year={2014} } @inproceedings{qian_chakrabortty_mueller_xin_2014, title={A real-time distributed storage system for multi-resolution virtual synchrophasor}, ISBN={9781479964154}, url={http://dx.doi.org/10.1109/pesgm.2014.6939832}, DOI={10.1109/pesgm.2014.6939832}, abstractNote={With the continuing large-scale deployment of Phasor Measurement Units (PMU), the Wide-Area Measurement System (WAMS) technology is envisioned to evolve towards a distributed architecture where multiple sets of distributed Phasor Data Concentrators (PDCs) collectively process PMU data to achieve real-time distributed intelligence. Emerging applications developed under this vision will pose stringent but heterogeneous real-time requirements on throughput, delay, and reliability performance of the underlying communication and computing infrastructure. To address this problem, we present a novel virtual PMU (vPMU) architecture that decomposes phasor samples into multiple resolution layers. For a particular receiver with a certain resolution requirement, a complete set of PMU data can be composed by combining samples from the lower layers, without the need for samples from higher layers. We design and implement a real-time distributed storage system to support the virtual PMU data communication. We extend the Chord algorithm so that the response time of data communication can be bounded by our storage system. In addition, we use queuing theory to analyze the response time of requests with our stochastic model.}, booktitle={2014 IEEE PES General Meeting | Conference & Exposition}, publisher={IEEE}, author={Qian, Tao and Chakrabortty, Aranya and Mueller, Frank and Xin, Yufeng}, year={2014}, month={Jul} } @article{elliott_hoemmen_mueller_2014, title={Evaluating the Impact of SDC on the GMRES Iterative Solver}, ISSN={["1530-2075"]}, DOI={10.1109/ipdps.2014.123}, abstractNote={Increasing parallelism and transistor density, along with increasingly tighter energy and peak power constraints, may force exposure of occasionally incorrect computation or storage to application codes. Silent data corruption (SDC) will likely be infrequent, yet one SDC suffices to make numerical algorithms like iterative linear solvers cease progress towards the correct answer. Thus, we focus on resilience of the iterative linear solver GMRES to a single transient SDC. We derive inexpensive checks to detect the effects of an SDC in GMRES that work for a more general SDC model than presuming a bit flip. Our experiments show that when GMRES is used as the inner solver of an inner-outer iteration, it can "run through" SDC of almost any magnitude in the computationally intensive orthogonalization phase. That is, it gets the right answer using faulty data without any required roll back. 
Those SDCs which it cannot run through get caught by our detection scheme.}, journal={2014 IEEE 28TH INTERNATIONAL PARALLEL AND DISTRIBUTED PROCESSING SYMPOSIUM}, author={Elliott, James and Hoemmen, Mark and Mueller, Frank}, year={2014} } @article{zimmer_mueller_2014, title={NoCMsg: Scalable NoC-Based Message Passing}, ISSN={["2376-4414"]}, DOI={10.1109/ccgrid.2014.19}, abstractNote={Current processor designs with ever more cores may ensure that theoretical compute performance still follows past increases (resulting from Moore's law), but they also increasingly present a challenge to hardware and software alike. As the core count increases, the network-on-chip (NoC) topology has changed from buses over rings and fully connected meshes to 2D meshes. The question is which programming paradigm provides the scalability needed to ensure performance is close to theoretical peak, where 2D meshes provide the most scalable design to date. This work contributes NoCMsg, a low-level message passing abstraction over NoCs. NoCMsg is specifically designed for large core counts in 2D meshes. Its design ensures deadlock-free messaging for wormhole Manhattan-path routing over the NoC. Experimental results on the TilePro hardware platform show that NoCMsg can significantly reduce communication times by up to 86% for single packet messages and up to 40% for larger messages compared to other NoC-based message approaches. Results further demonstrate the potential of NoC messaging to outperform shared memory abstractions by up to 93% as core counts and inter-process communication increase, i.e., we observe that shared memory scales up to about 16 cores while message passing performs well beyond that threshold on this platform. To the best of our knowledge, this is the first head-on comparison of shared memory and advanced message passing specifically designed for NoCs on an actual hardware platform with larger core counts on a single socket.}, journal={2014 14TH IEEE/ACM INTERNATIONAL SYMPOSIUM ON CLUSTER, CLOUD AND GRID COMPUTING (CCGRID)}, author={Zimmer, Christopher and Mueller, Frank}, year={2014}, pages={186–195} } @inbook{ananthakrishnan_mueller_2014, title={ScalaJack: Customized Scalable Tracing with In-situ Data Analysis}, ISBN={9783319098722 9783319098739}, ISSN={0302-9743 1611-3349}, url={http://dx.doi.org/10.1007/978-3-319-09873-9_2}, DOI={10.1007/978-3-319-09873-9_2}, abstractNote={Root cause diagnosis of large-scale HPC applications often fails because tools, specifically trace-based ones, can no longer record all metrics they measure. We address this problem by combining customized tracing and providing support for in-situ data analysis via ScalaJack, a framework with customizable instrumentation and pluggable extension capabilities for problem-directed instrumentation and in-situ data analysis.
We further eliminate cross cutting concerns by code refactoring for aspect orientation and evaluate these capabilities in case studies within and beyond the scope of tracing.}, booktitle={Lecture Notes in Computer Science}, publisher={Springer International Publishing}, author={Ananthakrishnan, Srinath Krishna and Mueller, Frank}, year={2014}, pages={13–25} } @inbook{lagadapati_mueller_engelmann_2014, title={Tools for Simulation and Benchmark Generation at Exascale}, ISBN={9783319081434 9783319081441}, url={http://dx.doi.org/10.1007/978-3-319-08144-1_2}, DOI={10.1007/978-3-319-08144-1_2}, booktitle={Tools for High Performance Computing 2013}, publisher={Springer International Publishing}, author={Lagadapati, Mahesh and Mueller, Frank and Engelmann, Christian}, year={2014}, pages={19–24} } @inproceedings{li_yang_dai_yan_mueller_zhou_2014, title={Understanding the tradeoffs between software-managed vs. hardware-managed caches in GPUs}, booktitle={Ieee international symposium on performance analysis of systems and}, author={Li, C. and Yang, Y. and Dai, H. W. and Yan, S. G. and Mueller, F. and Zhou, H. Y.}, year={2014}, pages={231–241} } @article{zhang_mueller_2013, title={Autogeneration and Autotuning of 3D Stencil Codes on Homogeneous and Heterogeneous GPU Clusters}, volume={24}, ISSN={["1558-2183"]}, DOI={10.1109/tpds.2012.160}, abstractNote={This paper develops and evaluates search and optimization techniques for autotuning 3D stencil (nearest neighbor) computations on GPUs. Observations indicate that parameter tuning is necessary for heterogeneous GPUs to achieve optimal performance with respect to a search space. Our proposed framework takes a most concise specification of stencil behavior from the user as a single formula, autogenerates tunable code from it, systematically searches for the best configuration and generates the code with optimal parameter configurations for different GPUs. This autotuning approach guarantees adaptive performance for different generations of GPUs while greatly enhancing programmer productivity. Experimental results show that the delivered floating point performance is very close to previous handcrafted work and outperforms other autotuned stencil codes by a large margin. Furthermore, heterogeneous GPU clusters are shown to exhibit the highest performance for dissimilar tuning parameters leveraging proportional partitioning relative to single-GPU performance.}, number={3}, journal={IEEE TRANSACTIONS ON PARALLEL AND DISTRIBUTED SYSTEMS}, author={Zhang, Yongpeng and Mueller, Frank}, year={2013}, month={Mar}, pages={417–427} } @article{mueller_2013, title={Best papers, IPDPS 2011}, volume={73}, ISSN={["0743-7315"]}, DOI={10.1016/j.jpdc.2013.05.001}, number={7}, journal={JOURNAL OF PARALLEL AND DISTRIBUTED COMPUTING}, author={Mueller, Frank}, year={2013}, month={Jul}, pages={939–939} } @inproceedings{zhang_mueller_2013, title={HiDP: A hierarchical data parallel language}, booktitle={Proceedings of the 2013 ieee/acm international symposium on code generation and optimization (cgo)}, author={Zhang, Y. P. and Mueller, F.}, year={2013}, pages={171–181} } @inbook{fiala_ferreira_mueller_engelmann_2012, title={A Tunable, Software-Based DRAM Error Detection and Correction Library for HPC}, ISBN={9783642297397 9783642297403}, ISSN={0302-9743 1611-3349}, url={http://dx.doi.org/10.1007/978-3-642-29740-3_29}, DOI={10.1007/978-3-642-29740-3_29}, abstractNote={Proposed exascale systems will present a number of considerable resiliency challenges. 
In particular, DRAM soft-errors, or bit-flips, are expected to greatly increase due to the increased memory density of these systems. Current hardware-based fault-tolerance methods will be unsuitable for addressing the expected soft error rate. As a result, additional software will be needed to address this challenge. In this paper we introduce LIBSDC, a tunable, transparent silent data corruption detection and correction library for HPC applications. LIBSDC provides comprehensive SDC protection for program memory by implementing on-demand page integrity verification. Experimental benchmarks with Mantevo HPCCG show that once tuned, LIBSDC is able to achieve SDC protection with 50% resource overhead, less than the 100% needed for double modular redundancy.}, booktitle={Euro-Par 2011: Parallel Processing Workshops}, publisher={Springer Berlin Heidelberg}, author={Fiala, David and Ferreira, Kurt B. and Mueller, Frank and Engelmann, Christian}, year={2012}, pages={251–261} } @article{elliott_kharbas_fiala_mueller_ferreira_engelmann_2012, title={Combining Partial Redundancy and Checkpointing for HPC}, ISSN={["1063-6927"]}, DOI={10.1109/icdcs.2012.56}, abstractNote={Today's largest High Performance Computing (HPC) systems exceed one Petaflops (10^15 floating point operations per second) and exascale systems are projected within seven years. But reliability is becoming one of the major challenges faced by exascale computing. With billion-core parallelism, the mean time to failure is projected to be in the range of minutes or hours instead of days. Failures are becoming the norm rather than the exception during execution of HPC applications. Current fault tolerance techniques in HPC focus on reactive ways to mitigate faults, namely via checkpoint and restart (C/R). Apart from storage overheads, C/R-based fault recovery comes at an additional cost in terms of application performance because normal execution is disrupted when checkpoints are taken. Studies have shown that applications running at a large scale spend more than 50% of their total time saving checkpoints, restarting and redoing lost work. Redundancy is another fault tolerance technique, which employs redundant processes performing the same task. If a process fails, a replica of it can take over its execution. Thus, redundant copies can decrease the overall failure rate. The downside of redundancy is that extra resources are required and there is an additional overhead on communication and synchronization. This work contributes a model and analyzes the benefit of C/R in coordination with redundancy at different degrees to minimize the total wallclock time and resource utilization of HPC applications. We further conduct experiments with an implementation of redundancy within the MPI layer on a cluster. Our experimental results confirm the benefit of dual and triple redundancy, but not of partial redundancy, and show a close fit to the model. At ≈80,000 processes, dual redundancy requires twice the number of processing resources for an application but allows two jobs of 128 hours wallclock time to finish within the time of just one job without redundancy. For narrow ranges of processor counts, partial redundancy results in the lowest time. Once the count exceeds ≈770,000, triple redundancy has the lowest overall cost.
Thus, redundancy allows one to trade off additional resource requirements against wallclock time, which provides a tuning knob for users to adapt to resource availabilities.}, journal={2012 IEEE 32ND INTERNATIONAL CONFERENCE ON DISTRIBUTED COMPUTING SYSTEMS (ICDCS)}, author={Elliott, James and Kharbas, Kishor and Fiala, David and Mueller, Frank and Ferreira, Kurt and Engelmann, Christian}, year={2012}, pages={615–626} } @inproceedings{fiala_mueller_engelmann_riesen_ferreira_brightwell_2012, title={Detection and correction of silent data corruption for large-scale high-performance computing}, DOI={10.1109/sc.2012.49}, abstractNote={Faults have become the norm rather than the exception for high-end computing clusters. Exacerbating this situation, some of these faults remain undetected, manifesting themselves as silent errors that allow applications to compute incorrect results. This paper studies the potential for redundancy to detect and correct soft errors in MPI message-passing applications while investigating the challenges inherent to detecting soft errors within MPI applications by providing transparent MPI redundancy. By assuming a model wherein corruption in application data manifests itself by producing differing MPI messages between replicas, we study the best-suited protocols for detecting and correcting corrupted MPI messages. Using our fault injector, we observe that even a single error can have profound effects on applications by causing a cascading pattern of corruption which in most cases spreads to all other processes. Results indicate that our consistency protocols can successfully protect applications experiencing even high rates of silent data corruption.}, booktitle={International conference for high performance computing networking}, author={Fiala, D. and Mueller, F. and Engelmann, C. and Riesen, R. and Ferreira, K. and Brightwell, R.}, year={2012} } @article{zimmer_mueller_2012, title={Low Contention Mapping of Real-Time Tasks onto a TilePro 64 Core Processor}, ISSN={["1545-3421"]}, DOI={10.1109/rtas.2012.36}, abstractNote={Predictability of task execution is paramount for real-time systems so that upper bounds of execution times can be determined via static timing analysis. Static timing analysis on network-on-chip (NoC) processors may result in unsafe underestimations when the underlying communication paths are not considered. This stems from contention on the underlying network when data from multiple sources share parts of a routing path in the NoC. Contention analysis must be performed to provide safe and reliable bounds. In addition, the overhead incurred by contention due to inter-process communication (IPC) can be reduced by mapping tasks to cores in such a way that contention is minimized. This paper makes several contributions to increase the predictability of real-time tasks on NoC architectures. First, we contribute a constraint solver that exhaustively maps real-time tasks onto cores to minimize contention and improve predictability. Second, we develop a novel TDMA-like approach to map communication traces into time frames to ensure separation of analysis for temporally disjoint communication. Third, we contribute a novel multi-heuristic approximation, H Solver, for rapid discovery of low-contention solutions. H Solver reduces contention by up to 70% when compared with naive and constrained exhaustive solutions. We evaluate our experiments using a micro-benchmark of task system IPC on the TilePro64, a real, physical NoC processor with 64 cores.
To the best of our knowledge, this is the first work to consider IPC for worst-case time frames to simplify analysis and to measure the impact on actual hardware for NoC-based real-time multi core systems.}, journal={2012 IEEE 18TH REAL-TIME AND EMBEDDED TECHNOLOGY AND APPLICATIONS SYMPOSIUM (RTAS)}, author={Zimmer, Christopher and Mueller, Frank}, year={2012}, pages={131–140} } @article{budanur_mueller_gamblin_2012, title={Memory Trace Compression and Replay for SPMD Systems Using Extended PRSDs}, volume={55}, ISSN={["1460-2067"]}, DOI={10.1093/comjnl/bxr071}, abstractNote={Concurrency levels in large-scale supercomputers are rising exponentially, and shared-memory nodes with hundreds of cores and non-uniform memory access latencies are expected within the next decade. However, even current petascale systems with tens of cores per node suffer from memory bottlenecks. As core counts increase, memory issues will become critical for the performance of large-scale supercomputers. Trace analysis tools are thus vital for diagnosing the root causes of memory problems. However, existing memory tracing tools are expensive due to prohibitively large trace sizes, or they collect only statistical summaries and omit potentially valuable information. In this paper, we present ScalaMemTrace, a novel technique for collecting memory traces in a scalable manner. ScalaMemTrace builds on prior trace methods with aggressive compression techniques to allow lossless representation of memory traces for dense algebraic kernels, with near-constant trace size irrespective of the problem size or the number of threads. We further introduce a replay mechanism for ScalaMemTrace traces, and discuss the results of our prototype implementation on the x86_64 architecture.}, number={2}, journal={COMPUTER JOURNAL}, author={Budanur, Sandeep and Mueller, Frank and Gamblin, Todd}, year={2012}, month={Feb}, pages={206–217} } @article{wang_mueller_engelmann_scott_2012, title={Proactive process-level live migration and back migration in HPC environments}, volume={72}, ISSN={["1096-0848"]}, DOI={10.1016/j.jpdc.2011.10.009}, abstractNote={As the number of nodes in high-performance computing environments keeps increasing, faults are becoming common place. Reactive fault tolerance (FT) often does not scale due to massive I/O requirements and relies on manual job resubmission. This work complements reactive with proactive FT at the process level. Through health monitoring, a subset of node failures can be anticipated when one’s health deteriorates. A novel process-level live migration mechanism supports continued execution of applications during much of process migration. This scheme is integrated into an MPI execution environment to transparently sustain health-inflicted node failures, which eradicates the need to restart and requeue MPI jobs. Experiments indicate that 1–6.5 s of prior warning are required to successfully trigger live process migration while similar operating system virtualization mechanisms require 13–24 s. This self-healing approach complements reactive FT by nearly cutting the number of checkpoints in half when 70% of the faults are handled proactively. The work also provides a novel back migration approach to eliminate load imbalance or bottlenecks caused by migrated tasks. 
Experiments indicate that the larger the amount of outstanding execution, the higher the benefit due to back migration.}, number={2}, journal={JOURNAL OF PARALLEL AND DISTRIBUTED COMPUTING}, author={Wang, Chao and Mueller, Frank and Engelmann, Christian and Scott, Stephen L.}, year={2012}, month={Feb}, pages={254–267} } @article{wu_mueller_2012, title={SCALAEXTRAP: Trace-Based Communication Extrapolation for SPMD Programs}, volume={34}, ISSN={["1558-4593"]}, DOI={10.1145/2160910.2160914}, abstractNote={Performance modeling for scientific applications is important for assessing potential application performance and systems procurement in high-performance computing (HPC). Recent progress on communication tracing opens up novel opportunities for communication modeling due to its lossless yet scalable trace collection. Estimating the impact of scaling on communication efficiency still remains nontrivial due to execution-time variations and exposure to hardware and software artifacts.}, number={1}, journal={ACM TRANSACTIONS ON PROGRAMMING LANGUAGES AND SYSTEMS}, author={Wu, Xing and Mueller, Frank}, year={2012}, month={Apr} } @article{wu_deshpande_mueller_2012, title={ScalaBenchGen: Auto-Generation of Communication Benchmarks Traces}, ISSN={["1530-2075"]}, DOI={10.1109/ipdps.2012.114}, abstractNote={Benchmarks are essential for evaluating HPC hardware and software for petascale machines and beyond. But benchmark creation is a tedious manual process. As a result, benchmarks tend to lag behind the development of complex scientific codes. This work contributes an automated approach to the creation of communication benchmarks. Given an MPI application, we utilize ScalaTrace, a lossless and scalable framework, to trace communication operations and execution time while abstracting away the computations. A single trace file that reflects the behavior of all nodes is subsequently expanded to C source code by a novel code generator. This resulting benchmark code is compact, portable, human-readable, and accurately reflects the original application's communication characteristics and runtime characteristics. Experimental results demonstrate that the generated source code of benchmarks preserves both the communication patterns and the wall-clock time behavior of the original application. Such automatically generated benchmarks not only shorten the transition from application development to benchmark extraction but also facilitate code obfuscation, which is essential for benchmark extraction from commercial and restricted applications.}, journal={2012 IEEE 26TH INTERNATIONAL PARALLEL AND DISTRIBUTED PROCESSING SYMPOSIUM (IPDPS)}, author={Wu, Xing and Deshpande, Vivek and Mueller, Frank}, year={2012}, pages={1250–1260} } @inbook{mueller_wu_schulz_de supinski_gamblin_2012, title={ScalaTrace: Tracing, Analysis and Modeling of HPC Codes at Scale}, ISBN={9783642281440 9783642281457}, ISSN={0302-9743 1611-3349}, url={http://dx.doi.org/10.1007/978-3-642-28145-7_40}, DOI={10.1007/978-3-642-28145-7_40}, abstractNote={Characterizing the communication behavior of large-scale applications is a difficult and costly task due to code/system complexity and their long execution times. An alternative to running actual codes is to gather their communication traces and then replay them, which facilitates application tuning and future procurements.
While past approaches lacked lossless scalable trace collection, we contribute an approach that provides orders of magnitude smaller, if not near-constant-size, communication traces regardless of the number of nodes while preserving structural information. We introduce intra- and inter-node compression techniques for MPI events, develop a scheme to preserve time and causality of communication events, and present results of our implementation for BlueGene/L. Given this novel capability, we discuss its impact on communication tuning and on trace extrapolation. To the best of our knowledge, such a concise representation of MPI traces in a scalable manner combined with time-preserving deterministic MPI call replay is without precedent.}, booktitle={Applied Parallel and Scientific Computing}, publisher={Springer Berlin Heidelberg}, author={Mueller, Frank and Wu, Xing and Schulz, Martin and de Supinski, Bronis R. and Gamblin, Todd}, year={2012}, pages={410–418} } @inproceedings{sarkar_mueller_ramaprasad_2012, title={Static task partitioning for locked caches in multi-core real-time systems}, DOI={10.1145/2380403.2380434}, abstractNote={Locking cache lines in hard real-time systems is a common means to ensure timing predictability of data references and to lower bounds on worst-case execution time, especially in a multi-tasking environment. Growing processing demand on multi-tasking real-time systems can be met by employing scalable multi-core architectures, like the recently introduced tile-based architectures. This paper studies the use of cache locking on massive multi-core architectures with private caches in the context of hard real-time systems. In shared cache architectures, a single resource is shared among all the tasks. However, in scalable cache architectures with private caches, conflicts exist only among the tasks scheduled on one core. This calls for a cache-aware allocation of tasks onto cores. Our work extends the cache-unaware First Fit Decreasing (FFD) algorithm with a Naive locked First Fit Decreasing (NFFD) policy. We further propose two cache-aware static scheduling schemes: (1) Greedy First Fit Decreasing (GFFD) and (2) Colored First Fit Decreasing (CoFFD). This work contributes an adaptation of these algorithms for conflict resolution of partially locked regions. Experiments indicate that NFFD is capable of scheduling high-utilization task sets that FFD cannot schedule. Experiments also show that CoFFD consistently outperforms GFFD, resulting in a lower number of cores and lower system utilization. CoFFD reduces core requirements by 30% to 60% compared to NFFD. With partial locking, the number of cores in some cases is reduced by almost 50% with an increase in system utilization of 10%. Overall, this work is unique in considering the challenges of future multi-core architectures for real-time systems and provides key insights into task partitioning with locked caches for architectures with private caches.}, booktitle={CASES '12: Proceedings of the 2012 ACM International Conference on Compilers, Architectures and Synthesis for Embedded Systems}, author={Sarkar, A. and Mueller, F.
and Ramaprasad, H.}, year={2012}, pages={161–170} } @article{zhang_mueller_cui_potok_2011, title={Data-intensive document clustering on graphics processing unit (GPU) clusters}, volume={71}, ISSN={["1096-0848"]}, DOI={10.1016/j.jpdc.2010.08.002}, abstractNote={Document clustering is a central method to mine massive amounts of data. Due to the explosion of raw documents generated on the Internet and the necessity to analyze them efficiently in various intelligent information systems, clustering techniques have reached their limitations on single processors. Instead of single processors, general-purpose multi-core chips are increasingly deployed in response to diminishing returns in single-processor speedup due to the frequency wall, but multi-core benefits only provide linear speedups while the number of documents on the Internet is growing exponentially. Accelerating hardware devices represent a novel promise for improving the performance of data-intensive problems such as document clustering. They offer more radical designs with a higher level of parallelism but require adaptation to novel programming environments. In this paper, we assess the benefits of exploiting the computational power of graphics processing units (GPUs) to study two fundamental problems in document mining, namely to calculate the term frequency–inverse document frequency (TF–IDF) and cluster a large set of documents. We transform traditional algorithms into accelerated parallel counterparts that can be efficiently executed on many-core GPU architectures. We assess our implementations on various platforms, ranging from stand-alone GPU desktops to Beowulf-like clusters equipped with contemporary GPU cards. We observe at least one order of magnitude speedups over CPU-only desktops and clusters. This demonstrates the potential of exploiting GPU clusters to efficiently solve massive document mining problems. Such speedups combined with the scalability potential and accelerator-based parallelization are unique in the domain of document-based data mining, to the best of our knowledge.}, number={2}, journal={JOURNAL OF PARALLEL AND DISTRIBUTED COMPUTING}, author={Zhang, Yongpeng and Mueller, Frank and Cui, Xiaohui and Potok, Thomas}, year={2011}, month={Feb}, pages={211–224} } @article{bhat_mueller_2011, title={Making DRAM refresh predictable}, volume={47}, ISSN={["1573-1383"]}, DOI={10.1007/s11241-011-9129-6}, abstractNote={Embedded control systems with hard real-time constraints require that deadlines are met at all times or the system may malfunction with potentially catastrophic consequences. Schedulability theory can assure deadlines for a given task set when periods and worst-case execution times (WCETs) of tasks are known. While periods are generally derived from the problem specification, a task's code needs to be statically analyzed to derive safe and tight bounds on its WCET. Such static timing analysis abstracts from program input and considers loop bounds and architectural features, such as pipelining and caching. However, unpredictability due to dynamic memory (DRAM) refresh cannot be accounted for by such analysis, which limits its applicability to systems with static memory (SRAM). In this paper, we assess the impact of DRAM refresh on task execution times and demonstrate how predictability is adversely affected, leading to unsafe hard real-time system design. We subsequently contribute a novel and effective approach to overcome this problem through software-initiated DRAM refresh.
We develop (1) a pure software and (2) a hybrid hardware/software refresh scheme. Both schemes provide predictable timings and fully replace the classical hardware auto-refresh. We discuss implementation details based on this design for multiple concrete embedded platforms and experimentally assess the benefits of different schemes on these platforms. We further formalize the integration of variable latency memory references into a data-flow framework suitable for static timing analysis to bound a task's memory latencies with regard to their WCET. The resulting predictable execution behavior in the presence of DRAM refresh combined with the additional benefit of reduced access delays is unprecedented, to the best of our knowledge.}, number={5}, journal={REAL-TIME SYSTEMS}, author={Bhat, Balasubramanya and Mueller, Frank}, year={2011}, month={Sep}, pages={430–453} } @article{sarkar_mueller_ramaprasad_2011, title={Predictable Task Migration for Locked Caches in Multi-Core Systems}, volume={46}, ISSN={["1558-1160"]}, DOI={10.1145/2016603.1967696}, abstractNote={Locking cache lines in hard real-time systems is a common means of achieving predictability of cache access behavior and tightening as well as reducing worst case execution time, especially in a multitasking environment. However, cache locking poses a challenge for multi-core hard real-time systems since theoretically optimal scheduling techniques on multi-core architectures assume zero cost for task migration. Tasks with locked cache lines need to proactively migrate these lines before the next invocation of the task. Otherwise, cache locking on multi-core architectures becomes useless as predictability is compromised.}, number={5}, journal={ACM SIGPLAN NOTICES}, author={Sarkar, Abhik and Mueller, Frank and Ramaprasad, Harini}, year={2011}, month={May}, pages={131–140} } @inproceedings{sarkar_mueller_ramaprasad_2011, title={Predictable task migration for locked caches in multi-core systems}, DOI={10.1145/1967677.1967696}, abstractNote={Locking cache lines in hard real-time systems is a common means of achieving predictability of cache access behavior and tightening as well as reducing worst case execution time, especially in a multitasking environment. However, cache locking poses a challenge for multi-core hard real-time systems since theoretically optimal scheduling techniques on multi-core architectures assume zero cost for task migration. Tasks with locked cache lines need to proactively migrate these lines before the next invocation of the task. Otherwise, cache locking on multi-core architectures becomes useless as predictability is compromised. This paper proposes hardware-based push-assisted cache migration as a means to retain locks on cache lines across migrations. We extend the push-assisted migration model with several cache migration techniques to efficiently retain locked cache lines on a bus-based chip multi-processor architecture. We also provide deterministic migration delay bounds that help the scheduler decide which migration technique(s) to utilize to relocate a single or multiple tasks. This information also allows the scheduler to determine feasibility of task migrations, which is critical for the safety of any hard real-time system. Such proactive migration of locked cache lines in multi-cores is unprecedented to our knowledge.}, booktitle={LCTES 11: Proceedings of the ACM Sigplan/Sigbed 2011 Conference on Languages, Complilers, Tools and Theory for Embedded Systems}, author={Sarkar, A. and Mueller, F. 
and Ramaprasad, H.}, year={2011}, pages={131–140} } @article{wu_mueller_2011, title={ScalaExtrap: Trace-Based Communication Extrapolation for SPMD Programs}, volume={46}, ISSN={["1558-1160"]}, DOI={10.1145/2038037.1941569}, abstractNote={Performance modeling for scientific applications is important for assessing potential application performance and systems procurement in high-performance computing (HPC). Recent progress on communication tracing opens up novel opportunities for communication modeling due to its lossless yet scalable trace collection. Estimating the impact of scaling on communication efficiency still remains non-trivial due to execution-time variations and exposure to hardware and software artifacts. This work contributes a fundamentally novel modeling scheme. We synthetically generate the application trace for large numbers of nodes by extrapolation from a set of smaller traces. We devise an innovative approach for topology extrapolation of single program, multiple data (SPMD) codes with stencil or mesh communication. The extrapolated trace can subsequently be (a) replayed to assess communication requirements before porting an application, (b) transformed to auto-generate communication benchmarks for various target platforms, and (c) analyzed to detect communication inefficiencies and scalability limitations. To the best of our knowledge, rapidly obtaining the communication behavior of parallel applications at arbitrary scale with the availability of timed replay, yet without actual execution of the application at this scale is without precedence and has the potential to enable otherwise infeasible system simulation at the exascale level.}, number={8}, journal={ACM SIGPLAN NOTICES}, author={Wu, Xing and Mueller, Frank}, year={2011}, month={Aug}, pages={113–122} } @article{marathe_thakkar_mueller_2010, title={Feedback-directed page placement for ccNUMA via hardware-generated memory traces}, volume={70}, ISSN={["1096-0848"]}, DOI={10.1016/j.jpdc.2010.08.015}, abstractNote={Non-uniform memory architectures with cache coherence (ccNUMA) are becoming increasingly common, not just for large-scale high performance platforms but also in the context of multi-core architectures. Under ccNUMA, data placement may influence overall application performance significantly as references resolved locally to a processor/core impose lower latencies than remote ones. This work develops a novel hardware-assisted page placement paradigm based on automated tracing of the memory references made by application threads. Two placement schemes, modeling both single-level and multi-level latencies, allocate pages near processors that most frequently access that memory page. These schemes leverage performance monitoring capabilities of contemporary microprocessors to efficiently extract an approximate trace of memory accesses. This information is used to decide page affinity, i.e., the node to which the page is bound. The method operates entirely in user space, is widely automated, and handles not only static but also dynamic memory allocation. Experiments show that this method, although based on lossy tracing, can efficiently and effectively improve page placement, leading to an average wall-clock execution time saving of over 20% for the tested benchmarks on the SGI Altix with a 2x remote access penalty and 12% on AMD Opterons with a 1.3–2.0x access penalty. 
This is accompanied by a one-time tracing overhead of 2.7% over the overall original program wallclock time.}, number={12}, journal={JOURNAL OF PARALLEL AND DISTRIBUTED COMPUTING}, author={Marathe, Jaydeep and Thakkar, Vivek and Mueller, Frank}, year={2010}, month={Dec}, pages={1204–1219} } @article{mohan_mueller_root_hawkins_healy_whalley_vivancos_2010, title={Parametric Timing Analysis and Its Application to Dynamic Voltage Scaling}, volume={10}, ISSN={["1558-3465"]}, DOI={10.1145/1880050.1880061}, abstractNote={Embedded systems with real-time constraints depend on a priori knowledge of worst-case execution times (WCETs) to determine if tasks meet deadlines. Static timing analysis derives bounds on WCETs but requires statically known loop bounds.}, number={2}, journal={ACM TRANSACTIONS ON EMBEDDED COMPUTING SYSTEMS}, author={Mohan, Sibin and Mueller, Frank and Root, Michael and Hawkins, William and Healy, Christopher and Whalley, David and Vivancos, Emilio}, year={2010}, month={Dec} } @article{ramaprasad_mueller_2010, title={Tightening the Bounds on Feasible Preemptions}, volume={10}, ISSN={["1558-3465"]}, DOI={10.1145/1880050.1880063}, abstractNote={Data caches are an increasingly important architectural feature in most modern computer systems. They help bridge the gap between processor speeds and memory access times. One inherent difficulty of using data caches in a real-time system is the unpredictability of memory accesses, which makes it difficult to calculate worst-case execution times (WCETs) of real-time tasks.}, number={2}, journal={ACM TRANSACTIONS ON EMBEDDED COMPUTING SYSTEMS}, author={Ramaprasad, Harini and Mueller, Frank}, year={2010}, month={Dec} } @article{scott_engelmann_vallee_naughton_tikotekar_ostrouchov_leangsuksun_naksinehaboon_nassar_paun_et al._2009, title={A Tunable Holistic Resiliency Approach for High-Performance Computing Systems}, volume={44}, ISSN={["1558-1160"]}, DOI={10.1145/1594835.1504227}, abstractNote={In order to address anticipated high failure rates, resiliency characteristics have become an urgent priority for next-generation extreme-scale high-performance computing (HPC) systems. This poster describes our past and ongoing efforts in novel fault resilience technologies for HPC. Presented work includes proactive fault resilience techniques, system and application reliability models and analyses, failure prediction, transparent process- and virtual-machine-level migration, and trade-off models for combining preemptive migration with checkpoint/restart. This poster summarizes our work and puts all individual technologies into context with a proposed holistic fault resilience framework.}, number={4}, journal={ACM SIGPLAN NOTICES}, author={Scott, Stephen L. and Engelmann, Christian and Vallee, Geoffroy R. and Naughton, Thomas and Tikotekar, Anand and Ostrouchov, George and Leangsuksun, Chokchai and Naksinehaboon, Nichamon and Nassar, Raja and Paun, Mihaela and et al.}, year={2009}, month={Apr}, pages={305–306} } @article{wang_zhang_ma_vazhkudai_mueller_2009, title={Improving the availability of supercomputer job input data using temporal replication}, volume={23}, ISSN={1865-2034 1865-2042}, url={http://dx.doi.org/10.1007/s00450-009-0082-8}, DOI={10.1007/s00450-009-0082-8}, abstractNote={Storage systems in supercomputers are a major reason for service interruptions. 
RAID solutions alone cannot provide sufficient protection as 1) growing average disk recovery times make RAID groups increasingly vulnerable to disk failures during reconstruction, and 2) RAID does not help with higher-level faults such as failed I/O nodes. This paper presents a complementary approach based on the observation that files in the supercomputer scratch space are typically accessed by batch jobs whose execution can be anticipated. Therefore, we propose to transparently, selectively, and temporarily replicate “active” job input data by coordinating the parallel file system with the batch job scheduler. We have implemented the temporal replication scheme in the popular Lustre parallel file system and evaluated it with real-cluster experiments. Our results show that the scheme allows for fast online data reconstruction, with a reasonably low overall space and I/O bandwidth overhead.}, number={3-4}, journal={Computer Science - Research and Development}, publisher={Springer Science and Business Media LLC}, author={Wang, Chao and Zhang, Zhe and Ma, Xiaosong and Vazhkudai, Sudharshan S. and Mueller, Frank}, year={2009}, month={May}, pages={149–157} } @article{sarkar_mueller_ramaprasad_mohan_2009, title={Push-Assisted Migration of Real-Time Tasks in Multi-Core Processors}, volume={44}, ISSN={["1558-1160"]}, DOI={10.1145/1543136.1542464}, abstractNote={Multicores are becoming ubiquitous, not only in general-purpose but also in embedded computing. This trend is a reflection of contemporary embedded applications posing steadily increasing demands on processing power. On such platforms, prediction of timing behavior to ensure that deadlines of real-time tasks can be met is becoming increasingly difficult. While real-time multicore scheduling approaches help to assure deadlines based on firm theoretical properties, their reliance on task migration poses a significant challenge to timing predictability in practice. Task migration actually (a) reduces timing predictability for contemporary multicores due to cache warm-up overheads while (b) increasing traffic on the network-on-chip (NoC) interconnect.}, number={7}, journal={ACM SIGPLAN NOTICES}, author={Sarkar, Abhik and Mueller, Frank and Ramaprasad, Harini and Mohan, Sibin}, year={2009}, month={Jul}, pages={80–89} } @article{noeth_ratn_mueller_schulz_supinski_2009, title={ScalaTrace: Scalable compression and replay of communication traces for high-performance computing}, volume={69}, ISSN={["1096-0848"]}, DOI={10.1016/j.jpdc.2008.09.001}, abstractNote={Characterizing the communication behavior of large-scale applications is a difficult and costly task due to code/system complexity and long execution times. While many tools to study this behavior have been developed, these approaches either aggregate information in a lossy way through high-level statistics or produce huge trace files that are hard to handle. We contribute an approach that provides orders of magnitude smaller, if not near-constant size, communication traces regardless of the number of nodes while preserving structural information. We introduce intra- and inter-node compression techniques of MPI events that are capable of extracting an application’s communication structure. We further present a replay mechanism for the traces generated by our approach and discuss results of our implementation for BlueGene/L. Given this novel capability, we discuss its impact on communication tuning and beyond.
To the best of our knowledge, such a concise representation of MPI traces in a scalable manner combined with deterministic MPI call replay is without any precedent.}, number={8}, journal={JOURNAL OF PARALLEL AND DISTRIBUTED COMPUTING}, author={Noeth, Michael and Ratn, Prasun and Mueller, Frank and Schulz, Martin and Supinski, Bronis R.}, year={2009}, month={Aug}, pages={696–710} } @article{zhu_mueller_2008, title={Exploiting synchronous and asynchronous DVS for feedback EDF scheduling on an embedded platform}, volume={7}, ISSN={["1558-3465"]}, DOI={10.1145/1324969.1324972}, abstractNote={Contemporary processors support dynamic voltage scaling (DVS) to reduce power consumption by varying processor voltage/frequency dynamically. We develop power-aware feedback-DVS algorithms for hard real-time systems that adapt to dynamically changing workloads. The algorithms lower execution speed while guaranteeing timing constraints. We study energy consumption for synchronous and asynchronous DVS switching on a PowerPC board. Energy, measured via data acquisition, is reduced up to 70% over naïve DVS for our feedback scheme with 24% peak savings over previous algorithms. These results, albeit differing in quantity, confirm trends observed under simulation. They are the first of their kind on an embedded board.}, number={1}, journal={ACM TRANSACTIONS ON EMBEDDED COMPUTING SYSTEMS}, author={Zhu, Yifan and Mueller, Frank}, year={2008} } @misc{wilhelm_engblom_ermedahl_holsti_thesing_whalley_bernat_ferdinand_heckmann_mitra_et al._2008, title={The worst-case execution-time problem - Overview of methods and survey of tools}, volume={7}, number={3}, journal={ACM Transactions on Embedded Computing Systems}, author={Wilhelm, R. and Engblom, J. and Ermedahl, A. and Holsti, N. and Thesing, S. and Whalley, D. and Bernat, G. and Ferdinand, C. and Heckmann, R. and Mitra, T. and et al.}, year={2008} } @article{zhu_mueller_2007, title={DVSleak: Combining leakage reduction and voltage scaling in feedback EDF scheduling}, volume={42}, ISSN={["1558-1160"]}, DOI={10.1145/1273444.1254772}, abstractNote={Recent trends in CMOS fabrication have increased the demand to conserve processor power. While dynamic voltage scaling (DVS) is effective in reducing dynamic power, microprocessors produced in ever smaller fabrication processes are increasingly dominated by static power. For such processors, voltage/frequency pairs below a critical speed result in higher energy per cycle than entering a processor sleep mode. Yet, computational demand above this critical speed is best met by DVS techniques while still conserving power.}, number={7}, journal={ACM SIGPLAN NOTICES}, author={Zhu, Yifan and Mueller, Frank}, year={2007}, month={Jul}, pages={31–40} } @article{coffman_healy_mueller_whalley_2007, title={Generalizing parametric timing analysis}, volume={42}, ISSN={["1558-1160"]}, DOI={10.1145/1273444.1254795}, abstractNote={In the design of real-time and embedded systems, it is important to establish a bound on the worst-case execution time (WCET) of programs to assure via schedulability analysis that deadlines are not missed. Static WCET analysis is performed by a timing analysis tool. This paper describes novel improvements to such a tool, allowing parametric timing analysis to be performed. Parametric timing analyzers receive an upper bound on the number of loop iterations in terms of an expression which is used to create a parametric formula.
This parametric formula is later evaluated to determine the WCET based on input values only known at runtime. Effecting a transformation from a numeric to a parametric timing analyzer requires two innovations: 1) a summation solver capable of summing non-constant expressions and 2) a polynomial data structure which can replace integers as the basis for all calculations. Both additions permit other methods of analysis (e.g., caching, pipeline, constraint) to occur simultaneously. Combining these techniques allows our tool to statically bound the WCET for a larger class of benchmarks.}, number={7}, journal={ACM SIGPLAN NOTICES}, author={Coffman, Joel and Healy, Christopher and Mueller, Frank and Whalley, David}, year={2007}, month={Jul}, pages={152–154} } @article{marathe_mueller_mohan_mckee_de supinski_yoo_2007, title={METRIC: Memory tracing via dynamic binary rewriting to identify cache inefficiencies}, volume={29}, ISSN={["1558-4593"]}, DOI={10.1145/1216374.1216380}, abstractNote={With the diverging improvements in CPU speeds and memory access latencies, detecting and removing memory access bottlenecks becomes increasingly important. In this work we present METRIC, a software framework for isolating and understanding such bottlenecks using partial access traces. METRIC extracts access traces from executing programs without special compiler or linker support. We make four primary contributions. First, we present a framework for extracting partial access traces based on dynamic binary rewriting of the executing application. Second, we introduce a novel algorithm for compressing these traces. The algorithm generates constant-space representations for regular accesses occurring in nested loop structures. Third, we use these traces for offline incremental memory hierarchy simulation. We extract symbolic information from the application executable and use this to generate detailed source-code-correlated statistics including per-reference metrics, cache evictor information, and stream metrics. Finally, we demonstrate how this information can be used to isolate and understand memory access inefficiencies. This illustrates a potential advantage of METRIC over compile-time analysis for sample codes, particularly when interprocedural analysis is required.}, number={2}, journal={ACM TRANSACTIONS ON PROGRAMMING LANGUAGES AND SYSTEMS}, author={Marathe, Jaydeep and Mueller, Frank and Mohan, Tushar and McKee, Sally A. and De Supinski, Bronis R. and Yoo, Andy}, year={2007}, month={Mar} } @article{marathe_mueller_2007, title={Source-code-correlated cache coherence characterization of OpenMP benchmarks}, volume={18}, ISSN={["1558-2183"]}, DOI={10.1109/TPDS.2007.1058}, abstractNote={Cache coherence in shared-memory multiprocessor systems has been studied mostly from an architecture viewpoint, often by means of aggregating metrics. In many cases, aggregate events provide insufficient information for programmers to understand and optimize the coherence behavior of their applications. A better understanding would be given by source code correlations of not only aggregate events, but also finer granularity metrics directly linked to high-level source code constructs, such as source lines and data structures. In this paper, we explore a novel application-centric approach to studying coherence traffic. We develop a coherence analysis framework based on incremental coherence simulation of actual reference traces.
We provide tool support to extract these reference traces and synchronization information from OpenMP threads at runtime using dynamic binary rewriting of the application executable. These traces are fed to ccSIM, our cache-coherence simulator. The novelty of ccSIM lies in its ability to relate low-level cache coherence metrics (such as coherence misses and their causative invalidations) to high-level source code constructs including source code locations and data structures. We explore the degree of freedom in interleaving data traces from different processors and assess simulation accuracy in comparison to metrics obtained from hardware performance counters. Our quantitative results show that: 1) Cache coherence traffic can be simulated with a considerable degree of accuracy for SPMD programs, as the invalidation traffic closely matches the corresponding hardware performance counters. 2) Detailed, high-level coherence statistics are very useful in detecting, isolating, and understanding coherence bottlenecks. We use ccSIM with several well-known benchmarks and find coherence optimization opportunities leading to significant reductions in coherence traffic and savings in wall-clock execution time}, number={6}, journal={IEEE TRANSACTIONS ON PARALLEL AND DISTRIBUTED SYSTEMS}, author={Marathe, Jaydeep and Mueller, Frank}, year={2007}, month={Jun}, pages={818–834} } @article{seth_anantaraman_mueller_rotenberg_2006, title={FAST: Frequency-Aware Static Timing Analysis}, volume={5}, number={1}, journal={ACM Transactions on Programming Languages and Systems}, author={Seth, K. and Anantaraman, A. and Mueller, F. and Rotenberg, E.}, year={2006}, pages={200–224} } @article{zhao_kreahling_whalley_healy_mueller_2006, title={Improving WCET by applying worst-case path optimizations}, volume={34}, ISSN={["1573-1383"]}, DOI={10.1007/s11241-006-8643-4}, abstractNote={It is advantageous to perform compiler optimizations that attempt to lower the worst-case execution time (WCET) of an embedded application since tasks with lower WCETs are easier to schedule and more likely to meet their deadlines. Compiler writers in recent years have used profile information to detect the frequently executed paths in a program and there has been considerable effort to develop compiler optimizations to improve these paths in order to reduce the average-case execution time (ACET). In this paper, we describe an approach to reduce the WCET by adapting and applying optimizations designed for frequent paths to the worst-case (WC) paths in an application. Instead of profiling to find the frequent paths, our WCET path optimization uses feedback from a timing analyzer to detect the WC paths in a function. Since these path-based optimizations may increase code size, the subsequent effects on the WCET due to these optimizations are measured to ensure that the worst-case path optimizations actually improve the WCET before committing to a code size increase. 
We evaluate these WC path optimizations and present results showing the decrease in WCET versus the increase in code size.}, number={2}, journal={REAL-TIME SYSTEMS}, author={Zhao, Wankang and Kreahling, William and Whalley, David and Healy, Christopher and Mueller, Frank}, year={2006}, month={Oct}, pages={129–152} } @article{zhu_mueller_2005, title={Feedback EDF scheduling exploiting hardware-assisted asynchronous dynamic voltage scaling}, volume={40}, ISSN={["1558-1160"]}, DOI={10.1145/1070891.1065939}, abstractNote={Recent processor support for dynamic frequency and voltage scaling (DVS) allows software to affect power consumption by varying execution frequency and supply voltage on the fly. However, processors generally enter a sleep state while transitioning between frequencies/voltages. In this paper, we examine the merits of hardware/software co-design for a feedback DVS algorithm and a novel processor capable of executing instructions during frequency/voltage transitions. We study several power-aware feedback schemes based on earliest-deadline-first (EDF) scheduling that adjust the system behavior dynamically for different workload characteristics. An infrastructure for investigating several hard real-time DVS schemes, including our feedback DVS algorithm, is implemented on an IBM PowerPC 405LP embedded board. Architecture and algorithm overhead is assessed for different DVS schemes. Measurements on the experimentation board provide a quantitative assessment of the potential of energy savings for DVS algorithms as opposed to prior simulation work that could only provide trends. Energy consumption, measured through a data acquisition board, indicates a considerable potential for real-time DVS scheduling algorithms to lower energy up to 64% over the naïve DVS scheme. Our feedback DVS algorithm saves at least as much and often considerably more energy than previous DVS algorithms with peak savings of an additional 24% energy reduction. To the best of our knowledge, this is the first comparative study of real-time DVS algorithms on a concrete micro-architecture and the first evaluation of asynchronous DVS switching.}, number={7}, journal={ACM SIGPLAN NOTICES}, author={Zhu, YF and Mueller, F}, year={2005}, month={Jul}, pages={203–212} } @article{zhu_mueller_2005, title={Feedback EDF scheduling of real-time tasks exploiting dynamic voltage scaling}, volume={31}, ISSN={["1573-1383"]}, DOI={10.1007/s11241-005-2744-3}, abstractNote={Many embedded systems are constrained by limits on power consumption, which are reflected in the design and implementation for conserving their energy utilization. Dynamic voltage scaling (DVS) has become a promising method for embedded systems to exploit multiple voltage and frequency levels and to prolong their battery life. However, pure DVS techniques do not perform well for systems with dynamic workloads where the job execution times vary significantly. In this paper, we present a novel approach combining feedback control with DVS schemes targeting hard real-time systems with dynamic workloads. Our method relies strictly on operating system support by integrating a DVS scheduler and a feedback controller within the earliest-deadline-first (EDF) scheduling algorithm. Each task is divided into two portions. The objective within the first portion is to exploit frequency scaling for the average execution time. Static and dynamic slack is accumulated for each task with slack-passing and preemption handling schemes. 
The objective within the second portion is to meet the hard real-time deadline requirements up to the worst-case execution time following a last-chance approach. Feedback control techniques make the system capable of selecting the right frequency and voltage settings for the first portion, as well as guaranteeing hard real-time requirements for the overall task. A feedback control model is given to describe our feedback DVS scheduler, which is used to analyze the system's stability. Simulation experiments demonstrate the ability of our algorithm to save up to 29% more energy than previous work for task sets with different dynamic workload characteristics.}, number={1-3}, journal={REAL-TIME SYSTEMS}, author={Zhu, YF and Mueller, F}, year={2005}, month={Dec}, pages={33–63} } @article{patil_seth_mueller_2004, title={Compositional static instruction cache simulation}, volume={39}, ISSN={["1558-1160"]}, DOI={10.1145/998300.997183}, abstractNote={Scheduling in hard real-time systems requires a priori knowledge of worst-case execution times (WCET). Obtaining the WCET of a task is a difficult problem. Static timing analysis techniques approach this problem via path analysis, pipeline simulation and cache simulation to derive safe WCET bounds. But such analysis has traditionally been constrained to only small programs due to the complexity of simulation, most notably the complexity of static cache simulation, which requires inter-procedural analysis.This paper describes a novel approach of compositional static cache simulation that alleviates the complexity problem, thereby making static timing analysis feasible for much larger programs than in the past. Specifically, a framework is contributed that facilitates static cache analysis by splitting it into two steps, a module-level analysis and a compositional phase, thus addressing the issue of complexity of inter-procedural analysis for an entire program. The module-level analysis parameterizes the data-flow information in terms of potential evictions from cache due to calls containing conflicting references. The compositional analysis stage uses the result of the parameterized data-flow for each module. Thus, the emphasis here is on handling most of the complexity in the module-level analysis and performing as little analysis as possible at the compositional level. The experimental results for direct-mapped instruction caches show that the compositional analysis framework outperforms prior analysis methods for larger programs by one to two orders of magnitude, depending on the reference for comparison, while providing equally accurate predictions. This novel approach to static cache analysis provides a promising solution to the complexity problem in timing analysis, which, for the first time, makes the analysis of larger programs feasible.}, number={7}, journal={ACM SIGPLAN NOTICES}, author={Patil, K and Seth, K and Mueller, F}, year={2004}, month={Jul}, pages={136–145} } @article{anantaraman_seth_rotenberg_mueller_2004, title={Enforcing safety of real-time schedules on contemporary processors using a virtual simple architecture (VISA)}, ISBN={["0-7695-2247-5"]}, ISSN={["1052-8725"]}, DOI={10.1109/real.2004.19}, abstractNote={Determining safe and tight upper bounds on the worst-case execution time (WCET) of hard real-time tasks running on contemporary microarchitectures is a difficult problem. 
Current trends in microarchitecture design have created a complexity wall: by enhancing performance through ever more complex architectural features, systems have become increasingly hard to analyze. This paper extends a framework, introduced previously as virtual simple architecture (VISA), to multitasking real-time systems. The objective of VISA is to obviate the need to statically analyze complex processors by instead shifting the burden of guaranteeing deadlines - in part - onto the hardware. The VISA framework exploits a complex processor that ordinarily operates with all of its advanced features enabled, called the complex mode, but which can also be downgraded to a simple mode by gating off the advanced features. A WCET bound is statically derived for a task assuming the simple mode. However, this abstraction is speculatively undermined at run-time by executing the task in the complex mode. The task's progress is continuously gauged to detect anomalous cases in which the complex mode underperforms, in which case the processor switches to the simple mode to explicitly enforce the overall contractual WCET. The processor typically operates in complex mode, generating significant slack, and the VISA safety mechanism ensures bounded timing in atypical cases. Extra slack can be exploited for reducing power consumption and/or enhancing functionality. By extending VISA from single-task to multi-tasking systems, this paper reveals the full extent of VISA'S powerful abstraction capability. Key missing pieces are filled in: (1) preserving integrity of the gauging mechanism despite disruptions caused by preemptions; (2) demonstrating compatibility with arbitrary scheduling and dynamic voltage scaling (DVS) policies; (3) formally describing VISA speculation overheads in terms of padding tasks' WCETs; and (4) developing a systematic method for minimizing these overheads. We also propose a VISA variant that dynamically accrues the slack needed to facilitate speculation in the complex mode, eliminating the need to statically pad WCETs and thereby enabling VISA-style speculation even in highly-utilized systems.}, journal={25TH IEEE INTERNATIONAL REAL-TIME SYSTEMS SYMPOSIUM, PROCEEDINGS}, author={Anantaraman, A and Seth, K and Rotenberg, E and Mueller, F}, year={2004}, pages={114–125} } @article{desai_mueller_2004, title={Scalable hierarchical locking for distributed systems}, volume={64}, ISSN={["1096-0848"]}, DOI={10.1016/j.jpdc.2003.10.001}, abstractNote={Middleware components are becoming increasingly important as applications share computational resources in distributed environments, such as high-end clusters with ever larger number of processors, computational grids and increasingly large server farms. One of the main challenges in such environments is to achieve scalability of synchronization. In general, concurrency services arbitrate resource requests in distributed systems. But concurrency protocols currently lack scalability. Adding such guarantees enables resource sharing and computing with distributed objects in systems with a large number of nodes. The objective of our work is to enhance middleware services to provide scalability of synchronization and to support state replication in distributed systems. We have designed and implemented a middleware protocol in support of these objectives. Its essence is a peer-to-peer protocol for multi-mode hierarchical locking, which is applicable to transaction-style processing and distributed agreement. 
We demonstrate high scalability combined with low response times in high-performance cluster environments. Our technical contribution is a novel, fully decentralized, hierarchical locking protocol to enhance concurrency in distributed resource allocation following the specification of general concurrency services for large-scale data and object repositories. Our experiments on an IBM SP show that the number of messages approaches an asymptote at 15 nodes, from which point on the message overhead is on the order of 3-9 messages per request, depending on system parameters. At the same time, response times increase linearly with a proportional increase in requests and, consequently, higher concurrency levels. Specifically, in the range of up to 80 nodes, response times under 10 ms are observed for critical sections that are one 25th the size of noncritical code. The high degree of scalability and responsiveness of our protocol is due in large part to a high level of concurrency upon resolving requests combined with dynamic path compression for request propagation paths. Our approach is not only applicable to CORBA; its principles are shown to provide benefits to general distributed concurrency services and transaction models. Besides its technical strengths, our approach is intriguing due to its simplicity and its wide applicability, ranging from large-scale clusters to server-style computing.}, number={6}, journal={JOURNAL OF PARALLEL AND DISTRIBUTED COMPUTING}, author={Desai, N and Mueller, F}, year={2004}, month={Jun}, pages={708–724} } @article{vetter_mueller_2003, title={Communication characteristics of large-scale scientific applications for contemporary cluster architectures}, volume={63}, ISSN={["1096-0848"]}, DOI={10.1016/S0743-7315(03)00104-7}, abstractNote={This paper examines the explicit communication characteristics of several sophisticated scientific applications, which, by themselves, constitute a representative suite of publicly available benchmarks for large cluster architectures. By focusing on the message passing interface (MPI) and by using hardware counters on the microprocessor, we observe each application's inherent behavioral characteristics: point-to-point and collective communication, and floating-point operations. Furthermore, we explore the sensitivities of these characteristics to both problem size and number of processors. Our analysis reveals several striking similarities across our diverse set of applications including the use of collective operations, especially those collectives with very small data payloads. We also highlight a trend of novel applications parting with regimented, static communication patterns in favor of dynamically evolving patterns, as evidenced by our experiments on applications that use implicit linear solvers and adaptive mesh refinement. Overall, our study contributes a better understanding of the requirements of current and emerging paradigms of scientific computing in terms of their computation and communication demands.}, number={9}, journal={JOURNAL OF PARALLEL AND DISTRIBUTED COMPUTING}, author={Vetter, JS and Mueller, F}, year={2003}, month={Sep}, pages={853–865} } @article{seth_anantaraman_mueller_rotenberg_2003, title={FAST: Frequency-aware static timing analysis}, ISBN={["0-7695-2044-8"]}, DOI={10.1109/real.2003.1253252}, abstractNote={Power is a valuable resource in embedded systems as the lifetime of many such systems is constrained by their battery capacity.
Recent advances in processor design have added support for dynamic frequency/voltage scaling (DVS) for saving power. Recent work on real-time scheduling focuses on saving power in static as well as dynamic scheduling environments by exploiting idle and slack due to early task completion for DVS of subsequent tasks. These scheduling algorithms rely on a priori knowledge of worst-case execution times (WCET) for each task. They assume that DVS has no effect on the worst-case execution cycles (WCEC) of a task and scale the WCET according to the processor frequency. However, for systems with memory hierarchies, the WCEC typically does change under DVS due to frequency modulation. Hence, current assumptions used by DVS schemes result in a highly exaggerated WCET. This paper contributes novel techniques for tight and flexible static timing analysis particularly well-suited for dynamic scheduling schemes. The technical contributions are as follows: (1) We assess the problem of changing execution cycles due to scaling techniques. (2) We propose a parametric approach towards bounding the WCET statically with respect to the frequency. Using a parametric model, we can capture the effect of changes in frequency on the WCEC and, thus, accurately model the WCET over any frequency range. (3) We discuss design and implementation of the frequency-aware static timing analysis (FAST) tool based on our prior experience with static timing analysis. (4) We demonstrate in experiments that our FAST tool provides safe upper bounds on the WCET, which are tight. The FAST tool allows us to capture the WCET of six benchmarks using equations that overestimate the WCET by less than 1%. FAST equations can also be used to improve existing DVS scheduling schemes to ensure that the effect of frequency scaling on WCET is considered and that the WCET used is not exaggerated. (5) We leverage three DVS scheduling schemes by incorporating FAST into them and by showing that the power consumption further decreases. To the best of our knowledge, this study of DVS effects on timing analysis is unprecedented.}, journal={RTSS 2003: 24TH IEEE INTERNATIONAL REAL-TIME SYSTEMS SYMPOSIUM, PROCEEDINGS}, author={Seth, K and Anantaraman, A and Mueller, F and Rotenberg, E}, year={2003}, pages={40–51} } @inproceedings{anantaraman_seth_patil_rotenberg_f. mueller_2003, title={Virtual Simple Architecture (VISA): Exceeding the complexity limit in safe real-time systems}, ISBN={1880843374}, booktitle={Computers and Their Applications: Proceedings of the ISCA 16th International Conference, Seattle, Washington, USA, March 28-30, 2001}, publisher={Cary, NC: ISCA}, author={Anantaraman, A. and Seth, K. and Patil, K. and Rotenberg, E. and Mueller, F.}, year={2003}, pages={350–361} } @article{dudani_mueller_zhu_2002, title={Energy-conserving feedback EDF scheduling for embedded systems with real-time constraints}, volume={37}, ISSN={["1558-1160"]}, DOI={10.1145/566225.513865}, abstractNote={Embedded systems have limited energy resources. Hence, they should conserve these resources to extend their period of operation. Recently, dynamic frequency scaling (DFS) and dynamic voltage scaling (DVS) have been added to various embedded processors as a means to increase battery life. A number of scheduling techniques have been developed to exploit DFS and DVS for real-time systems to reduce energy consumption. These techniques exploit idle and slack time of a schedule.
Idle time can be consumed by lowering the processor frequency of selected tasks while slack time allows later tasks to execute at lower frequencies with reduced voltage demands. Our work delivers energy savings beyond the level of prior work. We enhance earliest-deadline-first (EDF) scheduling to exploit slack time generated by the invocation of the task at multiple frequency levels within the same invocation. The technique relies strictly on operating system support within the scheduler to implement the approach. Early scaling at a low frequency, determined by a feedback mechanism and facilitated by a slack-passing scheme, capitalizes on the high probability of a task finishing its execution without utilizing its worst-case execution budget. If a task does not complete at a certain point in time within its low-frequency range, the remainder of it continues to execute at a higher frequency. Our experiments demonstrate that the resulting energy savings exceed those of previously published work by up to 33%. In addition, our method only adds a constant complexity at each scheduling point, which has not been achieved by prior work, to the best of our knowledge.}, number={7}, journal={ACM SIGPLAN NOTICES}, author={Dudani, A and Mueller, F and Zhu, YF}, year={2002}, month={Jul}, pages={213–222} } @article{unger_mueller_2002, title={Handling irreducible loops: Optimized node splitting versus DJ-Graphs}, volume={24}, ISSN={["1558-4593"]}, DOI={10.1145/567097.567098}, abstractNote={This paper addresses the question of how to handle irreducible regions during optimization, which has become even more relevant for contemporary processors since recent VLIW-like architectures rely heavily on instruction scheduling. The contributions of this paper are twofold. First, a method of optimized node splitting to transform irreducible regions of control flow into reducible regions is formally defined and its correctness is shown. This method is superior to approaches previously published since it reduces the number of replicated nodes by comparison. Second, three methods that handle regions of irreducible control flow are evaluated with respect to their impact on compiler optimizations. First, traditional node splitting is evaluated. Second, optimized node splitting is implemented. Third, DJ-Graphs are utilized to recognize nesting of irreducible (and reducible) loops and apply common loop optimizations extended for irreducible loops. Experiments compare the performance of these approaches with unrecognized irreducible loops that cannot be subject to loop optimizations, which is typical for contemporary compilers. Measurements show improvements of 1 to 40% for these methods of handling irreducible loops over the unoptimized case. Optimized node splitting may be chosen to retrofit existing compilers since it has the advantage that it only requires a few changes to an optimizing compiler while limiting the code growth of compiled programs compared to traditional node splitting. Recognizing loops via DJ-Graphs should be chosen for new compiler developments since it requires more changes to the optimizer but does not significantly change the code size of compiled programs while yielding comparable improvements.
Handling irreducible loops should yield even more benefits for exploiting instruction-level parallelism of modern architectures in the context of global instruction scheduling and optimization techniques that may introduce irreducible loops, such as enhanced modulo scheduling.}, number={4}, journal={ACM TRANSACTIONS ON PROGRAMMING LANGUAGES AND SYSTEMS}, author={Unger, S and Mueller, F}, year={2002}, month={Jul}, pages={299–333} } @article{wegener_mueller_2001, title={A comparison of static analysis and evolutionary testing for the verification of timing constraints}, volume={21}, ISSN={["1573-1383"]}, DOI={10.1023/A:1011132221066}, number={3}, journal={REAL-TIME SYSTEMS}, author={Wegener, J and Mueller, F}, year={2001}, month={Nov}, pages={241–268} } @book{hips_2001_proceedings, title={High-Level Parallel Programming Models and Supportive Environments: 6th International Workshop, HIPS 2001, San Francisco, CA, USA, April 23, 2001: Proceedings}, publisher={New York: Springer}, year={2001} }