% Journal article: IEEE TCAD, volume 37, issue 11 (November 2018).
@article{qian_mueller_2018,
  author       = {Qian, Tao and Mueller, Frank},
  title        = {A Failure Recovery Protocol for Software-Defined Real-Time Networks},
  journal      = {{IEEE} Transactions on Computer-Aided Design of Integrated Circuits and Systems},
  volume       = {37},
  number       = {11},
  pages        = {2222--2232},
  year         = {2018},
  month        = nov,
  issn         = {1937-4151},
  doi          = {10.1109/TCAD.2018.2857299},
  abstractNote = {In a distributed computing environment, real-time tasks communicate via a network infrastructure whose stability significantly impacts timing predictability. Network stability includes two aspects. First, the network has to guarantee the deadline requirements of real-time message transmissions in the absence of network failures. Second, the network needs to support dynamic recovery when network failures occur. This paper generalizes previous static routing approaches, which address the first aspect of the network stability, by developing a dynamic failure recovery policy and a protocol to address the second aspect of the network stability. We derive new real-time forwarding paths without compromising the capability of network devices to guarantee deadlines of concurrent real-time transmissions. We implement this mechanism on a network simulation platform and evaluate it on real hardware in a local cluster to demonstrate its feasibility and effectiveness. Experiments confirm the ability to bound recovery delays based on the network parameters.},
}

% Conference paper: 2016 IEEE PES General Meeting.
% Given names for Xu and Zhang are not available in this file; initials kept.
@inproceedings{qian_xu_zhang_chakrabortty_mueller_xin_2016,
  author       = {Qian, Tao and Xu, H. and Zhang, J. H. and Chakrabortty, Aranya and Mueller, Frank and Xin, Yufeng},
  title        = {A resilient software infrastructure for wide-area measurement systems},
  booktitle    = {2016 {IEEE} Power and Energy Society General Meeting ({PESGM})},
  year         = {2016},
  doi          = {10.1109/pesgm.2016.7741949},
  abstractNote = {To support the scalability and resilience requirements of distributed Wide-Area Measurement System (WAMS) architectures, we design and implement a software infrastructure to estimate power grid oscillation modes based on real-time data collected from Phasor Measurement Units (PMUs). This estimation algorithm can be deployed on a hierarchical structure of Phasor Data Concentrators (PDCs), which calculate local estimates and communicate with each other to calculate the global estimate. This work contributes a resilient system to WAMS with guarantees for (1) Quality of Service in network delay, (2) network failure tolerance, and (3) self-recoverability. The core component of the infrastructure is a distributed storage system. Externally, the storage system provides a cloud data lookup service with bounded response times and resilience, which decouples the data communication between PMUs, PDCs, and power-grid monitor/control applications. Internally, the storage system organizes PDCs as storage nodes and employs a real-time task scheduler to order data lookup requests so that urgent requests can be served earlier. To demonstrate the resilience of our distributed system, we deploy the system on a (1) virtual platform and (2) bare-metal machines, where we run a distributed algorithm on the basis of the Prony algorithm and the Alternating Directions Method of Multipliers (ADMM) to estimate the electro-mechanical oscillation modes. We inject different failures into the system to study their impact on the estimation algorithm. Our experiments show that temporary failures of a PDC or a network link do not affect the estimation result since the historical PMU data are cached in the storage system and PDCs can obtain the data on demand.},
}

% Conference paper: ECRTS 2015. The venue is a proceedings volume, so it is
% recorded in booktitle rather than journal.
@inproceedings{qian_mueller_xin_2015,
  author       = {Qian, Tao and Mueller, Frank and Xin, Yufeng},
  title        = {Hybrid {EDF} Packet Scheduling for Real-Time Distributed Systems},
  booktitle    = {Proceedings of the 2015 27th {Euromicro} Conference on Real-Time Systems ({ECRTS} 2015)},
  pages        = {37--46},
  year         = {2015},
  issn         = {1068-3070},
  doi          = {10.1109/ecrts.2015.11},
  abstractNote = {When multiple computational resource elements collaborate to handle events in a cyber-physical system, scheduling algorithms on these resource elements and the communication delay between them contribute to the overall system utilization and schedulability. Employing earliest deadline first (EDF) scheduling in real-time cyber-physical systems has many challenges. First, the network layer of a resource has to interrupt and notify the scheduler about the deadlines of arrived messages. The randomness of interruption makes context switch costs unpredictable. Second, lack of globally synchronized clocks across resources renders event deadlines derived from local clocks and piggybacked in messages meaningless. Third, communication delay variances in a network increase the unpredictability of the system, e.g., when multiple resources transmit message bursts simultaneously. We address these challenges in this work. First, we combine EDF scheduling with periodic message transmission tasks. Then, we implement an EDF-based packet scheduler, which transmits packets considering event deadlines. Third, we employ bandwidth limitations on the transmission links of resources to decrease network contention and network delay variance. We have implemented our hybrid EDF scheduler in a real-time distributed storage system. We evaluate it on a cluster of nodes in a switched network environment resembling a distributed cyber-physical system to demonstrate the real-time capability of our scheduler.},
}

% Conference paper: RTCSA 2014.
@inproceedings{qian_mueller_xin_2014,
  author       = {Qian, Tao and Mueller, Frank and Xin, Yufeng},
  title        = {A real-time distributed hash table},
  booktitle    = {2014 {IEEE} 20th International Conference on Embedded and Real-Time Computing Systems and Applications ({RTCSA})},
  year         = {2014},
  doi          = {10.1109/rtcsa.2014.6910537},
  abstractNote = {Currently, the North American power grid uses a centralized system to monitor and control wide-area power grid states. This centralized architecture is becoming a bottleneck as large numbers of wind and photo-voltaic (PV) generation sources require real-time monitoring and actuation to ensure sustained reliability. We have designed and implemented a distributed storage system, a real-time distributed hash table (DHT), to store and retrieve this monitoring data as a real-time service to an upper layer decentralized control system. Our real-time DHT utilizes the DHT algorithm Chord in a cyclic executive to schedule data-lookup jobs on distributed storage nodes. We formally define the pattern of the workload on our real-time DHT and use queuing theory to stochastically derive the time bound for response times of these lookup requests. We also define the quality of service (QoS) metrics of our real-time DHT as the probability that deadlines of requests can be met. We use the stochastic model to derive the QoS. An experimental evaluation on distributed nodes shows that our model is well suited to provide time bounds for requests following typical workload patterns and that a prioritized extension can increase the probability of meeting deadlines for subsequent requests.},
}

% Conference paper: 2014 IEEE PES General Meeting (July 2014).
% The ampersand in the venue name is escaped so the field typesets under LaTeX.
@inproceedings{qian_chakrabortty_mueller_xin_2014,
  author       = {Qian, Tao and Chakrabortty, Aranya and Mueller, Frank and Xin, Yufeng},
  title        = {A real-time distributed storage system for multi-resolution virtual synchrophasor},
  booktitle    = {2014 {IEEE} {PES} General Meeting | Conference \& Exposition},
  publisher    = {IEEE},
  year         = {2014},
  month        = jul,
  isbn         = {9781479964154},
  doi          = {10.1109/pesgm.2014.6939832},
  abstractNote = {With the continuing large-scale deployment of Phasor Measurement Units (PMU), the Wide-Area Measurement System (WAMS) technology is envisioned to evolve towards a distributed architecture where multiple sets of distributed Phasor Data Concentrators (PDCs) collectively process PMU data to achieve real-time distributed intelligence. Emerging applications developed under this vision will pose stringent but heterogeneous real-time requirements on throughput, delay, and reliability performance of the underlying communication and computing infrastructure. To address this problem, we present a novel virtual PMU (vPMU) architecture that decomposes phasor samples into multiple resolution layers. For a particular receiver with a certain resolution requirement, a complete set of PMU data can be composed by combining samples from the lower layers, without the need for samples from higher layers. We design and implement a real-time distributed storage system to support the virtual PMU data communication. We extend the Chord algorithm so that the response time of data communication can be bounded by our storage system. In addition, we use queuing theory to analyze the response time of requests with our stochastic model.},
}