% Bibliography: publications by Xing Pan and co-authors, 2016-2021.
%
% Conventions used in this file:
%   - one field per line, fields ordered author, title, venue, volume,
%     pages, year, month, issn, doi, abstract;
%   - conference papers use the inproceedings type with "booktitle";
%   - acronyms and identifiers in titles/venues are brace-protected so
%     that sentence-casing styles keep their capitalisation;
%   - abstracts use ASCII/LaTeX quoting and escaped percent signs so they
%     are safe to typeset under classic BibTeX.

% Journal article (Journal of Systems Architecture, vol. 118).
@article{pan_mueller_2021,
  author   = {Pan, Xing and Mueller, Frank},
  title    = {{NUMA}-aware memory coloring for multicore real-time systems},
  journal  = {Journal of Systems Architecture},
  volume   = {118},
  year     = {2021},
  month    = sep,
  issn     = {1873-6165},
  doi      = {10.1016/j.sysarc.2021.102188},
  abstract = {Non-uniform memory access (NUMA) systems are characterized by varying memory latencies so that execution times may become unpredictable in a multicore real-time system. This results in overly conservative scheduling with low utilization due to loose bounds on a task's worst-case execution time (WCET). This work contributes a controller/node-aware memory coloring (CAMC) allocator inside the Linux kernel for the entire address space to reduce access conflicts and latencies by isolating tasks from one another. CAMC improves timing predictability and performance over Linux' buddy allocator and prior coloring methods. It provides core isolation with respect to banks and memory controllers for real-time systems. This work is the first to consider multiple memory controllers in real-time systems, combine them with bank coloring, and assess its performance on a NUMA architecture, to the best of our knowledge.},
}

% Conference paper (ISORC 2019). This entry keeps the original key
% "pan_mueller_2019", which was previously shared with the RTSS entry below.
@inproceedings{pan_mueller_2019,
  author    = {Pan, Xing and Mueller, Frank},
  title     = {The Colored Refresh Server for {DRAM}},
  booktitle = {2019 {IEEE} 22nd International Symposium on Real-Time Distributed Computing ({ISORC} 2019)},
  pages     = {27--34},
  year      = {2019},
  issn      = {1555-0885},
  doi       = {10.1109/ISORC.2019.00015},
  abstract  = {Bounding each task's worst-case execution time (WCET) accurately is essential for real-time systems to determine if all deadlines can be met. Yet, access latencies to Dynamic Random Access Memory (DRAM) vary significantly due to DRAM refresh, which blocks access to memory cells. Variations further increase as DRAM density grows. This work contributes the ``Colored Refresh Server'' (CRS), a uniprocessor scheduling paradigm that partitions DRAM in two distinctly colored groups such that refreshes of one color occur in parallel to the execution of real-time tasks of the other color. By executing tasks in phase with periodic DRAM refreshes with opposing colors, memory requests no longer suffer from refresh interference. Experimental results confirm that refresh overhead is completely hidden and memory throughput enhanced.},
}

% Conference paper (RTSS 2019). Same title and abstract as the ISORC entry
% above but a distinct DOI, venue and page range, so it is kept as a separate
% record. Its key was a duplicate of "pan_mueller_2019" (which made BibTeX
% drop this entry); it is now disambiguated with a "b" suffix.
@inproceedings{pan_mueller_2019b,
  author    = {Pan, Xing and Mueller, Frank},
  title     = {The Colored Refresh Server for {DRAM}},
  booktitle = {2019 {IEEE} 40th Real-Time Systems Symposium ({RTSS} 2019)},
  pages     = {146--153},
  year      = {2019},
  issn      = {1052-8725},
  doi       = {10.1109/RTSS46320.2019.00023},
  abstract  = {Bounding each task's worst-case execution time (WCET) accurately is essential for real-time systems to determine if all deadlines can be met. Yet, access latencies to Dynamic Random Access Memory (DRAM) vary significantly due to DRAM refresh, which blocks access to memory cells. Variations further increase as DRAM density grows. This work contributes the ``Colored Refresh Server'' (CRS), a uniprocessor scheduling paradigm that partitions DRAM in two distinctly colored groups such that refreshes of one color occur in parallel to the execution of real-time tasks of the other color. By executing tasks in phase with periodic DRAM refreshes with opposing colors, memory requests no longer suffer from refresh interference. Experimental results confirm that refresh overhead is completely hidden and memory throughput enhanced.},
}

% Conference paper (ACM Symposium on Applied Computing, 2018).
@inproceedings{pan_mueller_2018,
  author    = {Pan, Xing and Mueller, Frank},
  title     = {Controller-Aware Memory Coloring for Multicore Real-Time Systems},
  booktitle = {33rd Annual {ACM} Symposium on Applied Computing},
  pages     = {584--592},
  year      = {2018},
  doi       = {10.1145/3167132.3167196},
  abstract  = {Memory latencies vary in non-uniform memory access (NUMA) systems so that execution times may become unpredictable in a multicore real-time system. This results in overly conservative scheduling with low utilization due to loose bounds on the worst-case execution time (WCET) of tasks. This work contributes a controller/node-aware memory coloring (CAMC) allocator inside the Linux kernel for the entire address space to reduce access conflicts and latencies by isolating tasks from one another. CAMC improves timing predictability and performance over Linux' buddy allocator and prior coloring methods. It provides core isolation with respect to banks and memory controllers for real-time systems. To our knowledge, this work is first to consider multiple memory controllers in real-time systems, combine them with bank coloring, and assess its performance on a NUMA architecture.},
}

% Conference paper (IPDPS 2018).
@inproceedings{lavrijsen_iancu_pan_2018,
  author    = {Lavrijsen, Wim and Iancu, Costin and Pan, Xing},
  title     = {Improving Network Throughput with Global Communication Reordering},
  booktitle = {2018 32nd {IEEE} International Parallel and Distributed Processing Symposium ({IPDPS})},
  pages     = {266--275},
  year      = {2018},
  issn      = {1530-2075},
  doi       = {10.1109/IPDPS.2018.00036},
  abstract  = {We present a methodology to improve network throughput by reordering communication in HPC codes. In contrast to all previous work, our approach does not require any information about network and application communication topology. We implement on-the-fly algorithms that reorder message streams based on statistics inferred during execution. Our intuition is that long operations that occur in ranks that exhibit large execution variability need to be prioritized. We consider two approaches: 1) reorder using statistics of a group of significant ranks; and 2) reorder around an outlier rank. For robustness on noisy systems, our final algorithm combines group and outlier reordering and it allows continuous adaptation of the schedule. We validate on two different networks: Cray Aries and InfiniBand. Micro-benchmarks show that performance between two different schedules of communication can be as high as 74\%. Given an initial ordering, our algorithm can recuperate as much as 90\% from the potential performance improvements. When employed in applications, we see improvements as large as 70\% in communication times. If interference is present, the algorithm additionally reduces outliers and variance in the communication times.},
}

% Conference paper (IPDPS 2016).
@inproceedings{pan_gownivaripalli_mueller_2016,
  author    = {Pan, Xing and Gownivaripalli, Yasaswini Jyothi and Mueller, Frank},
  title     = {{TintMalloc}: Reducing Memory Access Divergence via Controller-Aware Coloring},
  booktitle = {2016 {IEEE} 30th International Parallel and Distributed Processing Symposium ({IPDPS} 2016)},
  pages     = {363--372},
  year      = {2016},
  issn      = {1530-2075},
  doi       = {10.1109/ipdps.2016.26},
  abstract  = {DRAM memory of modern multicores is partitioned into sets, each with its own memory controller governing multiple banks. Accesses can be served in parallel to controllers and banks, but sharing of either between threads results in contention that increases latency, and so do accesses to remote controllers due to the non-uniform memory access (NUMA) design. Above DRAM, a last-level cache (LLC), typically at level 3 (L3), is shared by all cores while L1 and L2 caches tend to be core private. This NUMA design inflicts significant variations in execution time for applications with large datasets due to different latencies incurred by remote memory node accesses or contention in LLC and at memory banks/controllers. As a result, single program multiple data (SPMD) applications tend to experience computational imbalance at barriers, which inflicts idle (wait) time for threads that at barriers arrive early and thus impairs effective processor utilization and ultimately performance. This work contributes a novel memory allocator called TintMalloc that colors memory at the LLC, bank, and controller level to ensure locality to the local memory node while reducing contention at the LLC/bank levels in software. After adding one line of code during initialization in each thread, existing applications automatically obtain colored heap space through regular malloc calls. Experimental results with the SPEC and Parsec benchmarks show that by choosing disjoint colors per thread, locality is increased, contention is decreased, and overall SPMD execution becomes more balanced at barriers than default memory allocation under Linux as well as prior coloring approaches.},
}