@article{iskhakov_tai_bolotnov_nguyen_merzari_shaver_dinh_2023,
  title={Data-Driven RANS Turbulence Closures for Forced Convection Flow in Reactor Downcomer Geometry},
  volume={3},
  ISSN={1943-7471},
  url={https://doi.org/10.1080/00295450.2023.2185056},
  DOI={10.1080/00295450.2023.2185056},
  abstractNote={Recent progress in data-driven turbulence modeling has shown its potential to enhance or replace traditional equation-based Reynolds-averaged Navier-Stokes (RANS) turbulence models. This work utilizes invariant neural network (NN) architectures to model Reynolds stresses and turbulent heat fluxes in forced convection flows (when the models can be decoupled). As the considered flow is statistically one dimensional, the invariant NN architecture for the Reynolds stress model reduces to the linear eddy viscosity model. To develop the data-driven models, direct numerical and RANS simulations in a vertical planar channel geometry mimicking a part of the reactor downcomer are performed. Different conditions and fluids relevant to advanced reactors (sodium, lead, unitary-Prandtl-number fluid, and molten salt) constitute the training database. The models enable accurate predictions of velocity and temperature and, compared to the baseline k−τ turbulence model with the simple gradient diffusion hypothesis, do not require tuning of the turbulent Prandtl number. The data-driven framework is implemented in the open-source graphics processing unit–accelerated spectral element solver nekRS and has shown potential for future developments and consideration of more complex mixed convection flows.},
  journal={Nuclear Technology},
  author={Iskhakov, Arsen S. and Tai, Cheng-Kai and Bolotnov, Igor A. and Nguyen, Tri and Merzari, Elia and Shaver, Dillon R. and Dinh, Nam T.},
  year={2023},
  month={Mar}
}

@inproceedings{nguyen_becchi_2022,
  title={A GPU-Accelerated Data Transformation Framework Rooted in Pushdown Transducers},
  ISSN={1094-7256},
  DOI={10.1109/HiPC56025.2022.00038},
  abstractNote={With the rise of machine learning and data analytics, the ability to process large and diverse sets of data efficiently has become crucial. Research has shown that data transformation is a key performance bottleneck for applications across a variety of domains, from data analytics to scientific computing. Custom hardware accelerators and GPU implementations targeting specific data transformation tasks can alleviate the problem but suffer from narrow applicability and lack of generality. To tackle this problem, we propose a GPU-accelerated data transformation engine grounded in pushdown transducers. We define an extended pushdown transducer abstraction (effPDT) that allows expressing a wide range of data transformations in a memory-efficient fashion and is thus amenable to GPU deployment. The effPDT execution engine utilizes a data streaming model that significantly reduces the application's memory requirements, facilitating deployment on high- and low-end systems. We showcase our GPU-accelerated engine on a diverse set of transformation tasks covering data encoding/decoding, parsing and querying of structured data, and matrix transformation, and we evaluate it against publicly available CPU and GPU library implementations of the considered data transformation tasks.
To understand the benefits of the effPDT abstraction, we extend our data transformation engine to also support finite state transducers (FSTs), map the considered data transformation tasks onto FSTs, and compare the performance and resource requirements of the FST-based and effPDT-based implementations.},
  booktitle={2022 IEEE 29th International Conference on High Performance Computing, Data, and Analytics (HiPC)},
  author={Nguyen, Tri and Becchi, Michela},
  year={2022},
  pages={215--225}
}

@inproceedings{ravi_nguyen_zhou_becchi_2021,
  title={PILOT: A Runtime System to Manage Multi-tenant GPU Unified Memory Footprint},
  ISSN={1094-7256},
  DOI={10.1109/HiPC53243.2021.00063},
  abstractNote={Concurrent kernel execution on GPUs has proven an effective technique to improve system throughput by maximizing resource utilization. To increase programmability and meet the growing memory requirements of data-intensive applications, current GPUs support Unified Virtual Memory (UVM), which provides a virtual memory abstraction with demand paging. By allowing applications to oversubscribe GPU memory, UVM provides increased opportunities to share GPU resources across applications. However, in the presence of applications with competing memory requirements, GPU sharing can lead to performance degradation due to thrashing. NVIDIA's Multi-Process Service (MPS) offers the capability to space-share bare-metal GPUs, thereby enabling cluster workload managers, such as Slurm, to share a single GPU across MPI ranks with limited control over resource partitioning. However, it is not possible to preempt, schedule, or throttle a running GPU process through MPS. These features would enable new OS-managed scheduling policies to be implemented for GPU kernels to dynamically handle resource contention and offer consistent performance. The contribution of this paper is twofold. We first show how memory oversubscription can impact the performance of concurrent GPU applications. We then propose three methods to transparently mitigate memory interference through kernel preemption and scheduling policies. To implement our policies, we develop our own runtime system (PILOT) to serve as an alternative to NVIDIA's MPS. In the presence of memory oversubscription, our scheduling policies and runtime hints yield a dramatic improvement in overall throughput.},
  booktitle={2021 IEEE 28th International Conference on High Performance Computing, Data, and Analytics (HiPC)},
  author={Ravi, John and Nguyen, Tri and Zhou, Huiyang and Becchi, Michela},
  year={2021},
  pages={442--447}
}