% Publications by Tao Wang and co-authors, cleaned up from an automated export.
% Citation keys are unchanged. Text outside entries is ignored by BibTeX.
% Conventions: one field per line, double-hyphen page ranges, bare month macros,
% bare ISSN and DOI values, brace-protected tool names and acronyms.

% Journal version of the BarrierFinder paper (Empirical Software Engineering, 2020).
@article{wang_yu_qiu_jin_mueller_2020,
  author  = {Wang, Tao and Yu, Xiao and Qiu, Zhengyi and Jin, Guoliang and Mueller, Frank},
  title   = {{BarrierFinder}: Recognizing Ad Hoc Barriers},
  journal = {Empirical Software Engineering},
  volume  = {25},
  number  = {6},
  pages   = {4676--4706},
  year    = {2020},
  month   = nov,
  issn    = {1573-7616},
  doi     = {10.1007/s10664-020-09862-3},
}

% Conference version of the BarrierFinder paper (ICSME 2019).
% Typed as a proceedings paper; the proceedings title lives in booktitle.
@inproceedings{wang_yu_qiu_jin_mueller_2019,
  author    = {Wang, Tao and Yu, Xiao and Qiu, Zhengyi and Jin, Guoliang and Mueller, Frank},
  title     = {{BarrierFinder}: Recognizing Ad Hoc Barriers},
  booktitle = {2019 {IEEE} International Conference on Software Maintenance and Evolution ({ICSME} 2019)},
  pages     = {323--327},
  year      = {2019},
  issn      = {1063-6773},
  doi       = {10.1109/ICSME.2019.00049},
  abstract  = {Ad hoc synchronizations are pervasive in multi-threaded programs.
    Due to their diversity and complexity, understanding the enforced
    synchronization relationships of ad hoc synchronizations is challenging but
    crucial to multi-threaded program development and maintenance. Existing
    techniques can partially detect primitive ad hoc synchronizations, but they
    cannot recognize complete implementations or infer the enforced
    synchronization relationships. In this paper, we propose a framework to
    automatically identify complex ad hoc synchronizations in full and infer
    their synchronization relationships. We instantiate the framework with a
    tool called BarrierFinder, which features various techniques, including
    program slicing and bounded symbolic execution, to efficiently explore the
    interleaving space of ad hoc synchronizations within multi-threaded programs
    and collect execution traces. BarrierFinder then uses these traces to
    characterize ad hoc synchronizations into different types with a focus on
    recognizing barriers. Our evaluation shows that BarrierFinder is both
    effective and efficient in doing this, and BarrierFinder is also helpful for
    programmers to understand the correctness of their implemented ad hoc
    synchronizations.},
}

% FuncyTuner paper (ICPP 2019). No page range was present in the export.
% NOTE(review): the export read 02 and 03 with digit zero where the compiler
% optimisation levels O2 and O3 are meant; confirm against the published abstract.
@inproceedings{wang_jain_beckingsale_boehme_mueller_gamblin_2019,
  author    = {Wang, Tao and Jain, Nikhil and Beckingsale, David and Boehme, David and Mueller, Frank and Gamblin, Todd},
  title     = {{FuncyTuner}: Auto-tuning Scientific Applications With Per-loop Compilation},
  booktitle = {Proceedings of the 48th International Conference on Parallel Processing ({ICPP} 2019)},
  year      = {2019},
  issn      = {0190-3918},
  doi       = {10.1145/3337821.3337842},
  abstract  = {The de facto compilation model for production software compiles
    all modules of a target program with a single set of compilation flags,
    typically O2 or O3. Such a per-program compilation strategy may yield
    sub-optimal executables since programs often have multiple hot loops with
    diverse code structures and may be better optimized with a per-region
    compilation model that assembles an optimized executable by combining the
    best per-region code variants. In this paper, we demonstrate that a
    na{\"\i}ve greedy approach to per-region compilation often degrades
    performance in comparison to the O3 baseline. To overcome this problem, we
    contribute a novel per-loop compilation framework, FuncyTuner, which employs
    lightweight profiling to collect per-loop timing information, and then
    utilizes a space-focusing technique to construct a performant executable.
    Experimental results show that FuncyTuner can reliably improve performance
    of modern scientific applications on several multi-core architectures by
    9.2\% to 12.3\% and 4.5\% to 10.7\% (geometric mean, up to 22\% on certain
    program) in comparison to the O3 baseline and prior work, respectively.},
}