@article{guan_ding_shen_krim_2018,
  title={Reuse-Centric K-Means Configuration},
  ISSN={1084-4627},
  DOI={10.1109/ICDE.2018.00116},
  abstractNote={K-means configuration is a time-consuming process due to the iterative nature of k-means. This paper proposes reuse-centric k-means configuration to accelerate k-means configuration. It is based on the observation that the explorations of different configurations share lots of common or similar computations. Effectively reusing the computations from prior trials of different configurations could largely shorten the configuration time. The paper presents a set of novel techniques to materialize the idea, including reuse-based filtering, center reuse, and a two-phase design to capitalize on the reuse opportunities on three levels: validation, k, and feature sets. Experiments show that our approach can accelerate some common configuration tuning methods by 5-9X.},
  journal={2018 IEEE 34th International Conference on Data Engineering (ICDE)},
  author={Guan, Hui and Ding, Yufei and Shen, Xipeng and Krim, Hamid},
  year={2018},
  pages={1224–1227}
}

@inproceedings{ding_ning_guan_shen_2017,
  title={Generalizations of the theory and deployment of triangular inequality for compiler-based strength reduction},
  volume={52},
  number={6},
  DOI={10.1145/3140587.3062377},
  abstractNote={Triangular Inequality (TI) has been used in many manual algorithm designs to achieve good efficiency in solving some distance calculation-based problems. This paper presents our generalization of the idea into a compiler optimization technique, named TI-based strength reduction. The generalization consists of three parts. The first is the establishment of the theoretic foundation of this new optimization via the development of a new form of TI named Angular Triangular Inequality, along with several fundamental theorems. The second is the revealing of the properties of the new forms of TI and the proposal of guided TI adaptation, a systematic method to address the difficulties in effective deployments of TI optimizations. The third is an integration of the new optimization technique in an open-source compiler. Experiments on a set of data mining and machine learning algorithms show that the new technique can speed up the standard implementations by as much as 134X and 46X on average for distance-related problems, outperforming previous TI-based optimizations by 2.35X on average. It also extends the applicability of TI-based optimizations to vector-related problems, producing tens of times of speedup.},
  booktitle={ACM SIGPLAN Notices},
  author={Ding, Yufei and Ning, L. and Guan, Hui and Shen, Xipeng},
  year={2017},
  pages={33–48}
}

@article{chen_ding_shen_2017,
  title={Sweet KNN: An Efficient KNN on GPU through Reconciliation between Redundancy Removal and Regularity},
  ISSN={1084-4627},
  DOI={10.1109/icde.2017.116},
  abstractNote={Finding the k nearest neighbors of a query point or a set of query points (KNN) is a fundamental problem in many application domains. It is expensive to do. Prior efforts in improving its speed have followed two directions with conflicting considerations: one tries to minimize the redundant distance computations but often introduces irregularities into computations; the other tries to exploit the regularity in computations to best exert the power of GPU-like massively parallel processors, which often introduces even extra distance computations. This work gives a detailed study on how to effectively combine the strengths of both approaches. It manages to reconcile the polar opposite effects of the two directions through elastic algorithmic designs, adaptive runtime configurations, and a set of careful implementation-level optimizations. The efforts finally lead to a new KNN on GPU named Sweet KNN, the first high-performance triangular-inequality-based KNN on GPU that manages to reach a sweet point between redundancy minimization and regularity preservation for various datasets. Experiments on a set of datasets show that Sweet KNN outperforms existing GPU implementations on KNN by up to 120X (11X on average).},
  journal={2017 IEEE 33rd International Conference on Data Engineering (ICDE 2017)},
  author={Chen, Guoyang and Ding, Yufei and Shen, Xipeng},
  year={2017},
  pages={621–632}
}

@article{ding_ansel_veeramachaneni_shen_o'reilly_amarasinghe_2015,
  title={Autotuning Algorithmic Choice for Input Sensitivity},
  volume={50},
  number={6},
  ISSN={1558-1160},
  DOI={10.1145/2813885.2737969},
  abstractNote={A daunting challenge faced by program performance autotuning is input sensitivity, where the best autotuned configuration may vary with different input sets. This paper presents a novel two-level input learning algorithm to tackle the challenge for an important class of autotuning problems, algorithmic autotuning. The new approach uses a two-level input clustering method to automatically refine input grouping, feature selection, and classifier construction. Its design solves a series of open issues that are particularly essential to algorithmic autotuning, including the enormous optimization space, complex influence by deep input features, high cost in feature extraction, and variable accuracy of algorithmic choices. Experimental results show that the new solution yields up to a 3x speedup over using a single configuration for all inputs, and a 34x speedup over a traditional one-level method for addressing input sensitivity in program optimizations.},
  journal={ACM SIGPLAN Notices},
  author={Ding, Yufei and Ansel, Jason and Veeramachaneni, Kalyan and Shen, Xipeng and O'Reilly, Una-May and Amarasinghe, Saman},
  year={2015},
  month={Jun},
  pages={379–390}
}

@article{zhao_wu_zhou_ding_sun_shen_wu_2014,
  title={Call Sequence Prediction through Probabilistic Calling Automata},
  volume={49},
  number={10},
  ISSN={1558-1160},
  DOI={10.1145/2714064.2660221},
  abstractNote={Predicting a sequence of upcoming function calls is important for optimizing programs written in modern managed languages (e.g., Java, JavaScript, C#). Existing function call predictions are mainly built on statistical patterns, suitable for predicting a single call but not a sequence of calls. This paper presents a new way to enable call sequence prediction, which exploits program structures through Probabilistic Calling Automata (PCA), a new program representation that captures both the inherent ensuing relations among function calls and the probabilistic nature of execution paths. It shows that PCA-based prediction outperforms existing predictions, yielding substantial speedups when applied to guide Just-In-Time compilation. By enabling accurate, efficient call sequence prediction for the first time, PCA-based predictors open up many new opportunities for dynamic program optimizations.},
  journal={ACM SIGPLAN Notices},
  author={Zhao, Zhijia and Wu, Bo and Zhou, Mingzhou and Ding, Yufei and Sun, Jianhua and Shen, Xipeng and Wu, Youfeng},
  year={2014},
  month={Oct},
  pages={745–762}
}