@inproceedings{zhang_zhang_lei_mukherjee_pan_zhao_ding_li_xu_2023,
  author       = {Zhang, Lei and Zhang, Jie and Lei, Bowen and Mukherjee, Subhabrata and Pan, Xiang and Zhao, Bo and Ding, Caiwen and Li, Yao and Xu, Dongkuan},
  title        = {Accelerating Dataset Distillation via Model Augmentation},
  booktitle    = {2023 {IEEE}/{CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  year         = {2023},
  pages        = {11950--11959},
  doi          = {10.1109/CVPR52729.2023.01150},
  issn         = {1063-6919},
  abstractNote = {Dataset Distillation (DD), a newly emerging field, aims at generating much smaller but efficient synthetic training datasets from large ones. Existing DD methods based on gradient matching achieve leading performance; however, they are extremely computationally intensive as they require continuously optimizing a dataset among thousands of randomly initialized models. In this paper, we assume that training the synthetic data with diverse models leads to better generalization performance. Thus we propose two model augmentation techniques, i.e. using early-stage models and parameter perturbation to learn an informative synthetic set with significantly reduced training cost. Extensive experiments demonstrate that our method achieves up to 20× speedup and comparable performance on par with state-of-the-art methods.},
}

@inproceedings{huang_lei_xu_peng_sun_xie_ding_2023,
  author       = {Huang, Shaoyi and Lei, Bowen and Xu, Dongkuan and Peng, Hongwu and Sun, Yue and Xie, Mimi and Ding, Caiwen},
  title        = {Dynamic Sparse Training via Balancing the Exploration-Exploitation Trade-off},
  booktitle    = {2023 60th {ACM}/{IEEE} Design Automation Conference ({DAC})},
  year         = {2023},
  doi          = {10.1109/DAC56929.2023.10247716},
  abstractNote = {Over-parameterization of deep neural networks (DNNs) has shown high prediction accuracy for many applications. Although effective, the large number of parameters hinders its popularity on resource-limited devices and has an outsize environmental impact. Sparse training (using a fixed number of nonzero weights in each iteration) could significantly mitigate the training costs by reducing the model size. However, existing sparse training methods mainly use either random-based or greedy-based drop-and-grow strategies, resulting in local minimal and low accuracy. In this work, to assist explainable sparse training, we propose important weights Exploitation and coverage Exploration to characterize Dynamic Sparse Training (DST-EE), and provide quantitative analysis of these two metrics. We further design an acquisition function and provide the theoretical guarantees for the proposed method and clarify its convergence property. Experimental results show that sparse models (up to 98% sparsity) obtained by our proposed method outperform the SOTA sparse training methods on a wide variety of deep learning tasks. On VGG-19 / CIFAR-100, ResNet-50 / CIFAR-10, ResNet-50 / CIFAR-100, our method has even higher accuracy than dense models. On ResNet-50 / ImageNet, the proposed method has up to 8.2% accuracy improvement compared to SOTA sparse training methods.},
}

@article{tian_gao_zhang_sun_xu_2023,
  author       = {Tian, Yingjie and Gao, Weizhi and Zhang, Qin and Sun, Pu and Xu, Dongkuan},
  title        = {Improving Long-Tailed Classification by Disentangled Variance Transfer},
  journal      = {Internet of Things},
  volume       = {21},
  year         = {2023},
  month        = apr,
  doi          = {10.1016/j.iot.2023.100687},
  issn         = {2542-6605},
  abstractNote = {Image classification is very important in the system of internet of things (IoT), and long-tailed distribution data are common in our daily life. Extremely imbalanced classes in long-tailed classification lead to a huge performance gap between training and testing. A number of methods have been proposed to transfer knowledge from head classes to tail classes, which expects to augment semantic information in tail. However, by projecting feature vectors onto classifier vectors, we find that the projection part and the orthogonal part behave differently in testing phase as the number of instances decreases. In order to properly transfer covariance information in long-tailed classification task, we propose a novel class-based covariance transfer method from the perspective of disentangling. Extensive experimental results on CIFAR-10-LT, CIFAR-100-LT, ImageNet-LT and iNaturalist 2018 illustrate the effectiveness of our method, which will further improve the validity of IoT system.},
}

@inproceedings{huang_fang_mahmood_lei_xu_lei_sun_xu_wen_ding_2023,
  author       = {Huang, Shaoyi and Fang, Haowen and Mahmood, Kaleel and Lei, Bowen and Xu, Nuo and Lei, Bin and Sun, Yue and Xu, Dongkuan and Wen, Wujie and Ding, Caiwen},
  title        = {Neurogenesis Dynamics-inspired Spiking Neural Network Training Acceleration},
  booktitle    = {2023 60th {ACM}/{IEEE} Design Automation Conference ({DAC})},
  year         = {2023},
  doi          = {10.1109/DAC56929.2023.10247810},
  abstractNote = {Biologically inspired Spiking Neural Networks (SNNs) have attracted significant attention for their ability to provide extremely energy-efficient machine intelligence through event-driven operation and sparse activities. As artificial intelligence (AI) becomes ever more democratized, there is an increasing need to execute SNN models on edge devices. Existing works adopt weight pruning to reduce SNN model size and accelerate inference. However, these methods mainly focus on how to obtain a sparse model for efficient inference, rather than training efficiency. To overcome these drawbacks, in this paper, we propose a Neurogenesis Dynamics-inspired Spiking Neural Network training acceleration framework, NDSNN. Our framework is computational efficient and trains a model from scratch with dynamic sparsity without sacrificing model fidelity. Specifically, we design a new drop-and-grow strategy with decreasing number of non-zero weights, to maintain extreme high sparsity and high accuracy. We evaluate NDSNN using VGG-16 and ResNet-19 on CIFAR-10, CIFAR-100 and TinyImageNet. Experimental results show that NDSNN achieves up to 20.52% improvement in accuracy on Tiny-ImageNet using ResNet-19 (with a sparsity of 99%) as compared to other SOTA methods (e.g., Lottery Ticket Hypothesis (LTH), SET-SNN, RigL-SNN). In addition, the training cost of NDSNN is only 40.89% of the LTH training cost on ResNet-19 and 31.35% of the LTH training cost on VGG-16 on CIFAR-10.},
}

@inproceedings{zhang_xu_javaheripi_mukherjee_wu_xia_li_jiang_wang_2023,
  author       = {Zhang, Chuxu and Xu, Dongkuan and Javaheripi, Mojan and Mukherjee, Subhabrata and Wu, Lingfei and Xia, Yinglong and Li, Jundong and Jiang, Meng and Wang, Yanzhi},
  title        = {{RelKD} 2023: International Workshop on Resource-Efficient Learning for Knowledge Discovery},
  booktitle    = {Proceedings of the 29th {ACM} {SIGKDD} Conference on Knowledge Discovery and Data Mining ({KDD})},
  year         = {2023},
  pages        = {5901--5902},
  doi          = {10.1145/3580305.3599228},
  abstractNote = {Modern machine learning techniques, especially deep neural networks, have demonstrated excellent performance for various knowledge discovery and data mining applications. However, the development of many of these techniques still encounters resource constraint challenges in many scenarios, such as limited labeled data (data-level), small model size requirements in real-world computing platforms (model-level), and efficient mapping of the computations to heterogeneous target hardware (system-level). Addressing all of these metrics is critical for the effective and efficient usage of the developed models in a wide variety of real systems, such as large-scale social network analysis, large-scale recommendation systems, and real-time anomaly detection. Therefore, it is desirable to develop efficient learning techniques to tackle challenges of resource limitations from data, model/algorithm, or (and) system/hardware perspectives. The proposed international workshop on "Resource-Efficient Learning for Knowledge Discovery (RelKD 2023)" will provide a great venue for academic researchers and industrial practitioners to share challenges, solutions, and future opportunities of resource-efficient learning.},
}

@inproceedings{zhu_lei_zhang_fang_xie_zhang_xu_2023,
  author       = {Zhu, Dongyao and Lei, Bowen and Zhang, Jie and Fang, Yanbo and Xie, Yiqun and Zhang, Ruqi and Xu, Dongkuan},
  title        = {Rethinking Data Distillation: Do Not Overlook Calibration},
  booktitle    = {2023 {IEEE}/{CVF} International Conference on Computer Vision ({ICCV})},
  year         = {2023},
  pages        = {4912--4922},
  doi          = {10.1109/ICCV51070.2023.00455},
  issn         = {1550-5499},
  abstractNote = {Neural networks trained on distilled data often produce over-confident output and require correction by calibration methods. Existing calibration methods such as temperature scaling and mixup work well for networks trained on original large-scale data. However, we find that these methods fail to calibrate networks trained on data distilled from large source datasets. In this paper, we show that distilled data lead to networks that are not calibratable due to (i) a more concentrated distribution of the maximum logits and (ii) the loss of information that is semantically meaningful but unrelated to classification tasks. To address this problem, we propose Masked Temperature Scaling (MTS) and Masked Distillation Training (MDT) which mitigate the limitations of distilled data and achieve better calibration results while maintaining the efficiency of dataset distillation.},
}

@inproceedings{li_mei_li_wei_xu_2023,
  author       = {Li, Shuya and Mei, Hao and Li, Jianwei and Wei, Li Hua and Xu, Dongkuan},
  title        = {Toward Efficient Traffic Signal Control: Smaller Network Can Do More},
  booktitle    = {2023 62nd {IEEE} Conference on Decision and Control ({CDC})},
  year         = {2023},
  pages        = {8069--8074},
  doi          = {10.1109/CDC49753.2023.10383879},
  issn         = {2576-2370},
  abstractNote = {Reinforcement learning (RL)-based traffic signal control (TSC) optimizes signal switches through RL agents, adapting to intersection updates. Yet, existing RL-based TSC methods often demand substantial storage and computation resources, impeding real-world implementation. This study introduces a two-stage approach to compress the network, maintaining performance. Firstly, we identify a compact network via a removal-verification strategy. Secondly, pruning yields an even sparser network. In addition, Multi-task RL is adopted for multi-intersection TSC, reducing costs, and boosting performance. Our extensive evaluation shows a compressed network at 1/1432nd of original parameters, with an 11.2% enhancement over the best baseline. This work presents an efficient RL-based TSC solution for real-world contexts, offering insights into challenges and opportunities in the field.},
}

@inproceedings{wu_lei_xu_zhou_2023,
  author       = {Wu, Longfeng and Lei, Bowen and Xu, Dongkuan and Zhou, Dawei},
  title        = {Towards Reliable Rare Category Analysis on Graphs via Individual Calibration},
  booktitle    = {Proceedings of the 29th {ACM} {SIGKDD} Conference on Knowledge Discovery and Data Mining ({KDD})},
  year         = {2023},
  pages        = {2629--2638},
  doi          = {10.1145/3580305.3599525},
  abstractNote = {Rare categories abound in a number of real-world networks and play a pivotal role in a variety of high-stakes applications, including financial fraud detection, network intrusion detection, and rare disease diagnosis. Rare category analysis (RCA) refers to the task of detecting, characterizing, and comprehending the behaviors of minority classes in a highly-imbalanced data distribution. While the vast majority of existing work on RCA has focused on improving the prediction performance, a few fundamental research questions heretofore have received little attention and are less explored: How confident or uncertain is a prediction model in rare category analysis? How can we quantify the uncertainty in the learning process and enable reliable rare category analysis? To answer these questions, we start by investigating miscalibration in existing RCA methods. Empirical results reveal that state-of-the-art RCA methods are mainly over-confident in predicting minority classes and under-confident in predicting majority classes. Motivated by the observation, we propose a novel individual calibration framework, named CALIRARE, for alleviating the unique challenges of RCA, thus enabling reliable rare category analysis. In particular, to quantify the uncertainties in RCA, we develop a node-level uncertainty quantification algorithm to model the overlapping support regions with high uncertainty; to handle the rarity of minority classes in miscalibration calculation, we generalize the distribution-based calibration metric to the instance level and propose the first individual calibration measurement on graphs named Expected Individual Calibration Error (EICE). We perform extensive experimental evaluations on real-world datasets, including rare category characterization and model calibration tasks, which demonstrate the significance of our proposed framework.},
}

@inproceedings{tang_wang_kong_zhang_li_ding_wang_liang_xu_2023,
  author       = {Tang, Shengkun and Wang, Yaqing and Kong, Zhenglun and Zhang, Tianchi and Li, Yao and Ding, Caiwen and Wang, Yanzhi and Liang, Yi and Xu, Dongkuan},
  title        = {You Need Multiple Exiting: Dynamic Early Exiting for Accelerating Unified Vision Language Model},
  booktitle    = {2023 {IEEE}/{CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  year         = {2023},
  pages        = {10781--10791},
  doi          = {10.1109/CVPR52729.2023.01038},
  issn         = {1063-6919},
  abstractNote = {Large-scale Transformer models bring significant improvements for various downstream vision language tasks with a unified architecture. The performance improvements come with increasing model size, resulting in slow inference speed and increased cost for severing. While some certain predictions benefit from the full computation of the large-scale model, not all of inputs need the same amount of computation to conduct, potentially leading to computation resource waste. To handle this challenge, early exiting is proposed to adaptively allocate computational power in term of input complexity to improve inference efficiency. The existing early exiting strategies usually adopt output confidence based on intermediate layers as a proxy of input complexity to incur the decision of skipping following layers. However, such strategies cannot be applied to encoder in the widely-used unified architecture with both encoder and decoder due to difficulty of output confidence estimation in the encoder layers. It is suboptimal in term of saving computation power to ignore the early exiting in encoder component. To address this issue, we propose a novel early exiting strategy for unified vision language models, which allows to dynamically skip the layers in encoder and decoder simultaneously in term of input layer-wise similarities with multiple times of early exiting, namely MuE. By decomposing the image and text modalities in the encoder, MuE is flexible and can skip different layers in term of modalities, advancing the inference efficiency while minimizing performance drop. Experiments on the SNLI-VE and MS COCO datasets show that the proposed approach MuE can reduce expected inference time by up to 50% and 40% while maintaining 99% and 96% performance respectively.},
}