@article{jin_liu_huang_he_wu_dai_2024, title={Sign-Based Gradient Descent With Heterogeneous Data: Convergence and Byzantine Resilience}, volume={1}, ISSN={["2162-2388"]}, DOI={10.1109/TNNLS.2023.3345367}, abstractNote={Communication overhead has become one of the major bottlenecks in the distributed training of modern deep neural networks. With such consideration, various quantization-based stochastic gradient descent (SGD) solvers have been proposed and widely adopted, among which signSGD with majority vote shows a promising direction because of its communication efficiency and robustness against Byzantine attackers. However, signSGD fails to converge in the presence of data heterogeneity, which is commonly observed in the emerging federated learning (FL) paradigm. In this article, a sufficient condition for the convergence of the sign-based gradient descent method is derived, based on which a novel magnitude-driven stochastic-sign-based gradient compressor is proposed to address the non-convergence issue of signSGD. The convergence of the proposed method is established in the presence of arbitrary data heterogeneity. The Byzantine resilience of sign-based gradient descent methods is quantified, and the error-feedback mechanism is further incorporated to boost the learning performance. Experimental results on the MNIST dataset, the CIFAR-10 dataset, and the Tiny-ImageNet dataset corroborate the effectiveness of the proposed methods.}, journal={IEEE TRANSACTIONS ON NEURAL NETWORKS AND LEARNING SYSTEMS}, author={Jin, Richeng and Liu, Yuding and Huang, Yufan and He, Xiaofan and Wu, Tianfu and Dai, Huaiyu}, year={2024}, month={Jan} } @article{reza_rahmati_wu_dai_2023, title={CGBA: Curvature-aware Geometric Black-box Attack}, ISSN={["1550-5499"]}, DOI={10.1109/ICCV51070.2023.00018}, abstractNote={Decision-based black-box attacks often necessitate a large number of queries to craft an adversarial example. Moreover, decision-based attacks based on querying boundary points in the estimated normal vector direction often suffer from inefficiency and convergence issues. In this paper, we propose a novel query-efficient curvature-aware geometric decision-based black-box attack (CGBA) that conducts boundary search along a semicircular path on a restricted 2D plane to ensure finding a boundary point successfully irrespective of the boundary curvature. While the proposed CGBA attack can work effectively for an arbitrary decision boundary, it is particularly efficient in exploiting the low curvature to craft high-quality adversarial examples, which is widely seen and experimentally verified in commonly used classifiers under non-targeted attacks. In contrast, the decision boundaries often exhibit higher curvature under targeted attacks. Thus, we develop a new query-efficient variant, CGBA-H, that is adapted for the targeted attack. In addition, we further design an algorithm to obtain a better initial boundary point at the expense of some extra queries, which considerably enhances the performance of the targeted attack. Extensive experiments are conducted to evaluate the performance of our proposed methods against some well-known classifiers on the ImageNet and CIFAR10 datasets, demonstrating the superiority of CGBA and CGBA-H over state-of-the-art non-targeted and targeted attacks, respectively.
The source code is available at https://github.com/Farhamdur/CGBA.}, journal={2023 IEEE/CVF INTERNATIONAL CONFERENCE ON COMPUTER VISION, ICCV}, author={Reza, Md Farhamdur and Rahmati, Ali and Wu, Tianfu and Dai, Huaiyu}, year={2023}, pages={124–133} } @article{xue_wu_bai_wang_xia_zhang_torr_2023, title={Holistically-Attracted Wireframe Parsing: From Supervised to Self-Supervised Learning}, volume={45}, ISSN={["1939-3539"]}, url={http://dx.doi.org/10.1109/tpami.2023.3312749}, DOI={10.1109/tpami.2023.3312749}, abstractNote={This article presents Holistically-Attracted Wireframe Parsing (HAWP), a method for geometric analysis of 2D images containing wireframes formed by line segments and junctions. HAWP utilizes a parsimonious Holistic Attraction (HAT) field representation that encodes line segments using a closed-form 4D geometric vector field. The proposed HAWP consists of three sequential components empowered by end-to-end and HAT-driven designs: 1) generating a dense set of line segments from HAT fields and endpoint proposals from heatmaps, 2) binding the dense line segments to sparse endpoint proposals to produce initial wireframes, and 3) filtering false positive proposals through a novel endpoint-decoupled line-of-interest aligning (EPD LOIAlign) module that captures the co-occurrence between endpoint proposals and HAT fields for better verification. Thanks to our novel designs, HAWPv2 shows strong performance in fully supervised learning, while HAWPv3 excels in self-supervised learning, achieving superior repeatability scores and efficient training (24 GPU hours on a single GPU). Furthermore, HAWPv3 exhibits a promising potential for wireframe parsing in out-of-distribution images without providing ground truth labels of wireframes.}, number={12}, journal={IEEE TRANSACTIONS ON PATTERN ANALYSIS AND MACHINE INTELLIGENCE}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Xue, Nan and Wu, Tianfu and Bai, Song and Wang, Fu-Dong and Xia, Gui-Song and Zhang, Liangpei and Torr, Philip H. S.}, year={2023}, month={Dec}, pages={14727–14744} } @article{shen_wu_2023, title={LEARNING SPATIALLY-ADAPTIVE SQUEEZE-EXCITATION NETWORKS FOR FEW SHOT IMAGE SYNTHESIS}, DOI={10.1109/ICIP49359.2023.10222248}, abstractNote={Learning light-weight yet expressive deep networks for image synthesis is a challenging problem. Inspired by a recent observation that it is the data-specificity that makes the multi-head self-attention (MHSA) in the Transformer model so powerful, this paper proposes to extend the widely adopted light-weight Squeeze-Excitation (SE) module to be spatially-adaptive to reinforce its data specificity, as a convolutional alternative of the MHSA, while retaining the efficiency of SE and the inductive bias of convolution.
It proposes a spatially-adaptive squeeze-excitation (SASE) module for the image synthesis task. SASE is tested on the low-shot image generative learning task, and shows better performance than prior art.}, journal={2023 IEEE INTERNATIONAL CONFERENCE ON IMAGE PROCESSING, ICIP}, author={Shen, Jianghao and Wu, Tianfu}, year={2023}, pages={2855–2859} } @article{shen_wu_2023, title={LEARNING SPATIALLY-ADAPTIVE STYLE-MODULATION NETWORKS FOR SINGLE IMAGE SYNTHESIS}, DOI={10.1109/ICIP49359.2023.10222483}, abstractNote={Recently there has been a growing interest in learning generative models from a single image. This task is important as in many real world applications, collecting large dataset is not feasible. Existing work like SinGAN is able to synthesize novel images that resemble the patch distribution of the training image. However, SinGAN cannot learn high level semantics of the image, and thus their synthesized samples tend to have unrealistic spatial layouts. To address this issue, this paper proposes a spatially adaptive style-modulation (SASM) module that learns to preserve realistic spatial configuration of images. Specifically, it extracts style vector (in the form of channel-wise attention) and latent spatial mask (in the form of spatial attention) from a coarse level feature separately. The style vector and spatial mask are then aggregated to modulate features of deeper layers. The disentangled modulation of spatial and style attributes enables the model to preserve the spatial structure of the image without overfitting. Experimental results show that the proposed module learns to generate samples with better fidelity than prior works.}, journal={2023 IEEE INTERNATIONAL CONFERENCE ON IMAGE PROCESSING, ICIP}, author={Shen, Jianghao and Wu, Tianfu}, year={2023}, pages={1455–1459} } @article{xiao_xue_wu_xia_2023, title={Level-S2fM: Structure from Motion on Neural Level Set of Implicit Surfaces}, ISSN={["1063-6919"]}, url={http://dx.doi.org/10.1109/cvpr52729.2023.01650}, DOI={10.1109/cvpr52729.2023.01650}, abstractNote={This paper presents a neural incremental Structure-from-Motion (SfM) approach, Level-S2fM, which estimates the camera poses and scene geometry from a set of uncalibrated images by learning coordinate MLPs for the implicit surfaces and the radiance fields from the established key-point correspondences. Our novel formulation poses some new challenges due to inevitable two-view and few-view configurations in the incremental SfM pipeline, which complicates the optimization of coordinate MLPs for volumetric neural rendering with unknown camera poses. Nevertheless, we demonstrate that the strong inductive basis conveying in the 2D correspondences is promising to tackle those challenges by exploiting the relationship between the ray sampling schemes. Based on this, we revisit the pipeline of incremental SfM and renew the key components, including two-view geometry initialization, the camera poses registration, the 3D points triangulation, and Bundle Adjustment, with a fresh perspective based on neural implicit surfaces. By unifying the scene geometry in small MLP networks through coordinate MLPs, our Level-S2fM treats the zero-level set of the implicit surface as an informative top-down regularization to manage the reconstructed 3D points, reject the outliers in correspondences via querying SDF, and refine the estimated geometries by NBA (Neural BA).
Not only does our Level-S2fM lead to promising results on camera pose estimation and scene geometry reconstruction, but it also shows a promising way for neural implicit rendering without knowing camera extrinsic beforehand.}, journal={2023 IEEE/CVF CONFERENCE ON COMPUTER VISION AND PATTERN RECOGNITION (CVPR)}, publisher={IEEE}, author={Xiao, Yuxi and Xue, Nan and Wu, Tianfu and Xia, Gui-Song}, year={2023}, pages={17205–17214} } @inproceedings{liu_zheng_cheng_xue_qi_wu_2023, title={Monocular 3D Object Detection with Bounding Box Denoising in 3D by Perceiver}, booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision}, author={Liu, Xianpeng and Zheng, Ce and Cheng, Kelvin B and Xue, Nan and Qi, Guo-Jun and Wu, Tianfu}, year={2023}, pages={6436–6446} } @article{tan_xue_wu_xia_2023, title={NOPE-SAC: Neural One-Plane RANSAC for Sparse-View Planar 3D Reconstruction}, volume={45}, ISSN={["1939-3539"]}, url={http://dx.doi.org/10.1109/tpami.2023.3314745}, DOI={10.1109/tpami.2023.3314745}, abstractNote={This article studies the challenging two-view 3D reconstruction problem in a rigorous sparse-view configuration, which is suffering from insufficient correspondences in the input image pairs for camera pose estimation. We present a novel Neural One-PlanE RANSAC framework (termed NOPE-SAC in short) that exerts excellent capability of neural networks to learn one-plane pose hypotheses from 3D plane correspondences. Building on the top of a Siamese network for plane detection, our NOPE-SAC first generates putative plane correspondences with a coarse initial pose. It then feeds the learned 3D plane correspondences into shared MLPs to estimate the one-plane camera pose hypotheses, which are subsequently reweighed in a RANSAC manner to obtain the final camera pose. Because the neural one-plane pose minimizes the number of plane correspondences for adaptive pose hypotheses generation, it enables stable pose voting and reliable pose refinement with a few of plane correspondences for the sparse-view inputs. In the experiments, we demonstrate that our NOPE-SAC significantly improves the camera pose estimation for the two-view inputs with severe viewpoint changes, setting several new state-of-the-art performances on two challenging benchmarks, i.e., MatterPort3D and ScanNet, for sparse-view 3D reconstruction.}, number={12}, journal={IEEE TRANSACTIONS ON PATTERN ANALYSIS AND MACHINE INTELLIGENCE}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Tan, Bin and Xue, Nan and Wu, Tianfu and Xia, Gui-Song}, year={2023}, month={Dec}, pages={15233–15248} } @inproceedings{grainger_paniagua_song_cuntoor_lee_wu_2023, title={PaCa-ViT: Learning Patch-to-Cluster Attention in Vision Transformers}, url={http://dx.doi.org/10.1109/cvpr52729.2023.01781}, DOI={10.1109/cvpr52729.2023.01781}, abstractNote={Vision Transformers (ViTs) are built on the assumption of treating image patches as “visual tokens” and learn patch-to-patch attention. The patch embedding based tokenizer has a semantic gap with respect to its counterpart, the textual tokenizer. The patch-to-patch attention suffers from the quadratic complexity issue, and also makes it non-trivial to explain learned ViTs. To address these issues in ViT, this paper proposes to learn Patch-to-Cluster attention (PaCa) in ViT. Queries in our PaCa-ViT starts with patches, while keys and values are directly based on clustering (with a predefined small number of clusters). 
The clusters are learned end-to-end, leading to better tokenizers and inducing joint clustering-for-attention and attention-for-clustering for better and interpretable models. The quadratic complexity is relaxed to linear complexity. The proposed PaCa module is used in designing efficient and interpretable ViT backbones and semantic segmentation head networks. In experiments, the proposed methods are tested on ImageNet-1k image classification, MS-COCO object detection and instance segmentation and MIT-ADE20k semantic segmentation. Compared with the prior art, it obtains better performance in all the three benchmarks than the SWin [32] and the PVTs [47], [48] by significant margins in ImageNet-1k and MIT-ADE20k. It is also significantly more efficient than PVT models in MS-COCO and MIT-ADE20k due to the linear complexity. The learned clusters are semantically meaningful. Code and model checkpoints are available at https://github.com/iVMCL/PaCaViT.}, booktitle={2023 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, publisher={IEEE}, author={Grainger, Ryan and Paniagua, Thomas and Song, Xi and Cuntoor, Naresh and Lee, Mun Wai and Wu, Tianfu}, year={2023}, month={Jun} } @article{paniagua_grainger_wu_2023, title={QuadAttacK: A Quadratic Programming Approach to Learning Ordered Top-K Adversarial Attacks}, journal={Advances in Neural Information Processing Systems}, author={Paniagua, Thomas and Grainger, Ryan and Wu, Tianfu}, year={2023} } @article{kashyap_ravichandiran_wang_baron_wong_wu_franzon_2023, title={Thermal Estimation for 3D-ICs through Generative Networks}, ISSN={["2164-0157"]}, DOI={10.1109/3DIC57175.2023.10154977}, abstractNote={Thermal limitations play a significant role in modern integrated chips (ICs) design and performance. 3D integrated chip (3DIC) makes the thermal problem even worse due to a high density of transistors and heat dissipation bottlenecks within the stack-up. These issues exacerbate the need for quick thermal solutions throughout the design flow. This paper presents a generative approach for modeling the power to heat dissipation for a 3DIC. This approach focuses on a single layer in a stack and shows that, given the power map, the model can generate the resultant heat for the bulk. It shows two approaches, one straightforward approach where the model only uses the power map and the other where it learns the additional parameters through random vectors. The first approach recovers the temperature maps with 1.2 °C or a root-mean-squared error (RMSE) of 0.31 over the images with pixel values ranging from -1 to 1. The second approach performs better, with the RMSE decreasing to 0.082 in a 0 to 1 range. For any result, the model inference takes less than 100 milliseconds for any given power map. These results show that the generative approach has speed advantages over traditional solvers while enabling results with reasonable accuracy for 3DIC, opening the door for thermally aware floorplanning.}, journal={2023 IEEE INTERNATIONAL 3D SYSTEMS INTEGRATION CONFERENCE, 3DIC}, author={Kashyap, Priyank and Ravichandiran, Prasanth P.
and Wang, Lee and Baron, Dror and Wong, Chau-Wai and Wu, Tianfu and Franzon, Paul D.}, year={2023} } @article{xue_tan_xiao_dong_xia_wu_2023, title={Volumetric Wireframe Parsing from Neural Attraction Fields}, author={Xue, Nan and Tan, Bin and Xiao, Yuxi and Dong, Liang and Xia, Gui-Song and Wu, Tianfu}, year={2023} } @article{ma_tan_xue_wu_zheng_xia_2022, title={HoW-3D: Holistic 3D Wireframe Perception from a Single Image}, ISSN={["2475-7888"]}, DOI={10.1109/3DV57658.2022.00070}, abstractNote={This paper studies the problem of holistic 3D wireframe perception (HoW-3D), a new task of perceiving both the visible 3D wireframes and the invisible ones from single-view 2D images. As the non-front surfaces of an object cannot be directly observed in a single view, estimating the non-line-of-sight (NLOS) geometries in HoW-3D is a fundamentally challenging problem and remains open in computer vision. We study the problem of HoW-3D by proposing an ABC-HoW benchmark, which is created on top of CAD models sourced from the ABC-dataset with 12k single-view images and the corresponding holistic 3D wireframe models. With our large-scale ABC-HoW benchmark available, we present a novel Deep Spatial Gestalt (DSG) model to learn the visible junctions and line segments as the basis and then infer the NLOS 3D structures from the visible cues by following the Gestalt principles of human vision systems. In our experiments, we demonstrate that our DSG model performs very well in inferring the holistic 3D wireframes from single-view images. Compared with the strong baseline methods, our DSG model outperforms the previous wire-frame detectors in detecting the invisible line geometry in single-view images and is even very competitive with prior arts that take high-fidelity PointCloud as inputs on reconstructing 3D wireframes.}, journal={2022 INTERNATIONAL CONFERENCE ON 3D VISION, 3DV}, author={Ma, Wenchao and Tan, Bin and Xue, Nan and Wu, Tianfu and Zheng, Xianwei and Xia, Gui-Song}, year={2022}, pages={596–605} } @article{sun_wu_2022, title={Learning Layout and Style Reconfigurable GANs for Controllable Image Synthesis}, volume={44}, ISSN={["1939-3539"]}, DOI={10.1109/TPAMI.2021.3078577}, abstractNote={With the remarkable recent progress on learning deep generative models, it becomes increasingly interesting to develop models for controllable image synthesis from reconfigurable structured inputs. This paper focuses on a recently emerged task, layout-to-image, whose goal is to learn generative models for synthesizing photo-realistic images from a spatial layout (i.e., object bounding boxes configured in an image lattice) and its style codes (i.e., structural and appearance variations encoded by latent vectors). This paper first proposes an intuitive paradigm for the task, layout-to-mask-to-image, which learns to unfold object masks in a weakly-supervised way based on an input layout and object style codes. The layout-to-mask component deeply interacts with layers in the generator network to bridge the gap between an input layout and synthesized images. Then, this paper presents a method built on Generative Adversarial Networks (GANs) for the proposed layout-to-mask-to-image synthesis with layout and style control at both image and object levels. The controllability is realized by a proposed novel Instance-Sensitive and Layout-Aware Normalization (ISLA-Norm) scheme. A layout semi-supervised version of the proposed method is further developed without sacrificing performance.
In experiments, the proposed method is tested in the COCO-Stuff dataset and the Visual Genome dataset with state-of-the-art performance obtained.}, number={9}, journal={IEEE TRANSACTIONS ON PATTERN ANALYSIS AND MACHINE INTELLIGENCE}, author={Sun, Wei and Wu, Tianfu}, year={2022}, month={Jan}, pages={5070–5087} } @article{xue_wu_xia_zhang_2022, title={Learning Local-Global Contextual Adaptation for Multi-Person Pose Estimation}, ISSN={["1063-6919"]}, DOI={10.1109/CVPR52688.2022.01272}, abstractNote={This paper studies the problem of multi-person pose estimation in a bottom-up fashion. With a new and strong observation that the localization issue of the center-offset formulation can be remedied in a local-window search scheme in an ideal situation, we propose a multi-person pose estimation approach, dubbed as LOGO-CAP, by learning the LOcal-GlObal Contextual Adaptation for human Pose. Specifically, our approach learns the keypoint attraction maps (KAMs) from the local keypoints expansion maps (KEMs) in small local windows in the first step, which are subsequently treated as dynamic convolutional kernels on the keypoints-focused global heatmaps for contextual adaptation, achieving accurate multi-person pose estimation. Our method is end-to-end trainable with near real-time inference speed in a single forward pass, obtaining state-of-the-art performance on the COCO keypoint benchmark for bottom-up human pose estimation. With the COCO trained model, our method also outperforms prior arts by a large margin on the challenging OCHuman dataset.}, journal={2022 IEEE/CVF CONFERENCE ON COMPUTER VISION AND PATTERN RECOGNITION (CVPR)}, author={Xue, Nan and Wu, Tianfu and Xia, Gui-Song and Zhang, Liangpei}, year={2022}, pages={13055–13064} } @inproceedings{liu_xue_wu_2022, title={Learning auxiliary monocular contexts helps monocular 3D object detection}, volume={36}, number={2}, booktitle={Proceedings of the AAAI Conference on Artificial Intelligence}, author={Liu, Xianpeng and Xue, Nan and Wu, Tianfu}, year={2022}, pages={1810–1818} } @article{kashyap_choi_dey_baron_wong_wu_cheng_franzon_2022, title={Modeling of Adaptive Receiver Performance Using Generative Adversarial Networks}, ISSN={["2377-5726"]}, url={http://dx.doi.org/10.1109/ectc51906.2022.00307}, DOI={10.1109/ECTC51906.2022.00307}, abstractNote={As the development of IBIS Algorithmic Modeling Interface (IBIS-AMI) models gets complex and requires time-consuming simulations, a data-driven and domain-independent approach can have tremendous value. This paper presents a data-driven approach to modeling a high-speed serializer/deserializer (SerDes) receiver through generative adversarial networks (GANs). In this work, the modeling considers multiple channels, random bitstreams, and varying decision feedback equalizer (DFE) tap values to predict an accurate bit error rate (BER) contour plot. We employ a discriminator structure that improves the training to generate a contour plot that makes it difficult to distinguish the ground truth. 
The generated plots’ bathtub curves strongly correlate to the ground truth bathtub curves and have a root-mean-squared error (RMSE) of 0.014, indicating a good fit.}, journal={IEEE 72ND ELECTRONIC COMPONENTS AND TECHNOLOGY CONFERENCE (ECTC 2022)}, publisher={IEEE}, author={Kashyap, Priyank and Choi, Yongjin and Dey, Sumon and Baron, Dror and Wong, Chau-Wai and Wu, Tianfu and Cheng, Chris and Franzon, Paul D.}, year={2022}, pages={1958–1963} } @article{foster_wu_roberts_bozkurt_2022, title={Preliminary Evaluation of a System with On-Body and Aerial Sensors for Monitoring Working Dogs}, volume={22}, ISSN={["1424-8220"]}, url={https://doi.org/10.3390/s22197631}, DOI={10.3390/s22197631}, abstractNote={This paper presents a system for behavioral, environmental, and physiological monitoring of working dogs using on-body and aerial sensors. The proof of concept study presented here includes two trained dogs performing nine scent detection tasks in an uncontrolled environment encompassing approximately two acres. The dogs were outfitted with a custom designed wearable harness to monitor their heart rate, activity levels and skin temperature. We utilized a commercially available micro-air vehicle to perform aerial sensing by tracking the terrain and movement of the dog in the outdoor space. The dogs were free to explore the space working at maximal speeds to complete a scent-based search-and-retrieval task. Throughout the experiment, the harness data was transferred to a base station via Wi-Fi in real-time. In this work, we also focused on testing the performance of a custom 3D electrode with application specific ergonomic improvements and adaptive filter processing techniques to recover as much electrocardiography data as possible during high intensity motion activity. We were able to recover and use 84% of the collected data where we observed a trend of heart rate generally increasing immediately after successful target localization. For tracking the dogs in the aerial video footage, we applied a state-of-the-art deep learning algorithm designed for online object tracking. Both qualitative and quantitative tracking results are very promising. This study presents an initial effort towards deployment of on-body and aerial sensors to monitor the working dogs and their environments during scent detection and search and rescue tasks in order to ensure their welfare, enable novel dog-machine interfaces, and allow for higher success rate of remote and automated task performance.}, number={19}, journal={SENSORS}, author={Foster, Marc and Wu, Tianfu and Roberts, David L. and Bozkurt, Alper}, year={2022}, month={Oct} } @article{jiang_krim_wu_cansever_2022, title={REFINING SELF-SUPERVISED LEARNING IN IMAGING: BEYOND LINEAR METRIC}, ISSN={["1522-4880"]}, DOI={10.1109/ICIP46576.2022.9897745}, abstractNote={We introduce in this paper a new statistical perspective, exploiting the Jaccard similarity metric, as a measure-based metric to effectively invoke non-linear features in the loss of self-supervised contrastive learning. Specifically, our proposed metric may be interpreted as a dependence measure between two adapted projections learned from the so-called latent representations. This is in contrast to the cosine similarity measure in the conventional contrastive learning model, which accounts for correlation information. To the best of our knowledge, this effectively non-linearly fused information embedded in the Jaccard similarity, is novel to self-supervision learning with promising results. 
The proposed approach is compared to two state-of-the-art self-supervised contrastive learning methods on three image datasets. We not only demonstrate its amenable applicability in current ML problems, but also its improved performance and training efficiency.}, journal={2022 IEEE INTERNATIONAL CONFERENCE ON IMAGE PROCESSING, ICIP}, author={Jiang, Bo and Krim, Hamid and Wu, Tianfu and Cansever, Derya}, year={2022}, pages={76–80} } @article{cheng_wu_healey_2022, title={Revisiting Non-Parametric Matching Cost Volumes for Robust and Generalizable Stereo Matching}, volume={35}, journal={Advances in Neural Information Processing Systems}, author={Cheng, Kelvin and Wu, Tianfu and Healey, Christopher}, year={2022}, pages={16305–16318} } @article{kashyap_gajjar_choi_wong_baron_wu_cheng_franzon_2022, title={RxGAN: Modeling High-Speed Receiver through Generative Adversarial Networks}, url={http://dx.doi.org/10.1145/3551901.3556480}, DOI={10.1145/3551901.3556480}, abstractNote={Creating models for modern high-speed receivers using circuit-level simulations is costly, as it requires computationally expensive simulations and upwards of months to finalize a model. Added to this is that many models do not necessarily agree with the final hardware they are supposed to emulate. Further, these models are complex due to the presence of various filters, such as a decision feedback equalizer (DFE) and continuous-time linear equalizer (CTLE), which enable the correct operation of the receiver. Other data-driven approaches tackle receiver modeling through multiple models to account for as many configurations as possible. This work proposes a data-driven approach using generative adversarial training to model a real-world receiver with varying DFE and CTLE configurations while handling different channel conditions and bitstreams. The approach is highly accurate as the eye height and width are within 1.59% and 1.12% of the ground truth. The horizontal and vertical bathtub curves match the ground truth and correlate to the ground truth bathtub curves.}, journal={MLCAD '22: PROCEEDINGS OF THE 2022 ACM/IEEE 4TH WORKSHOP ON MACHINE LEARNING FOR CAD (MLCAD)}, publisher={ACM}, author={Kashyap, Priyank and Gajjar, Archit and Choi, Yongjin and Wong, Chau-Wai and Baron, Dror and Wu, Tianfu and Cheng, Chris and Franzon, Paul}, year={2022}, pages={167–172} } @article{li_peng_wu_peng_2021, title={A Bottom-Up and Top-Down Integration Framework for Online Object Tracking}, volume={23}, ISSN={["1941-0077"]}, DOI={10.1109/TMM.2020.2978623}, abstractNote={Robust online object tracking entails integrating short-term memory based trackers and long-term memory based trackers in an elegant framework to handle structural and appearance variations of unknown objects in an online manner. The integration and synergy between short-term and long-term memory based trackers have yet studied well in the literature, especially in pre-training free settings. To address this issue, this paper presents a bottom-up and top-down integration framework. The bottom-up component realizes a data-driven approach for particle generation. It exploits a short-term memory based tracker to generate bounding box proposals in a new frame. In the top-down component, this paper presents a graph regularized sparse coding scheme as the long-term memory based tracker. The over-complete bases for sparse coding are composed of part-based representations learned from earlier tracking results and new observations to form a space with rich temporal context information. 
A particle graph is computed whose nodes are the bottom-up discriminative particles and edges are formed on-the-fly in terms of appearance and spatial-temporal similarities between particles. The particle graph induces a regularization term in optimizing the sparse coding coefficients for bottom-up particles. In experiments, the proposed method is tested on the widely used OTB-100 benchmark and the VOT2016 benchmark with better performance obtained than baselines including deep learning based trackers. In addition, the outputs from the top-down sparse coding are potentially useful for downstream tasks such as action recognition, multiple-object tracking, and object re-identification.}, journal={IEEE TRANSACTIONS ON MULTIMEDIA}, author={Li, Meihui and Peng, Lingbing and Wu, Tianfu and Peng, Zhenming}, year={2021}, pages={105–119} } @article{roheda_krim_luo_wu_2021, title={Event driven sensor fusion}, volume={188}, ISSN={["1872-7557"]}, DOI={10.1016/j.sigpro.2021.108241}, abstractNote={Multi sensor fusion has long been of interest in target detection and tracking. Different sensors are capable of observing different characteristics about a target, hence, providing additional information toward determining a target’s identity. If used constructively, any additional information should have a positive impact on the performance of the system. In this paper, we consider such a scenario and present a principled approach toward ensuring constructive combination of the various sensors. We look at Decision Level Sensor Fusion under a different light wherein each sensor is said to make a decision on occurrence of certain events that it is capable of observing rather than making a decision on whether a certain target is present. These events are formalized to each sensor according to its potentially extracted attributes to define targets. The proposed technique also explores the extent of dependence between features/events being observed by the sensors, and hence generates more informed probability distributions over the events. In our case, we will study two different datasets. The first one, combines a Radar sensor with an optical sensor for detection of space debris, while the second one combines a seismic sensor with an acoustic sensor in order to detect human and vehicular targets in a field of interest. Provided some additional information about the features of the object, this fusion technique can outperform other existing decision level fusion approaches that may not take into account the relationship between different features. Furthermore, this paper also addresses the issue of coping with damaged sensors when using the model, by learning a hidden space between sensor modalities which can be exploited to safeguard detection performance.}, journal={SIGNAL PROCESSING}, author={Roheda, Siddharth and Krim, Hamid and Luo, Zhi-Quan and Wu, Tianfu}, year={2021}, month={Nov} } @article{kashyap_pitts_baron_wong_wu_franzon_2021, title={High Speed Receiver Modeling Using Generative Adversarial Networks}, ISSN={["2165-4107"]}, DOI={10.1109/EPEPS51341.2021.9609124}, abstractNote={This paper presents a generative approach to modeling a high-speed receiver with a time series input. The model is not built with domain knowledge but learned from a wide range of channel conditions and input bitstreams to generate an eye diagram. The generated eye diagrams are similar to the simulated eye diagrams for the same scenario. 
We also developed a neural network model to evaluate the generated eye diagram's relevant characteristics, such as eye height and width. The generated eye diagrams are within 7% and 3% error to the ground-truth in eye height and eye width, respectively, based on our evaluation neural network.}, journal={IEEE 30TH CONFERENCE ON ELECTRICAL PERFORMANCE OF ELECTRONIC PACKAGING AND SYSTEMS (EPEPS 2021)}, author={Kashyap, Priyank and Pitts, W. Shepherd and Baron, Dror and Wong, Chau-Wai and Wu, Tianfu and Franzon, Paul D.}, year={2021} } @article{xue_bai_wang_xia_wu_zhang_torr_2021, title={Learning Regional Attraction for Line Segment Detection}, volume={43}, ISSN={["1939-3539"]}, DOI={10.1109/TPAMI.2019.2958642}, abstractNote={This paper presents regional attraction of line segment maps, and hereby poses the problem of line segment detection (LSD) as a problem of region coloring. Given a line segment map, the proposed regional attraction first establishes the relationship between line segments and regions in the image lattice. Based on this, the line segment map is equivalently transformed to an attraction field map (AFM), which can be remapped to a set of line segments without loss of information. Accordingly, we develop an end-to-end framework to learn attraction field maps for raw input images, followed by a squeeze module to detect line segments. Apart from existing works, the proposed detector properly handles the local ambiguity and does not rely on the accurate identification of edge pixels. Comprehensive experiments on the Wireframe dataset and the YorkUrban dataset demonstrate the superiority of our method. In particular, we achieve an F-measure of 0.831 on the Wireframe dataset, advancing the state-of-the-art performance by 10.3 percent.}, number={6}, journal={IEEE TRANSACTIONS ON PATTERN ANALYSIS AND MACHINE INTELLIGENCE}, author={Xue, Nan and Bai, Song and Wang, Fu-Dong and Xia, Gui-Song and Wu, Tianfu and Zhang, Liangpei and Torr, Philip H. S.}, year={2021}, month={Jun}, pages={1998–2013} } @article{chen_dutton_ramachandra_wu_vatsavai_2021, title={Local Clustering with Mean Teacher for Semi-supervised learning}, ISSN={["1051-4651"]}, DOI={10.1109/ICPR48806.2021.9412469}, abstractNote={The Mean Teacher (MT) model of Tarvainen and Valpola has shown good performance on several semi-supervised benchmark datasets. MT maintains a teacher model's weights as the exponential moving average of a student model's weights and minimizes the divergence between their probability predictions under diverse perturbations of the inputs. However, MT is known to suffer from confirmation bias, that is, reinforcing incorrect teacher model predictions. In this work, we propose a simple yet effective method called Local Clustering (LC) to mitigate the effect of confirmation bias. In MT, each data point is considered independent of other points during training; however, data points are likely to be close to each other in feature space if they share similar features. Motivated by this, we cluster data points locally by minimizing the pairwise distance between neighboring data points in feature space. Combined with a standard classification cross-entropy objective on labeled data points, the misclassified unlabeled data points are pulled towards high-density regions of their correct class with the help of their neighbors, thus improving model performance. 
We demonstrate on semi-supervised benchmark datasets SVHN and CIFAR-10 that adding our LC loss to MT yields significant improvements compared to MT and performance comparable to the state of the art in semi-supervised learning. The code is available at: https://github.com/jay1204/local_clustering_with_mt_for_ssl.}, journal={2020 25TH INTERNATIONAL CONFERENCE ON PATTERN RECOGNITION (ICPR)}, author={Chen, Zexi and Dutton, Benjamin and Ramachandra, Bharathkumar and Wu, Tianfu and Vatsavai, Ranga Raju}, year={2021}, pages={6243–6250} } @article{tan_xue_bai_wu_xia_2021, title={PlaneTR: Structure-Guided Transformers for 3D Plane Recovery}, DOI={10.1109/ICCV48922.2021.00415}, abstractNote={This paper presents a neural network built upon Transformers, namely PlaneTR, to simultaneously detect and reconstruct planes from a single image. Different from previous methods, PlaneTR jointly leverages the context information and the geometric structures in a sequence-to-sequence way to holistically detect plane instances in one forward pass. Specifically, we represent the geometric structures as line segments and conduct the network with three main components: (i) context and line segments encoders, (ii) a structure-guided plane decoder, (iii) a pixel-wise plane embedding decoder. Given an image and its detected line segments, PlaneTR generates the context and line segment sequences via two specially designed encoders and then feeds them into a Transformers-based decoder to directly predict a sequence of plane instances by simultaneously considering the context and global structure cues. Finally, the pixel-wise embeddings are computed to assign each pixel to one predicted plane instance which is nearest to it in embedding space. Comprehensive experiments demonstrate that PlaneTR achieves state-of-the-art performance on the ScanNet and NYUv2 datasets.}, journal={2021 IEEE/CVF INTERNATIONAL CONFERENCE ON COMPUTER VISION (ICCV 2021)}, author={Tan, Bin and Xue, Nan and Bai, Song and Wu, Tianfu and Xia, Gui-Song}, year={2021}, pages={4166–4175} } @article{cai_wu_lu_prieto_rosenbaum_stringer_jiang_2021, title={Quantitative Study on Error Sensitivity in Ultrasound Probe Calibration with Hybrid Tracking}, ISSN={["1948-5719"]}, DOI={10.1109/IUS52206.2021.9593708}, abstractNote={Three-dimensional (3D) freehand ultrasound (US) imaging enabled by the external tracking system requires an accurate calibration process to transform the tracked motion information from the markers to the US frames. The previously proposed phantomless calibration method can be further improved using both optical tracking and image-based tracking. This study provides a quantitative analysis on the error sensitivity before implementing the image-based tracking during the calibration process. A linear relationship was found between the perturbation in imaging plane motion estimation and the error caused in the calibration solution. The error to perturbation ratio was within 0.5 in most cases and can reach up to around 0.9 in some poor cases. The overall analysis showed a good error tolerance for the hybrid tracking enabled US probe calibration.}, journal={INTERNATIONAL ULTRASONICS SYMPOSIUM (IEEE IUS 2021)}, author={Cai, Qianqian and Wu, Tianfu and Lu, Jian-yu and Prieto, Juan C. and Rosenbaum, Alan J. and Stringer, Jeffrey S. A.
and Jiang, Xiaoning}, year={2021} } @article{xue_wu_bai_wang_xia_zhang_torr_2020, title={Holistically-Attracted Wireframe Parsing}, ISSN={["1063-6919"]}, DOI={10.1109/CVPR42600.2020.00286}, abstractNote={This paper presents a fast and parsimonious parsing method to accurately and robustly detect a vectorized wireframe in an input image with a single forward pass. The proposed method is end-to-end trainable, consisting of three components: (i) line segment and junction proposal generation, (ii) line segment and junction matching, and (iii) line segment and junction verification. For computing line segment proposals, a novel exact dual representation is proposed which exploits a parsimonious geometric reparameterization for line segments and forms a holistic 4-dimensional attraction field map for an input image. Junctions can be treated as the “basins” in the attraction field. The proposed method is thus called Holistically-Attracted Wireframe Parser (HAWP). In experiments, the proposed method is tested on two benchmarks, the Wireframe dataset [14] and the YorkUrban dataset [8]. On both benchmarks, it obtains state-of-the-art performance in terms of accuracy and efficiency. For example, on the Wireframe dataset, compared to the previous state-of-the-art method L-CNN [36], it improves the challenging mean structural average precision (msAP) by a large margin (2.8% absolute improvements), and achieves 29.5 FPS on a single GPU (89% relative improvement). A systematic ablation study is performed to further justify the proposed method.}, journal={2020 IEEE/CVF CONFERENCE ON COMPUTER VISION AND PATTERN RECOGNITION (CVPR)}, author={Xue, Nan and Wu, Tianfu and Bai, Song and Wang, Fudong and Xia, Gui-Song and Zhang, Liangpei and Torr, Philip H. S.}, year={2020}, pages={2785–2794} } @article{li_song_wu_2019, title={AOGNets: Compositional Grammatical Architectures for Deep Learning}, ISSN={["1063-6919"]}, DOI={10.1109/CVPR.2019.00638}, abstractNote={Neural architectures are the foundation for improving performance of deep neural networks (DNNs). This paper presents deep compositional grammatical architectures which harness the best of two worlds: grammar models and DNNs. The proposed architectures integrate compositionality and reconfigurability of the former and the capability of learning rich features of the latter in a principled way. We utilize AND-OR Grammar (AOG) as network generator in this paper and call the resulting networks AOGNets. An AOGNet consists of a number of stages each of which is composed of a number of AOG building blocks. An AOG building block splits its input feature map into N groups along feature channels and then treat it as a sentence of N words. It then jointly realizes a phrase structure grammar and a dependency grammar in bottom-up parsing the “sentence” for better feature exploration and reuse. It provides a unified framework for the best practices developed in state-of-the-art DNNs. In experiments, AOGNet is tested in the ImageNet-1K classification benchmark and the MS-COCO object detection and segmentation benchmark. In ImageNet-1K, AOGNet obtains better performance than ResNet and most of its variants, ResNeXt and its attention based variants such as SENet, DenseNet and DualPathNet. AOGNet also obtains the best model interpretability score using network dissection. AOGNet further shows better potential in adversarial defense. 
In MS-COCO, AOGNet obtains better performance than the ResNet and ResNeXt backbones in Mask R-CNN.}, journal={2019 IEEE/CVF CONFERENCE ON COMPUTER VISION AND PATTERN RECOGNITION (CVPR 2019)}, author={Li, Xilai and Song, Xi and Wu, Tianfu}, year={2019}, pages={6213–6223} } @article{sun_wu_2019, title={Image Synthesis From Reconfigurable Layout and Style}, ISSN={["1550-5499"]}, DOI={10.1109/ICCV.2019.01063}, abstractNote={Despite remarkable recent progress on both unconditional and conditional image synthesis, it remains a long-standing problem to learn generative models that are capable of synthesizing realistic and sharp images from reconfigurable spatial layout (i.e., bounding boxes + class labels in an image lattice) and style (i.e., structural and appearance variations encoded by latent vectors), especially at high resolution. By reconfigurable, it means that a model can preserve the intrinsic one-to-many mapping from a given layout to multiple plausible images with different styles, and is adaptive with respect to perturbations of a layout and style latent code. In this paper, we present a layout- and style-based architecture for generative adversarial networks (termed LostGANs) that can be trained end-to-end to generate images from reconfigurable layout and style. Inspired by the vanilla StyleGAN, the proposed LostGAN consists of two new components: (i) learning fine-grained mask maps in a weakly-supervised manner to bridge the gap between layouts and images, and (ii) learning object instance-specific layout-aware feature normalization (ISLA-Norm) in the generator to realize multi-object style generation. In experiments, the proposed method is tested on the COCO-Stuff dataset and the Visual Genome dataset with state-of-the-art performance obtained. The code and pretrained models are available at https://github.com/iVMCL/LostGANs.}, journal={2019 IEEE/CVF INTERNATIONAL CONFERENCE ON COMPUTER VISION (ICCV 2019)}, author={Sun, Wei and Wu, Tianfu}, year={2019}, pages={10530–10539} } @article{xie_wu_yang_zhang_wu_2019, title={Jointly social grouping and identification in visual dynamics with causality-induced hierarchical Bayesian model}, volume={59}, ISSN={["1095-9076"]}, DOI={10.1016/j.jvcir.2019.01.006}, abstractNote={We concentrate on modeling the person-person interactions for group activity recognition. In order to solve the complexity and ambiguity problems caused by a large number of human objects, we propose a causality-induced hierarchical Bayesian model to tackle the interaction activity video, referring to the “what” interaction activities happen, “where” interaction atomic occurs in spatial, and “when” group interaction happens in temporal. In particular, Granger Causality has been characterized with multiple features to encode the interacting relationships between each individual in the group. Furthermore, to detect and identify the concurrent interactive simultaneously, we investigate the Relative Entropy as a metric to measure the reasonable motion dependency between two arbitrary individuals. Filtered by the causality dependency, causality motion features have been cast as the multiplicative probabilistic ingredients in Bayesian factors to formulate the compact learned latent interaction patterns aggregately that enable the power of discrimination.
Experiments demonstrate our model outperforms state-of-the-art models.}, journal={JOURNAL OF VISUAL COMMUNICATION AND IMAGE REPRESENTATION}, author={Xie, Zhao and Wu, Tianfu and Yang, Xingming and Zhang, Luming and Wu, Kewei}, year={2019}, month={Feb}, pages={62–75} } @article{xue_bai_wang_xia_wu_zhang_2019, title={Learning Attraction Field Representation for Robust Line Segment Detection}, ISSN={["1063-6919"]}, DOI={10.1109/CVPR.2019.00169}, abstractNote={This paper presents a region-partition based attraction field dual representation for line segment maps, and thus poses the problem of line segment detection (LSD) as the region coloring problem. The latter is then addressed by learning deep convolutional neural networks (ConvNets) for accuracy, robustness and efficiency. For a 2D line segment map, our dual representation consists of three components: (i) A region-partition map in which every pixel is assigned to one and only one line segment; (ii) An attraction field map in which every pixel in a partition region is encoded by its 2D projection vector w.r.t. the associated line segment; and (iii) A squeeze module which squashes the attraction field to a line segment map that almost perfectly recovers the input one. By leveraging the duality, we learn ConvNets to compute the attraction field maps for raw input images, followed by the squeeze module for LSD, in an end-to-end manner. Our method rigorously addresses several challenges in LSD such as local ambiguity and class imbalance. Our method also harnesses the best practices developed in ConvNets based semantic segmentation methods such as the encoder-decoder architecture and the a-trous convolution. In experiments, our method is tested on the WireFrame dataset and the YorkUrban dataset with state-of-the-art performance obtained. Especially, we advance the performance by 4.5 percent on the WireFrame dataset. Our method is also fast with 6.6∼10.4 FPS, outperforming most of existing line segment detectors.}, journal={2019 IEEE/CVF CONFERENCE ON COMPUTER VISION AND PATTERN RECOGNITION (CVPR 2019)}, author={Xue, Nan and Bai, Song and Wang, Fudong and Xia, Gui-Song and Wu, Tianfu and Zhang, Liangpei}, year={2019}, pages={1595–1603} } @article{li_xiong_wu_zhou_zhang_chu_2019, title={Neural Abstract Style Transfer for Chinese Traditional Painting}, volume={11362}, ISBN={["978-3-030-20889-9"]}, ISSN={["1611-3349"]}, DOI={10.1007/978-3-030-20890-5_14}, abstractNote={Chinese traditional painting is one of the most historical artworks in the world. It is very popular in Eastern and Southeast Asia due to being aesthetically appealing. Compared with western artistic painting, it is usually more visually abstract and textureless. Recently, neural network based style transfer methods have shown promising and appealing results which are mainly focused on western painting. It remains a challenging problem to preserve abstraction in neural style transfer. In this paper, we present a Neural Abstract Style Transfer method for Chinese traditional painting. It learns to preserve abstraction and other style jointly end-to-end via a novel MXDoG-guided filter (Modified version of the eXtended Difference-of-Gaussians) and three fully differentiable loss terms. To the best of our knowledge, there is little work study on neural style transfer of Chinese traditional painting.
To promote research on this direction, we collect a new dataset with diverse photo-realistic images and Chinese traditional paintings (The dataset will be released at https://github.com/lbsswu/Chinese_style_transfer .). In experiments, the proposed method shows more appealing stylized results in transferring the style of Chinese traditional painting than state-of-the-art neural style transfer methods.}, journal={COMPUTER VISION - ACCV 2018, PT II}, author={Li, Bo and Xiong, Caiming and Wu, Tianfu and Zhou, Yu and Zhang, Lun and Chu, Rufeng}, year={2019}, pages={212–227} } @article{wu_song_2019, title={Towards Interpretable Object Detection by Unfolding Latent Structures}, ISSN={["1550-5499"]}, DOI={10.1109/ICCV.2019.00613}, abstractNote={This paper first proposes a method of formulating model interpretability in visual understanding tasks based on the idea of unfolding latent structures. It then presents a case study in object detection using popular two-stage region-based convolutional network (i.e., R-CNN) detection systems. The proposed method focuses on weakly-supervised extractive rationale generation, that is learning to unfold latent discriminative part configurations of object instances automatically and simultaneously in detection without using any supervision for part configurations. It utilizes a top-down hierarchical and compositional grammar model embedded in a directed acyclic AND-OR Graph (AOG) to explore and unfold the space of latent part configurations of regions of interest (RoIs). It presents an AOGParsing operator that seamlessly integrates with the RoIPooling/RoIAlign operator widely used in R-CNN and is trained end-to-end. In object detection, a bounding box is interpreted by the best parse tree derived from the AOG on-the-fly, which is treated as the qualitatively extractive rationale generated for interpreting detection. In experiments, Faster R-CNN is used to test the proposed method on the PASCAL VOC 2007 and the COCO 2017 object detection datasets. The experimental results show that the proposed method can compute promising latent structures without hurting the performance. The code and pretrained models are available at https://github.com/iVMCL/iRCNN.}, journal={2019 IEEE/CVF INTERNATIONAL CONFERENCE ON COMPUTER VISION (ICCV 2019)}, author={Wu, Tianfu and Song, Xi}, year={2019}, pages={6032–6042} } @article{asadi_ramshankar_pullagurla_bhandare_shanbhag_mehta_kundu_han_lobaton_wu_2018, title={Vision-based integrated mobile robotic system for real-time applications in construction}, volume={96}, ISSN={0926-5805}, url={http://dx.doi.org/10.1016/J.AUTCON.2018.10.009}, DOI={10.1016/J.AUTCON.2018.10.009}, abstractNote={To increase the degree of automation and frequency of data collection for monitoring construction sites, there has been a rapid increase in the number of studies, in the past few years, that developed and/or examined mobile robotic applications in construction. These vision-based platforms capable of autonomous navigation and scene understanding are becoming essential in many construction applications, namely construction sites surveying, work-in-progress monitoring, and existing structure inspection. Simultaneous Localization and Mapping (SLAM) and object recognition for proper context-aware motion planning are some of the core vision techniques that are driving innovation for these robotic systems. 
To characterize the limitations of current techniques on real-time performance and identify challenges in integration and implementation for construction applications, this paper proposes a mobile robotic platform that incorporates a stack of embedded platforms with integrated Graphical Processing Units (GPUs). This paper presents three case studies to evaluate the performance of the proposed system. The results demonstrate the robustness and feasibility of developing and deploying an autonomous system in the near future.}, journal={Automation in Construction}, publisher={Elsevier BV}, author={Asadi, Khashayar and Ramshankar, Hariharan and Pullagurla, Harish and Bhandare, Aishwarya and Shanbhag, Suraj and Mehta, Pooja and Kundu, Spondon and Han, Kevin and Lobaton, Edgar and Wu, Tianfu}, year={2018}, month={Dec}, pages={470–482} } @article{wu_lu_zhu_2017, title={Online Object Tracking, Learning and Parsing with And-Or Graphs}, volume={39}, ISSN={["1939-3539"]}, DOI={10.1109/tpami.2016.2644963}, abstractNote={This paper presents a method, called AOGTracker, for simultaneously tracking, learning and parsing (TLP) of unknown objects in video sequences with a hierarchical and compositional And-Or graph (AOG) representation. The TLP method is formulated in the Bayesian framework with a spatial and a temporal dynamic programming (DP) algorithms inferring object bounding boxes on-the-fly. During online learning, the AOG is discriminatively learned using latent SVM [1] to account for appearance (e.g., lighting and partial occlusion) and structural (e.g., different poses and viewpoints) variations of a tracked object, as well as distractors (e.g., similar objects) in background. Three key issues in online inference and learning are addressed: (i) maintaining purity of positive and negative examples collected online, (ii) controlling model complexity in latent structure learning, and (iii) identifying critical moments to re-learn the structure of AOG based on its intrackability. The intrackability measures uncertainty of an AOG based on its score maps in a frame. In experiments, our AOGTracker is tested on two popular tracking benchmarks with the same parameter setting: the TB-100/50/CVPR2013 benchmarks [2], [3], and the VOT benchmarks [4]—VOT 2013, 2014, 2015 and TIR2015 (thermal imagery tracking). In the former, our AOGTracker outperforms state-of-the-art tracking algorithms including two trackers based on deep convolutional network [5], [6]. In the latter, our AOGTracker outperforms all other trackers in VOT2013 and is comparable to the state-of-the-art methods in VOT2014, 2015 and TIR2015.}, number={12}, journal={IEEE TRANSACTIONS ON PATTERN ANALYSIS AND MACHINE INTELLIGENCE}, author={Wu, Tianfu and Lu, Yang and Zhu, Song-Chun}, year={2017}, month={Dec}, pages={2465–2480} } @article{zhao_wu_wu_wang_2017, title={Zero-Shot Learning posed as a Missing Data Problem}, ISSN={["2473-9936"]}, DOI={10.1109/iccvw.2017.310}, abstractNote={This paper presents a method of zero-shot learning (ZSL) which poses ZSL as the missing data problem, rather than the missing label problem. Specifically, most existing ZSL methods focus on learning mapping functions from the image feature space to the label embedding space. Whereas, the proposed method explores a simple yet effective transductive framework in the reverse way - our method estimates data distribution of unseen classes in the image feature space by transferring knowledge from the label embedding space.
Following the transductive setting, we leverage unlabeled data to refine the initial estimation. In experiments, our method achieves the highest classification accuracies on two popular datasets, namely, 96.00% on AwA and 60.24% on CUB.}, journal={2017 IEEE INTERNATIONAL CONFERENCE ON COMPUTER VISION WORKSHOPS (ICCVW 2017)}, author={Zhao, Bo and Wu, Botong and Wu, Tianfu and Wang, Yizhou}, year={2017}, pages={2616–2622} } @article{zhu_wu_zhu_yang_zhang_2016, title={A Reconfigurable Tangram Model for Scene Representation and Categorization}, volume={25}, ISSN={1057-7149 1941-0042}, url={http://dx.doi.org/10.1109/tip.2015.2498407}, DOI={10.1109/tip.2015.2498407}, abstractNote={This paper presents a hierarchical and compositional scene layout (i.e., spatial configuration) representation and a method of learning reconfigurable model for scene categorization. Three types of shape primitives (i.e., triangle, parallelogram, and trapezoid), called tans, are used to tile scene image lattice in a hierarchical and compositional way, and a directed acyclic AND-OR graph (AOG) is proposed to organize the overcomplete dictionary of tan instances placed in image lattice, exploring a very large number of scene layouts. With certain off-the-shelf appearance features used for grounding terminal-nodes (i.e., tan instances) in the AOG, a scene layout is represented by the globally optimal parse tree learned via a dynamic programming algorithm from the AOG, which we call tangram model. Then, a scene category is represented by a mixture of tangram models discovered with an exemplar-based clustering method. On basis of the tangram model, we address scene categorization in two aspects: 1) building a tangram bank representation for linear classifiers, which utilizes a collection of tangram models learned from all categories and 2) building a tangram matching kernel for kernel-based classification, which accounts for all hidden spatial configurations in the AOG. In experiments, our methods are evaluated on three scene data sets for both the configuration-level and semantic-level scene categorization, and outperform the spatial pyramid model consistently.}, number={1}, journal={IEEE Transactions on Image Processing}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Zhu, Jun and Wu, Tianfu and Zhu, Song-Chun and Yang, Xiaokang and Zhang, Wenjun}, year={2016}, month={Jan}, pages={150–166} } @article{li_sun_wu_wang_2016, title={Face Detection with End-to-End Integration of a ConvNet and a 3D Model}, volume={9907}, ISBN={["978-3-319-46486-2"]}, ISSN={["1611-3349"]}, DOI={10.1007/978-3-319-46487-9_26}, abstractNote={This paper presents a method for face detection in the wild, which integrates a ConvNet and a 3D mean face model in an end-to-end multi-task discriminative learning framework. The 3D mean face model is predefined and fixed (e.g., we used the one provided in the AFLW dataset). The ConvNet consists of two components: (i) The face proposal component computes face bounding box proposals via estimating facial key-points and the 3D transformation (rotation and translation) parameters for each predicted key-point w.r.t. the 3D mean face model. (ii) The face verification component computes detection results by pruning and refining proposals based on facial key-points based configuration pooling. 
The proposed method addresses two issues in adapting state-of-the-art generic object detection ConvNets (e.g., faster R-CNN) for face detection: (i) One is to eliminate the heuristic design of predefined anchor boxes in the region proposals network (RPN) by exploiting a 3D mean face model. (ii) The other is to replace the generic RoI (Region-of-Interest) pooling layer with a configuration pooling layer to respect underlying object structures. The multi-task loss consists of three terms: the classification Softmax loss and the location smooth $l_1$-losses of both the facial key-points and the face bounding boxes. In experiments, our ConvNet is trained on the AFLW dataset only and tested on the FDDB benchmark with fine-tuning and on the AFW benchmark without fine-tuning. The proposed method obtains very competitive state-of-the-art performance in the two benchmarks.}, journal={COMPUTER VISION - ECCV 2016, PT III}, author={Li, Yunzhu and Sun, Benyuan and Wu, Tianfu and Wang, Yizhou}, year={2016}, pages={420–436} } @article{wu_li_zhu_2016, title={Learning And-Or Model to Represent Context and Occlusion for Car Detection and Viewpoint Estimation}, volume={38}, ISSN={0162-8828 2160-9292 1939-3539}, url={http://dx.doi.org/10.1109/tpami.2015.2497699}, DOI={10.1109/tpami.2015.2497699}, abstractNote={This paper presents a method for learning an And-Or model to represent context and occlusion for car detection and viewpoint estimation. The learned And-Or model represents car-to-car context and occlusion configurations at three levels: (i) spatially-aligned cars, (ii) single car under different occlusion configurations, and (iii) a small number of parts. The And-Or model embeds a grammar for representing large structural and appearance variations in a reconfigurable hierarchy. The learning process consists of two stages in a weakly supervised way (i.e., only bounding boxes of single cars are annotated). First, the structure of the And-Or model is learned with three components: (a) mining multi-car contextual patterns based on layouts of annotated single car bounding boxes, (b) mining occlusion configurations between single cars, and (c) learning different combinations of part visibility based on CAD simulations. The And-Or model is organized in a directed and acyclic graph which can be inferred by Dynamic Programming. Second, the model parameters (for appearance, deformation and bias) are jointly trained using Weak-Label Structural SVM. In experiments, we test our model on four car detection datasets: the KITTI dataset [1], the PASCAL VOC2007 car dataset [2], and two self-collected car datasets, namely the Street-Parking car dataset and the Parking-Lot car dataset, and three datasets for car viewpoint estimation: the PASCAL VOC2006 car dataset [2], the 3D car dataset [3], and the PASCAL3D+ car dataset [4].
Compared with state-of-the-art variants of deformable part-based models and other methods, our model achieves significant improvement consistently on the four detection datasets, and comparable performance on car viewpoint estimation.}, number={9}, journal={IEEE Transactions on Pattern Analysis and Machine Intelligence}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Wu, Tianfu and Li, Bo and Zhu, Song-Chun}, year={2016}, month={Sep}, pages={1829–1843} } @article{wu_zhu_2015, title={Learning Near-Optimal Cost-Sensitive Decision Policy for Object Detection}, volume={37}, ISSN={0162-8828 2160-9292}, url={http://dx.doi.org/10.1109/tpami.2014.2359653}, DOI={10.1109/tpami.2014.2359653}, abstractNote={Many popular object detectors, such as AdaBoost, SVM and deformable part-based models (DPM), compute additive scoring functions at a large number of windows in an image pyramid, thus computational efficiency is an important consideration in real time applications besides accuracy. In this paper, a decision policy refers to a sequence of two-sided thresholds to execute early reject and early accept based on the cumulative scores at each step. We formulate an empirical risk function as the weighted sum of the cost of computation and the loss of false alarm and missing detection. Then a policy is said to be cost-sensitive and optimal if it minimizes the risk function. While the risk function is complex due to high-order correlations among the two-sided thresholds, we find that its upper bound can be optimized by dynamic programming efficiently. We show that the upper bound is very tight empirically and thus the resulting policy is said to be near-optimal. In experiments, we show that the decision policy outperforms state-of-the-art cascade methods significantly, in several popular detection tasks and benchmarks, in terms of computational efficiency with similar accuracy of detection.}, number={5}, journal={IEEE Transactions on Pattern Analysis and Machine Intelligence}, publisher={Institute of Electrical and Electronics Engineers (IEEE)}, author={Wu, Tianfu and Zhu, Song-Chun}, year={2015}, month={May}, pages={1013–1027} } @article{yu_liu_wu_li_fu_2014, title={A global energy optimization framework for 2.1D sketch extraction from monocular images}, volume={76}, ISSN={1524-0703}, url={http://dx.doi.org/10.1016/j.gmod.2014.03.015}, DOI={10.1016/j.gmod.2014.03.015}, abstractNote={The 2.1D sketch is a layered image representation, which assigns a partial depth ordering of over-segmented regions in a monocular image. This paper presents a global optimization framework for inferring the 2.1D sketch from a monocular image. Our method only uses over-segmented image regions (i.e., superpixels) as input, without any information of objects in the image, since (1) segmenting objects in images is a difficult problem on its own and (2) the objective of our proposed method is to be generic as an initial module useful for downstream high-level vision tasks. This paper formulates the inference of the 2.1D sketch using a global energy optimization framework. 
The proposed energy function consists of two components: (1) one is defined based on the local partial ordering relations (i.e., figure-ground) between two adjacent over-segmented regions, which captures the marginal information of the global partial depth ordering and (2) the other is defined based on the same depth layer relations among all the over-segmented regions, which groups regions of the same object to account for the over-segmentation issues. A hybrid evolution algorithm is utilized to minimize the global energy function efficiently. In experiments, we evaluated our method on a test data set containing 100 diverse real images from Berkeley segmentation data set (BSDS500) with the annotated ground truth. Experimental results show that our method can infer the 2.1D sketch with high accuracy.}, number={5}, journal={Graphical Models}, publisher={Elsevier BV}, author={Yu, Cheng-Chi and Liu, Yong-Jin and Wu, Matt Tianfu and Li, Kai-Yun and Fu, Xiaolan}, year={2014}, month={Sep}, pages={507–521} } @article{li_song_wu_hu_pei_2014, title={Coupling-and-decoupling: A hierarchical model for occlusion-free object detection}, volume={47}, ISSN={0031-3203}, url={http://dx.doi.org/10.1016/j.patcog.2014.04.016}, DOI={10.1016/j.patcog.2014.04.016}, abstractNote={Handling occlusion is a very challenging problem in object detection. This paper presents a method of learning a hierarchical model for X-to-X occlusion-free object detection (e.g., car-to-car and person-to-person occlusions in our experiments). The proposed method is motivated by an intuitive coupling-and-decoupling strategy. In the learning stage, the pair of occluding X's (e.g., car pairs or person pairs) is represented directly and jointly by a hierarchical And-Or directed acyclic graph (AOG) which accounts for the statistically significant co-occurrence (i.e., coupling). The structure and the parameters of the AOG are learned using the latent structural SVM (LSSVM) framework. In detection, a dynamic programming (DP) algorithm is utilized to find the best parse trees for all sliding windows with detection scores being greater than the learned threshold. Then, the two single X's are decoupled from the declared detections of X-to-X occluding pairs together with some non-maximum suppression (NMS) post-processing. In experiments, our method is tested on both a roadside-car dataset collected by ourselves (which will be released with this paper) and two public person datasets, the MPII-2Person dataset and the TUD-Crossing dataset. Our method is compared with state-of-the-art deformable part-based methods, and obtains comparable or better detection performance.}, number={10}, journal={Pattern Recognition}, publisher={Elsevier BV}, author={Li, Bo and Song, Xi and Wu, Tianfu and Hu, Wenze and Pei, Mingtao}, year={2014}, month={Oct}, pages={3254–3264} } @inbook{li_wu_zhu_2014, place={Cham, Switzerland}, series={Lecture Notes in Computer Science}, title={Integrating Context and Occlusion for Car Detection by Hierarchical And-Or Model}, ISBN={9783319105987 9783319105994}, ISSN={0302-9743 1611-3349}, url={http://dx.doi.org/10.1007/978-3-319-10599-4_42}, DOI={10.1007/978-3-319-10599-4_42}, abstractNote={This paper presents a method of learning reconfigurable hierarchical And-Or models to integrate context and occlusion for car detection.
The And-Or model represents the regularities of car-to-car context and occlusion patterns at three levels: (i) layouts of spatially-coupled N cars, (ii) single cars with different viewpoint-occlusion configurations, and (iii) a small number of parts. The learning process consists of two stages. We first learn the structure of the And-Or model with three components: (a) mining N-car contextual patterns based on layouts of annotated single car bounding boxes, (b) mining the occlusion configurations based on the overlapping statistics between single cars, and (c) learning visible parts based on car 3D CAD simulation or heuristically mining latent car parts. The And-Or model is organized into a directed and acyclic graph which leads to the Dynamic Programming algorithm in inference. In the second stage, we jointly train the model parameters (for appearance, deformation and bias) using Weak-Label Structural SVM. In experiments, we test our model on four car datasets: the KITTI dataset [11], the street parking dataset [19], the PASCAL VOC2007 car dataset [7], and a self-collected parking lot dataset. We compare with state-of-the-art variants of deformable part-based models and other methods. Our model obtains significant improvement consistently on the four datasets.}, booktitle={Computer Vision – ECCV 2014}, publisher={Springer International Publishing}, author={Li, Bo and Wu, Tianfu and Zhu, Song-Chun}, editor={Fleet, D. and Pajdla, T. and Schiele, B. and Tuytelaars, T.}, year={2014}, pages={652–667}, collection={Lecture Notes in Computer Science} } @article{barbu_wu_wu_2014, title={Learning mixtures of Bernoulli templates by two-round EM with performance guarantee}, volume={8}, ISSN={1935-7524}, url={http://dx.doi.org/10.1214/14-ejs981}, DOI={10.1214/14-ejs981}, abstractNote={Dasgupta and Shulman showed that a two-round variant of the EM algorithm can learn a mixture of Gaussian distributions with near-optimal precision with high probability if the Gaussian distributions are well separated and if the dimension is sufficiently high. In this paper, we generalize their theory to learning mixtures of high-dimensional Bernoulli templates. Each template is a binary vector, and a template generates examples by randomly switching its binary components independently with a certain probability. In computer vision applications, a binary vector is a feature map of an image, where each binary component indicates whether a local feature or structure is present or absent within a certain cell of the image domain. A Bernoulli template can be considered as a statistical model for images of objects (or parts of objects) from the same category. We show that the two-round EM algorithm can learn mixtures of Bernoulli templates with near-optimal precision with high probability, if the Bernoulli templates are sufficiently different and if the number of features is sufficiently high.
We illustrate the theoretical results by synthetic and real examples.}, number={2}, journal={Electronic Journal of Statistics}, publisher={Institute of Mathematical Statistics}, author={Barbu, Adrian and Wu, Tianfu and Wu, Ying Nian}, year={2014}, pages={3004–3030} } @inbook{li_wu_hu_pei_2013, place={Berlin Heidelberg}, series={Lecture Notes in Computer Science}, title={Coupling-and-Decoupling: A Hierarchical Model for Occlusion-Free Car Detection}, ISBN={9783642373305 9783642373312}, ISSN={0302-9743 1611-3349}, url={http://dx.doi.org/10.1007/978-3-642-37331-2_13}, DOI={10.1007/978-3-642-37331-2_13}, abstractNote={Handling occlusions in object detection is a long-standing problem. This paper addresses the problem of X-to-X-occlusion-free object detection (e.g. car-to-car occlusions in our experiment) by utilizing an intuitive coupling-and-decoupling strategy. In the “coupling” stage, we model the pair of occluding X's (e.g. car pairs) directly to account for the statistically strong co-occurrence (i.e. coupling). Then, we learn a hierarchical And-Or directed acyclic graph (AOG) model under the latent structural SVM (LSSVM) framework. The learned AOG consists of, from the top to bottom, (i) a root Or-node representing different compositions of occluding X pairs, (ii) a set of And-nodes each of which represents a specific composition of occluding X pairs, (iii) another set of And-nodes representing single X's decomposed from occluding X pairs, and (iv) a set of terminal-nodes which represent the appearance templates for the X pairs, single X's and latent parts of the single X's, respectively. The part appearance templates can also be shared among different single X's. In detection, a dynamic programming (DP) algorithm is used and as a natural consequence we decouple the two single X's from the X-to-X occluding pairs. In experiments, we test our method on roadside cars which are collected from a real traffic video surveillance environment by ourselves. We compare our model with the state-of-the-art deformable part-based model (DPM) and obtain better detection performance.}, booktitle={Computer Vision – ACCV 2012}, publisher={Springer}, author={Li, Bo and Wu, Tianfu and Hu, Wenze and Pei, Mingtao}, editor={Lee, K.M. and Matsushita, Y. and Rehg, J.M. and Hu, Z.}, year={2013}, pages={164–175}, collection={Lecture Notes in Computer Science} } @inbook{xie_pei_liu_wu_2013, place={Berlin Heidelberg}, series={Lecture Notes in Computer Science}, title={Tracking Pedestrian with Multi-component Online Deformable Part-Based Model}, ISBN={9783642374302 9783642374319}, ISSN={0302-9743 1611-3349}, url={http://dx.doi.org/10.1007/978-3-642-37431-9_51}, DOI={10.1007/978-3-642-37431-9_51}, abstractNote={In this work we present a novel online algorithm to track pedestrians by integrating both the bottom-up and the top-down models of the pedestrian. Motivated by the observation that the appearance of a pedestrian changes a lot in different perspectives or poses, the proposed bottom-up model has multiple components to represent distinct groups of the pedestrian appearances. Also, similar pedestrian appearances have several common salient local patterns and their structure is relatively stable. So, each component of the proposed bottom-up model uses an online deformable part-based model (OLDPM) containing one root and several shared parts to represent the flexible structure and salient local patterns of an appearance. We term the bottom-up model multi-component OLDPM in this paper.
We borrow an offline-trained class-specific pedestrian model [19] as the top-down model. The top-down model is used to extend the bottom-up model with a new OLDPM when a new appearance can't be covered by the bottom-up model. The multi-component OLDPM has three advantages compared with other models. First, through an incremental support vector machine (INCSVM) [2] associated with each component, the OLDPM of each component can effectively adapt to the pedestrian appearance variations of a specified perspective and pose. Second, OLDPM can efficiently generate match penalty maps of parts preserving the 2bit binary pattern (2bitBP) [10] through a robust real-time pattern matching algorithm [16], and can search over all possible configurations in an image in linear time by the distance transforms algorithm [5]. Last but not least, parts can be shared among components to reduce the computational complexity for matching. We compare our method with four cutting-edge tracking algorithms over seven visual sequences and provide quantitative and qualitative performance comparisons.}, booktitle={Computer Vision – ACCV 2012}, publisher={Springer}, author={Xie, Yi and Pei, Mingtao and Liu, Zhao and Wu, Tianfu}, editor={Lee, K.M. and Matsushita, Y. and Rehg, J.M. and Hu, Z.}, year={2013}, pages={664–676}, collection={Lecture Notes in Computer Science} } @article{wu_zhu_2010, title={A Numerical Study of the Bottom-Up and Top-Down Inference Processes in And-Or Graphs}, volume={93}, ISSN={0920-5691 1573-1405}, url={http://dx.doi.org/10.1007/s11263-010-0346-6}, DOI={10.1007/s11263-010-0346-6}, abstractNote={This paper presents a numerical study of the bottom-up and top-down inference processes in hierarchical models using the And-Or graph as an example. Three inference processes are identified for each node A in a recursively defined And-Or graph in which stochastic context sensitive image grammar is embedded: the α(A) process detects node A directly based on image features, the β(A) process computes node A by binding its child node(s) bottom-up and the γ(A) process predicts node A top-down from its parent node(s). All the three processes contribute to computing node A from images in complementary ways. The objective of our numerical study is to explore how much information each process contributes and how these processes should be integrated to improve performance. We study them in the task of object parsing using And-Or graph formulated under the Bayesian framework. Firstly, we isolate and train the α(A), β(A) and γ(A) processes separately by blocking the other two processes. Then, information contributions of each process are evaluated individually based on their discriminative power, compared with their respective human performance. Secondly, we integrate the three processes explicitly for robust inference to improve performance and propose a greedy pursuit algorithm for object parsing. In experiments, we choose two hierarchical case studies: one is junctions and rectangles in low-to-middle-level vision and the other is human faces in high-level vision.
We observe that (i) the effectiveness of the α(A), β(A) and γ(A) processes depends on the scale and occlusion conditions, (ii) the α(face) process is stronger than the α processes of facial components, while β(junctions) and β(rectangle) work much better than their α processes, and (iii) the integration of the three processes improves performance in ROC comparisons.}, number={2}, journal={International Journal of Computer Vision}, publisher={Springer Science and Business Media LLC}, author={Wu, Tianfu and Zhu, Song-Chun}, year={2010}, month={May}, pages={226–252} } @article{lin_wu_porway_xu_2009, title={A stochastic graph grammar for compositional object representation and recognition}, volume={42}, ISSN={0031-3203}, url={http://dx.doi.org/10.1016/j.patcog.2008.10.033}, DOI={10.1016/j.patcog.2008.10.033}, abstractNote={This paper illustrates a hierarchical generative model for representing and recognizing compositional object categories with large intra-category variance. In this model, objects are broken into their constituent parts and the variability of configurations and relationships between these parts are modeled by stochastic attribute graph grammars, which are embedded in an And-Or graph for each compositional object category. It combines the power of a stochastic context-free grammar (SCFG) to express the variability of part configurations, and a Markov random field (MRF) to represent the pictorial spatial relationships between these parts. As a generative model, different object instances of a category can be realized as a traversal through the And-Or graph to arrive at a valid configuration (like a valid sentence in language, by analogy). The inference/recognition procedure is intimately tied to the structure of the model and follows a probabilistic formulation consisting of bottom-up detection steps for the parts, which in turn recursively activate the grammar rules for top-down verification and searches for missing parts. We present experiments comparing our results to state-of-the-art methods and demonstrate the potential of our proposed framework on compositional objects with cluttered backgrounds using training and testing data from the public Lotus Hill and Caltech datasets.}, number={7}, journal={Pattern Recognition}, publisher={Elsevier BV}, author={Lin, Liang and Wu, Tianfu and Porway, Jake and Xu, Zijian}, year={2009}, month={Jul}, pages={1297–1307} }