Publications of Xiuyuan Cheng

%% Papers Published   
@article{fds374290,
   Author = {Repasky, M and Cheng, X and Xie, Y},
   Title = {Neural Stein Critics with Staged L2-Regularization},
   Journal = {IEEE Transactions on Information Theory},
   Volume = {69},
   Number = {11},
   Pages = {7246-7275},
   Year = {2023},
   Month = {November},
   url = {http://dx.doi.org/10.1109/TIT.2023.3299258},
   Abstract = {Learning to differentiate model distributions from observed
             data is a fundamental problem in statistics and machine
             learning, and high-dimensional data remains a challenging
             setting for such problems. Metrics that quantify the
             disparity in probability distributions, such as the Stein
             discrepancy, play an important role in high-dimensional
             statistical testing. In this paper, we investigate the role
             of L2 regularization in training a neural network Stein
             critic so as to distinguish between data sampled from an
             unknown probability distribution and a nominal model
             distribution. Making a connection to the Neural Tangent
             Kernel (NTK) theory, we develop a novel staging procedure
             for the weight of regularization over training time, which
             leverages the advantages of highly-regularized training at
              early times. Theoretically, we prove that the training
              dynamics are approximated by kernel optimization, namely
              'lazy training', when the L2 regularization weight is large,
              and that training on n samples converges at a rate of
              O(n^{-1/2}) up to a log factor. The result guarantees
              learning the optimal
             critic assuming sufficient alignment with the leading
             eigen-modes of the zero-time NTK. The benefit of the staged
             L2 regularization is demonstrated on simulated high
             dimensional data and an application to evaluating generative
             models of image data.},
   Doi = {10.1109/TIT.2023.3299258},
   Key = {fds374290}
}
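
A minimal sketch of the idea described in the abstract above, assuming a standard
Gaussian nominal model (so the score is -x in closed form); the network, optimizer,
and the two-stage schedule for the L2 weight are illustrative choices, not the
paper's exact procedure:

    # Train a neural Stein critic f to separate data from a nominal model q,
    # with an L2 penalty whose weight is large early in training ("staged").
    import torch
    import torch.nn as nn

    def score_q(x):
        # Assumed nominal model: standard Gaussian, so grad log q(x) = -x.
        return -x

    def stein_operator(f, x, sq):
        # T_q f(x) = div f(x) + f(x) . grad log q(x), evaluated per sample.
        x = x.clone().requires_grad_(True)
        fx = f(x)
        div = torch.zeros(x.shape[0])
        for i in range(x.shape[1]):
            g = torch.autograd.grad(fx[:, i].sum(), x, create_graph=True)[0]
            div = div + g[:, i]
        return div + (fx * sq(x)).sum(dim=1)

    d = 5
    f = nn.Sequential(nn.Linear(d, 64), nn.ReLU(), nn.Linear(64, d))
    opt = torch.optim.Adam(f.parameters(), lr=1e-3)
    x_data = torch.randn(2000, d) + 0.3            # samples from the unknown p

    for step in range(500):
        lam = 10.0 if step < 100 else 0.1          # illustrative staging of the L2 weight
        xb = x_data[torch.randint(0, len(x_data), (128,))]
        loss = (-stein_operator(f, xb, score_q).mean()
                + lam * (f(xb) ** 2).sum(dim=1).mean())
        opt.zero_grad()
        loss.backward()
        opt.step()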

@article{fds374292,
   Author = {Landa, B and Cheng, X},
   Title = {Robust Inference of Manifold Density and Geometry by Doubly
             Stochastic Scaling},
   Journal = {SIAM Journal on Mathematics of Data Science},
   Volume = {5},
   Number = {3},
   Pages = {589-614},
   Publisher = {Society for Industrial & Applied Mathematics
             (SIAM)},
   Year = {2023},
   Month = {September},
   url = {http://dx.doi.org/10.1137/22m1516968},
   Doi = {10.1137/22m1516968},
   Key = {fds374292}
}

@article{fds374293,
   Author = {Lee, J and Xie, Y and Cheng, X},
   Title = {Training Neural Networks for Sequential Change-Point
             Detection},
   Journal = {ICASSP, IEEE International Conference on Acoustics, Speech
             and Signal Processing - Proceedings},
   Volume = {2023-June},
   Year = {2023},
   Month = {January},
   ISBN = {9781728163277},
   url = {http://dx.doi.org/10.1109/ICASSP49357.2023.10095005},
   Abstract = {Detecting an abrupt distributional shift of a data stream,
             known as change-point detection, is a fundamental problem in
              statistics and machine learning. We introduce a novel
              approach for online change-point detection using neural
              networks. Specifically, our approach trains neural networks
              to compute the cumulative sum of a detection statistic
              sequentially, which exhibits a significant change when a
              change-point occurs. We demonstrate the superiority and
              potential of the proposed method in detecting change-points
              using both synthetic and real-world data.},
   Doi = {10.1109/ICASSP49357.2023.10095005},
   Key = {fds374293}
}
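
The sequential scheme described above can be summarized by the classical CUSUM
recursion applied to a learned per-sample statistic; the sketch below is a
simplified illustration, and the scoring function g is a hypothetical stand-in
for the trained network:

    # Online CUSUM over a learned statistic g: the running sum drifts upward
    # only after the distribution of the stream changes.
    def cusum_detect(stream, g, threshold):
        s = 0.0
        for t, x in enumerate(stream):
            s = max(0.0, s + g(x))      # CUSUM recursion
            if s > threshold:
                return t                # declare a change-point at time t
        return None                     # no change detected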

@article{fds368016,
   Author = {Cheng, X and Wu, N},
   Title = {Eigen-convergence of Gaussian kernelized graph Laplacian by
             manifold heat interpolation},
   Journal = {Applied and Computational Harmonic Analysis},
   Volume = {61},
   Pages = {132-190},
   Year = {2022},
   Month = {November},
   url = {http://dx.doi.org/10.1016/j.acha.2022.06.003},
   Abstract = {We study the spectral convergence of graph Laplacians to the
             Laplace-Beltrami operator when the kernelized graph affinity
             matrix is constructed from N random samples on a
             d-dimensional manifold in an ambient Euclidean space. By
             analyzing Dirichlet form convergence and constructing
             candidate approximate eigenfunctions via convolution with
             manifold heat kernel, we prove eigen-convergence with rates
              as N increases. The best eigenvalue convergence rate is
              N^{-1/(d/2+2)} (when the kernel bandwidth parameter
              ϵ ~ (log N / N)^{1/(d/2+2)}) and the best eigenvector 2-norm
              convergence rate is N^{-1/(d/2+3)} (when ϵ ~ (log N / N)^{1/(d/2+3)}).
              These rates hold up to a log N factor for finitely many
             low-lying eigenvalues of both un-normalized and normalized
             graph Laplacians. When data density is non-uniform, we prove
             the same rates for the density-corrected graph Laplacian,
             and we also establish new operator point-wise convergence
             rate and Dirichlet form convergence rate as intermediate
             results. Numerical results are provided to support the
             theory.},
   Doi = {10.1016/j.acha.2022.06.003},
   Key = {fds368016}
}
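
A minimal numerical sketch of the setting in the abstract: N points sampled from a
one-dimensional manifold (the unit circle), a Gaussian kernel affinity with
bandwidth following the stated rate, and the low-lying spectrum of the
un-normalized graph Laplacian (constants and the eigenvalue normalization are
illustrative):

    import numpy as np
    from scipy.spatial.distance import cdist

    N, d = 2000, 1                                # manifold dimension d = 1
    theta = 2 * np.pi * np.random.rand(N)
    X = np.c_[np.cos(theta), np.sin(theta)]       # circle embedded in R^2

    eps = (np.log(N) / N) ** (1.0 / (d / 2 + 2))  # eps ~ (log N / N)^{1/(d/2+2)}
    W = np.exp(-cdist(X, X, 'sqeuclidean') / (4 * eps))
    L_un = np.diag(W.sum(axis=1)) - W             # un-normalized graph Laplacian
    # Low-lying eigenvalues; a further scaling in N and eps is needed for these
    # to approximate Laplace-Beltrami eigenvalues.
    low_eigs = np.sort(np.linalg.eigvalsh(L_un))[:6]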

@article{fds367221,
   Author = {Tan, Y and Zhang, Y and Cheng, X and Zhou, X-H},
   Title = {Statistical inference using GLEaM model with spatial
             heterogeneity and correlation between regions.},
   Journal = {Scientific reports},
   Volume = {12},
   Number = {1},
   Pages = {16630},
   Year = {2022},
   Month = {October},
   url = {http://dx.doi.org/10.1038/s41598-022-18775-8},
   Abstract = {A better understanding of various patterns in the
             coronavirus disease 2019 (COVID-19) spread in different
             parts of the world is crucial to its prevention and control.
             Motivated by the previously developed Global Epidemic and
             Mobility (GLEaM) model, this paper proposes a new stochastic
             dynamic model to depict the evolution of COVID-19. The model
             allows spatial and temporal heterogeneity of transmission
             parameters and involves transportation between regions.
             Based on the proposed model, this paper also designs a
             two-step procedure for parameter inference, which utilizes
             the correlation between regions through a prior distribution
             that imposes graph Laplacian regularization on transmission
             parameters. Experiments on simulated data and real-world
             data in China and Europe indicate that the proposed model
             achieves higher accuracy in predicting the newly confirmed
             cases than baseline models.},
   Doi = {10.1038/s41598-022-18775-8},
   Key = {fds367221}
}

@article{fds368019,
   Author = {Cheng, X and Cloninger, A},
   Title = {Classification logit two-sample testing by neural networks
             for differentiating near manifold densities.},
   Journal = {IEEE transactions on information theory},
   Volume = {68},
   Number = {10},
   Pages = {6631-6662},
   Year = {2022},
   Month = {October},
   url = {http://dx.doi.org/10.1109/tit.2022.3175691},
   Abstract = {The recent success of generative adversarial networks and
             variational learning suggests that training a classification
             network may work well in addressing the classical two-sample
             problem, which asks to differentiate two densities given
             finite samples from each one. Network-based methods have the
             computational advantage that the algorithm scales to large
             datasets. This paper considers using the classification
             logit function, which is provided by a trained
             classification neural network and evaluated on the testing
             set split of the two datasets, to compute a two-sample
             statistic. To analyze the approximation and estimation error
             of the logit function to differentiate near-manifold
             densities, we introduce a new result of near-manifold
             integral approximation by neural networks. We then show that
             the logit function provably differentiates two
             sub-exponential densities given that the network is
             sufficiently parametrized, and for on or near manifold
             densities, the needed network complexity is reduced to only
             scale with the intrinsic dimensionality. In experiments, the
             network logit test demonstrates better performance than
             previous network-based tests using classification accuracy,
             and also compares favorably to certain kernel maximum mean
             discrepancy tests on synthetic datasets and hand-written
             digit datasets.},
   Doi = {10.1109/tit.2022.3175691},
   Key = {fds368019}
}
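
A minimal sketch of the test statistic described above: train a classifier on one
split of the pooled samples and average its logit on the held-out split (the
classifier choice and the simple mean-difference statistic are illustrative
simplifications, not the paper's exact construction):

    import numpy as np
    from sklearn.neural_network import MLPClassifier
    from sklearn.model_selection import train_test_split

    rng = np.random.default_rng(0)
    X = rng.normal(0.0, 1.0, (1000, 10))          # sample from p
    Y = rng.normal(0.2, 1.0, (1000, 10))          # sample from q
    data = np.vstack([X, Y])
    labels = np.r_[np.zeros(len(X)), np.ones(len(Y))]

    Xtr, Xte, ytr, yte = train_test_split(data, labels, test_size=0.5, random_state=0)
    clf = MLPClassifier(hidden_layer_sizes=(64,), max_iter=500).fit(Xtr, ytr)

    p1 = clf.predict_proba(Xte)[:, 1].clip(1e-6, 1 - 1e-6)
    logit = np.log(p1 / (1 - p1))                 # classification logit on the test split
    T = logit[yte == 1].mean() - logit[yte == 0].mean()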

@article{fds368017,
   Author = {Cheng, X and Wu, H-T},
   Title = {Convergence of graph Laplacian with kNN self-tuned
             kernels},
   Journal = {Information and Inference: A Journal of the
             IMA},
   Volume = {11},
   Number = {3},
   Pages = {889-957},
   Publisher = {Oxford University Press (OUP)},
   Year = {2022},
   Month = {September},
   url = {http://dx.doi.org/10.1093/imaiai/iaab019},
   Abstract = {Kernelized Gram matrix $W$ constructed from data points
              $\{x_i\}_{i=1}^N$ as
             $W_{ij}= k_0( \frac{ \| x_i - x_j \|^2} {\sigma ^2} ) $ is
             widely used in graph-based geometric data analysis and
             unsupervised learning. An important question is how to
             choose the kernel bandwidth $\sigma $, and a common practice
             called self-tuned kernel adaptively sets a $\sigma _i$ at
             each point $x_i$ by the $k$-nearest neighbor (kNN) distance.
             When $x_i$s are sampled from a $d$-dimensional manifold
             embedded in a possibly high-dimensional space, unlike with
             fixed-bandwidth kernels, theoretical results of graph
             Laplacian convergence with self-tuned kernels have been
             incomplete. This paper proves the convergence of graph
             Laplacian operator $L_N$ to manifold (weighted-)Laplacian
             for a new family of kNN self-tuned kernels $W^{(\alpha
             )}_{ij} = k_0( \frac{ \| x_i - x_j \|^2}{ \epsilon \hat{\rho
             }(x_i) \hat{\rho }(x_j)})/\hat{\rho }(x_i)^\alpha \hat{\rho
             }(x_j)^\alpha $, where $\hat{\rho }$ is the estimated
             bandwidth function by kNN and the limiting operator is also
             parametrized by $\alpha $. When $\alpha = 1$, the limiting
             operator is the weighted manifold Laplacian $\varDelta _p$.
             Specifically, we prove the point-wise convergence of $L_N f
             $ and convergence of the graph Dirichlet form with rates.
             Our analysis is based on first establishing a $C^0$
             consistency for $\hat{\rho }$ which bounds the relative
             estimation error $|\hat{\rho } - \bar{\rho }|/\bar{\rho }$
             uniformly with high probability, where $\bar{\rho } =
             p^{-1/d}$ and $p$ is the data density function. Our
             theoretical results reveal the advantage of the self-tuned
             kernel over the fixed-bandwidth kernel via smaller variance
             error in low-density regions. In the algorithm, no prior
             knowledge of $d$ or data density is needed. The theoretical
             results are supported by numerical experiments on simulated
              data and hand-written digit image data.},
   Doi = {10.1093/imaiai/iaab019},
   Key = {fds368017}
}
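
A minimal sketch of the self-tuned kernel family defined in the abstract, with k_0
taken to be the Gaussian kernel and the bandwidth function rho estimated by the
k-nearest-neighbor distance (parameter values are illustrative):

    import numpy as np
    from scipy.spatial.distance import cdist

    def self_tuned_graph_laplacian(X, k=16, eps=0.5, alpha=1.0):
        D2 = cdist(X, X, 'sqeuclidean')
        rho = np.sqrt(np.sort(D2, axis=1)[:, k])         # kNN distance as rho_hat(x_i)
        # W^(alpha)_ij = k0(||x_i - x_j||^2 / (eps rho_i rho_j)) / (rho_i rho_j)^alpha
        K = np.exp(-D2 / (eps * np.outer(rho, rho)))
        W = K / np.outer(rho ** alpha, rho ** alpha)
        L = np.diag(W.sum(axis=1)) - W                   # un-normalized graph Laplacian
        return W, L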

@article{fds374296,
   Author = {Xu, C and Cheng, X and Xie, Y},
   Title = {Invertible Neural Networks for Graph Prediction},
   Journal = {IEEE Journal on Selected Areas in Information
             Theory},
   Volume = {3},
   Number = {3},
   Pages = {454-467},
   Publisher = {Institute of Electrical and Electronics Engineers
             (IEEE)},
   Year = {2022},
   Month = {September},
   url = {http://dx.doi.org/10.1109/JSAIT.2022.3221864},
   Abstract = {Graph prediction problems prevail in data analysis and
             machine learning. The inverse prediction problem, namely to
             infer input data from given output labels, is of emerging
             interest in various applications. In this work, we develop
             invertible graph neural network (iGNN), a deep generative
             model to tackle the inverse prediction problem on graphs by
             casting it as a conditional generative task. The proposed
             model consists of an invertible sub-network that maps
             one-to-one from data to an intermediate encoded feature,
             which allows forward prediction by a linear classification
             sub-network as well as efficient generation from output
             labels via a parametric mixture model. The invertibility of
             the encoding sub-network is ensured by a Wasserstein-2
             regularization which allows free-form layers in the residual
             blocks. The model is scalable to large graphs by a
             factorized parametric mixture model of the encoded feature
             and is computationally scalable by using GNN layers. The
              existence of an invertible flow mapping is backed by
              theories of optimal transport and diffusion processes, and
              we prove the expressiveness of graph convolution layers in
              approximating the theoretical flows of graph data. The
              proposed iGNN model is experimentally examined on synthetic
              data, including an example on large graphs, and the
              empirical advantage is also
             demonstrated on real-application datasets of solar ramping
             event data and traffic flow anomaly detection.},
   Doi = {10.1109/JSAIT.2022.3221864},
   Key = {fds374296}
}

@article{fds368018,
   Author = {Zhu, W and Qiu, Q and Calderbank, R and Sapiro, G and Cheng,
             X},
   Title = {Scaling-Translation-Equivariant Networks with Decomposed
             Convolutional Filters},
   Journal = {Journal of Machine Learning Research},
   Volume = {23},
   Year = {2022},
   Month = {January},
   Abstract = {Encoding the scale information explicitly into the
             representation learned by a convolutional neural network
             (CNN) is beneficial for many computer vision tasks
              especially when dealing with multiscale inputs. We study, in
              this paper, a scaling-translation-equivariant
              (ST-equivariant) CNN with joint convolutions across the space
              and the scaling group, which is shown to be both sufficient
              and necessary to achieve equivariance for the regular
              representation of the scaling-translation group ST. To
             reduce the model complexity and computational burden, we
             decompose the convolutional filters under two pre-fixed
             separable bases and truncate the expansion to low-frequency
             components. A further benefit of the truncated filter
             expansion is the improved deformation robustness of the
             equivariant representation, a property which is
             theoretically analyzed and empirically verified. Numerical
             experiments demonstrate that the proposed
             scaling-translation-equivariant network with decomposed
             convolutional filters (ScDCFNet) achieves significantly
             improved performance in multiscale image classification and
             better interpretability than regular CNNs at a reduced model
             size.},
   Key = {fds368018}
}

@article{fds370994,
   Author = {Zhu, S and Wang, H and Dong, Z and Cheng, X and Xie,
             Y},
   Title = {Neural Spectral Marked Point Processes},
   Journal = {ICLR 2022 - 10th International Conference on Learning
             Representations},
   Year = {2022},
   Month = {January},
   Abstract = {Self- and mutually-exciting point processes are popular
             models in machine learning and statistics for dependent
             discrete event data. To date, most existing models assume
             stationary kernels (including the classical Hawkes
             processes) and simple parametric models. Modern applications
             with complex event data require more general point process
             models that can incorporate contextual information of the
             events, called marks, besides the temporal and location
             information. Moreover, such applications often require
             non-stationary models to capture more complex
             spatio-temporal dependence. To tackle these challenges, a
             key question is to devise a versatile influence kernel in
             the point process model. In this paper, we introduce a novel
             and general neural network-based non-stationary influence
             kernel with high expressiveness for handling complex
             discrete events data while providing theoretical performance
             guarantees. We demonstrate the superior performance of our
             proposed method compared with the state-of-the-art on
             synthetic and real data.},
   Key = {fds370994}
}

@article{fds374297,
   Author = {Chen, Z and Li, Y and Cheng, X},
   Title = {SpecNet2: Orthogonalization-free Spectral Embedding by
             Neural Networks},
   Journal = {Proceedings of Machine Learning Research},
   Volume = {190},
   Pages = {287-302},
   Year = {2022},
   Month = {January},
   Abstract = {Spectral methods which represent data points by eigenvectors
             of kernel matrices or graph Laplacian matrices have been a
             primary tool in unsupervised data analysis. In many
             application scenarios, parametrizing the spectral embedding
             by a neural network that can be trained over batches of data
             samples gives a promising way to achieve automatic
             out-of-sample extension as well as computational
             scalability. Such an approach was taken in the original
             paper of SpectralNet (Shaham et al. 2018), which we call
             SpecNet1. The current paper introduces a new neural network
             approach, named SpecNet2, to compute spectral embedding
             which optimizes an equivalent objective of the eigen-problem
             and removes the orthogonalization layer in SpecNet1.
             SpecNet2 also allows separating the sampling of rows and
             columns of the graph affinity matrix by tracking the
             neighbors of each data point through the gradient formula.
             Theoretically, we show that any local minimizer of the new
             orthogonalization-free objective reveals the leading
             eigenvectors. Furthermore, global convergence for this new
             orthogonalization-free objective using a batch-based
             gradient descent method is proved. Numerical experiments
             demonstrate the improved performance and computational
             efficiency of SpecNet2 on simulated data and image
             datasets.},
   Key = {fds374297}
}

@article{fds360266,
   Author = {Zhao, J and Jaffe, A and Li, H and Lindenbaum, O and Sefik, E and Jackson,
             R and Cheng, X and Flavell, RA and Kluger, Y},
   Title = {Detection of differentially abundant cell subpopulations in
             scRNA-seq data.},
   Journal = {Proceedings of the National Academy of Sciences of the
             United States of America},
   Volume = {118},
   Number = {22},
   Pages = {e2100293118},
   Year = {2021},
   Month = {June},
   url = {http://dx.doi.org/10.1073/pnas.2100293118},
   Abstract = {Comprehensive and accurate comparisons of transcriptomic
             distributions of cells from samples taken from two different
             biological states, such as healthy versus diseased
             individuals, are an emerging challenge in single-cell RNA
             sequencing (scRNA-seq) analysis. Current methods for
             detecting differentially abundant (DA) subpopulations
             between samples rely heavily on initial clustering of all
             cells in both samples. Often, this clustering step is
             inadequate since the DA subpopulations may not align with a
             clear cluster structure, and important differences between
             the two biological states can be missed. Here, we introduce
             DA-seq, a targeted approach for identifying DA
             subpopulations not restricted to clusters. DA-seq is a
             multiscale method that quantifies a local DA measure for
              each cell, which is computed from its k nearest
              neighboring cells across a range of k values. Based
             on this measure, DA-seq delineates contiguous significant DA
             subpopulations in the transcriptomic space. We apply DA-seq
             to several scRNA-seq datasets and highlight its improved
             ability to detect differences between distinct phenotypes in
             severe versus mildly ill COVID-19 patients, melanomas
             subjected to immune checkpoint therapy comparing responders
             to nonresponders, embryonic development at two time points,
             and young versus aging brain tissue. DA-seq enabled us to
             detect differences between these phenotypes. Importantly, we
             find that DA-seq not only recovers the DA cell types as
             discovered in the original studies but also reveals
             additional DA subpopulations that were not described before.
             Analysis of these subpopulations yields biological insights
             that would otherwise be undetected using conventional
             computational approaches.},
   Doi = {10.1073/pnas.2100293118},
   Key = {fds360266}
}
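
A minimal sketch of a multiscale local differential-abundance score in the spirit
of the description above: for each cell, the fraction of its k nearest neighbors
coming from one condition, over a range of k (the exact score and the downstream
delineation step in DA-seq are more involved):

    import numpy as np
    from sklearn.neighbors import NearestNeighbors

    def local_da_score(X, cond, k_values=(50, 100, 200, 400)):
        # X: (n_cells, n_dims) embedding; cond: 0/1 array, condition of each cell.
        cond = np.asarray(cond, dtype=float)
        nn = NearestNeighbors(n_neighbors=max(k_values) + 1).fit(X)
        _, idx = nn.kneighbors(X)                        # idx[:, 0] is the cell itself
        scores = []
        for k in k_values:
            frac = cond[idx[:, 1:k + 1]].mean(axis=1)    # fraction of condition-1 neighbors
            scores.append(frac - cond.mean())            # centered by the global fraction
        return np.stack(scores, axis=1)                  # (n_cells, len(k_values))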

@article{fds360267,
   Author = {Zhang, Y and Cheng, X and Reeves, G},
   Title = {Convergence of Gaussian-smoothed optimal transport distance
             with sub-gamma distributions and dependent
             samples},
   Journal = {Proceedings of Machine Learning Research},
   Volume = {130},
   Pages = {2422-2430},
   Year = {2021},
   Month = {January},
   Abstract = {The Gaussian-smoothed optimal transport (GOT) framework,
             recently proposed by Goldfeld et al., scales to high
             dimensions in estimation and provides an alternative to
             entropy regularization. This paper provides convergence
             guarantees for estimating the GOT distance under more
             general settings. For the Gaussian-smoothed p-Wasserstein
             distance in d dimensions, our results require only the
             existence of a moment greater than d + 2p. For the special
             case of sub-gamma distributions, we quantify the dependence
             on the dimension d and establish a phase transition with
             respect to the scale parameter. We also prove convergence
             for dependent samples, only requiring a condition on the
             pairwise dependence of the samples measured by the
             covariance of the feature map of a kernel space. A key step
             in our analysis is to show that the GOT distance is
             dominated by a family of kernel maximum mean discrepancy
             (MMD) distances with a kernel that depends on the cost
             function as well as the amount of Gaussian smoothing. This
             insight provides further interpretability for the GOT
             framework and also introduces a class of kernel MMD
             distances with desirable properties. The theoretical results
             are supported by numerical experiments.},
   Key = {fds360267}
}
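
In one dimension the Gaussian-smoothed distance can be illustrated very simply:
add independent Gaussian noise to both samples and compute the empirical
Wasserstein distance between the smoothed samples (a plug-in illustration of the
framework, not the estimator analyzed in the paper):

    import numpy as np
    from scipy.stats import wasserstein_distance

    rng = np.random.default_rng(0)
    x = rng.standard_gamma(2.0, size=5000)        # sample from P
    y = rng.standard_gamma(2.5, size=5000)        # sample from Q
    sigma = 1.0                                   # Gaussian smoothing level
    got_w1 = wasserstein_distance(x + sigma * rng.standard_normal(x.size),
                                  y + sigma * rng.standard_normal(y.size))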

@article{fds368020,
   Author = {Miao, Z and Wang, Z and Cheng, X and Qiu, Q},
   Title = {Spatiotemporal Joint Filter Decomposition in 3D
             Convolutional Neural Networks},
   Journal = {Advances in Neural Information Processing
             Systems},
   Volume = {5},
   Pages = {3376-3388},
   Year = {2021},
   Month = {January},
   ISBN = {9781713845393},
   Abstract = {In this paper, we introduce spatiotemporal joint filter
             decomposition to decouple spatial and temporal learning,
             while preserving spatiotemporal dependency in a video. A 3D
             convolutional filter is now jointly decomposed over a set of
             spatial and temporal filter atoms respectively. In this way,
             a 3D convolutional layer becomes three: a temporal atom
             layer, a spatial atom layer, and a joint coefficient layer,
             all three remaining convolutional. One obvious arithmetic
             manipulation allowed in our joint decomposition is to swap
             spatial or temporal atoms with a set of atoms that have the
             same number but different sizes, while keeping the remaining
             unchanged. For example, as shown later, we can now achieve
             tempo-invariance by simply dilating temporal atoms only. To
             illustrate this useful atom-swapping property, we further
             demonstrate how such a decomposition permits the direct
             learning of 3D CNNs with full-size videos through iterations
             of two consecutive sub-stages of learning: In the temporal
             stage, full-temporal downsampled-spatial data are used to
             learn temporal atoms and joint coefficients while fixing
             spatial atoms. In the spatial stage, full-spatial
             downsampled-temporal data are used for spatial atoms and
             joint coefficients while fixing temporal atoms. We show
             empirically on multiple action recognition datasets that,
             the decoupled spatiotemporal learning significantly reduces
             the model memory footprints, and allows deep 3D CNNs to
             model high-spatial long-temporal dependency with limited
             computational resources while delivering comparable
             performance.},
   Key = {fds368020}
}

@article{fds368021,
   Author = {Cheng, X and Xie, Y},
   Title = {Neural Tangent Kernel Maximum Mean Discrepancy},
   Journal = {Advances in Neural Information Processing
             Systems},
   Volume = {9},
   Pages = {6658-6670},
   Year = {2021},
   Month = {January},
   ISBN = {9781713845393},
   Abstract = {We present a novel neural network Maximum Mean Discrepancy
             (MMD) statistic by identifying a new connection between
             neural tangent kernel (NTK) and MMD. This connection enables
             us to develop a computationally efficient and
             memory-efficient approach to compute the MMD statistic and
             perform NTK based two-sample tests towards addressing the
             long-standing challenge of memory and computational
              complexity of the MMD statistic, which is essential for
              online implementations that assimilate new samples.
             Theoretically, such a connection allows us to understand the
             NTK test statistic properties, such as the Type-I error and
             testing power for performing the two-sample test, by
             adapting existing theories for kernel MMD. Numerical
             experiments on synthetic and real-world datasets validate
             the theory and demonstrate the effectiveness of the proposed
             NTK-MMD statistic.},
   Key = {fds368021}
}
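
A minimal sketch of the connection exploited above: use the empirical
(finite-width) neural tangent kernel of an untrained network,
k(x, y) = <grad_theta f(x), grad_theta f(y)>, inside an unbiased kernel MMD
estimator (the network and sample sizes are illustrative; the paper's statistic
and its efficient computation differ in detail):

    import torch
    import torch.nn as nn

    net = nn.Sequential(nn.Linear(5, 64), nn.ReLU(), nn.Linear(64, 1))

    def ntk_features(X):
        # Row i holds the parameter gradient of the network output at X[i];
        # the NTK Gram matrix is the matrix of row inner products.
        rows = []
        for x in X:
            net.zero_grad()
            net(x.unsqueeze(0)).squeeze().backward()
            rows.append(torch.cat([p.grad.flatten() for p in net.parameters()]).clone())
        return torch.stack(rows)

    def ntk_mmd2(X, Y):
        JX, JY = ntk_features(X), ntk_features(Y)
        Kxx, Kyy, Kxy = JX @ JX.T, JY @ JY.T, JX @ JY.T
        n, m = len(X), len(Y)
        return ((Kxx.sum() - Kxx.trace()) / (n * (n - 1))
                + (Kyy.sum() - Kyy.trace()) / (m * (m - 1))
                - 2 * Kxy.mean())

    X, Y = torch.randn(100, 5), torch.randn(100, 5) + 0.3
    print(ntk_mmd2(X, Y).item())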

@article{fds370995,
   Author = {Cheng, X and Miao, Z and Qiu, Q},
   Title = {Graph Convolution with Low-rank Learn-able Local
             Filters},
   Journal = {ICLR 2021 - 9th International Conference on Learning
             Representations},
   Year = {2021},
   Month = {January},
   Abstract = {Geometric variations like rotation, scaling, and viewpoint
             changes pose a significant challenge to visual
             understanding. One common solution is to directly model
             certain intrinsic structures, e.g., using landmarks.
             However, it then becomes non-trivial to build effective deep
             models, especially when the underlying non-Euclidean grid is
             irregular and coarse. Recent deep models using graph
             convolutions provide an appropriate framework to handle such
             non-Euclidean data, but many of them, particularly those
             based on global graph Laplacians, lack expressiveness to
             capture local features required for representation of
             signals lying on the non-Euclidean grid. The current paper
             introduces a new type of graph convolution with learnable
             low-rank local filters, which is provably more expressive
             than previous spectral graph convolution methods. The model
             also provides a unified framework for both spectral and
             spatial graph convolutions. To improve model robustness,
             regularization by local graph Laplacians is introduced. The
             representation stability against input graph data
             perturbation is theoretically proved, making use of the
             graph filter locality and the local graph regularization.
             Experiments on spherical mesh data, real-world facial
             expression recognition/skeleton-based action recognition
             data, and data with simulated graph noise show the empirical
             advantage of the proposed model.},
   Key = {fds370995}
}

@article{fds360268,
   Author = {Li, Y and Cheng, X and Lu, J},
   Title = {Butterfly-net: Optimal function representation based on
             convolutional neural networks},
   Journal = {Communications in Computational Physics},
   Volume = {28},
   Number = {5},
   Pages = {1838-1885},
   Year = {2020},
   Month = {November},
   url = {http://dx.doi.org/10.4208/CICP.OA-2020-0214},
   Abstract = {Deep networks, especially convolutional neural networks
             (CNNs), have been successfully applied in various areas of
             machine learning as well as to challenging problems in other
             scientific and engineering fields. This paper introduces
             Butterfly-net, a low-complexity CNN with structured and
             sparse cross-channel connections, together with a Butterfly
             initialization strategy for a family of networks.
             Theoretical analysis of the approximation power of
             Butterfly-net to the Fourier representation of input data
             shows that the error decays exponentially as the depth
             increases. Combining Butterfly-net with a fully connected
             neural network, a large class of problems are proved to be
             well approximated with network complexity depending on the
             effective frequency bandwidth instead of the input
             dimension. Regular CNN is covered as a special case in our
             analysis. Numerical experiments validate the analytical
             results on the approximation of Fourier kernels and energy
             functionals of Poisson's equations. Moreover, all
             experiments support that training from Butterfly
             initialization outperforms training from random
              initialization. Also, adding the remaining cross-channel
              connections, although it significantly increases the number
              of parameters, does not much improve the post-training
              accuracy and makes the model more sensitive to the data
              distribution.},
   Doi = {10.4208/CICP.OA-2020-0214},
   Key = {fds360268}
}

@article{fds360269,
   Author = {Mhaskar, HN and Cheng, X and Cloninger, A},
   Title = {A Witness Function Based Construction of Discriminative
             Models Using Hermite Polynomials},
   Journal = {Frontiers in Applied Mathematics and Statistics},
   Volume = {6},
   Year = {2020},
   Month = {August},
   url = {http://dx.doi.org/10.3389/fams.2020.00031},
   Abstract = {In machine learning, we are given a dataset of the form
             (Formula presented.), drawn as i.i.d. samples from an
             unknown probability distribution μ; the marginal
              distribution for the x_j's being μ*, and the marginals of
             the kth class (Formula presented.) possibly overlapping. We
             address the problem of detecting, with a high degree of
             certainty, for which x we have (Formula presented.) for all
             i ≠ k. We propose that rather than using a positive kernel
             such as the Gaussian for estimation of these measures, using
             a non-positive kernel that preserves a large number of
             moments of these measures yields an optimal approximation.
             We use multi-variate Hermite polynomials for this purpose,
             and prove optimal and local approximation results in a
             supremum norm in a probabilistic sense. Together with a
             permutation test developed with the same kernel, we prove
             that the kernel estimator serves as a “witness function”
             in classification problems. Thus, if the value of this
             estimator at a point x exceeds a certain threshold, then the
             point is reliably in a certain class. This approach can be
             used to modify pretrained algorithms, such as neural
             networks or nonlinear dimension reduction techniques, to
             identify in-class vs out-of-class regions for the purposes
             of generative models, classification uncertainty, or finding
             robust centroids. This fact is demonstrated in a number of
             real world data sets including MNIST, CIFAR10, Science News
             documents, and LaLonde data sets.},
   Doi = {10.3389/fams.2020.00031},
   Key = {fds360269}
}

@article{fds360273,
   Author = {Wang, Z and Cheng, X and Sapiro, G and Qiu, Q},
   Title = {Stochastic Conditional Generative Networks with Basis
              Decomposition},
   Journal = {8th International Conference on Learning Representations,
             ICLR 2020},
   Publisher = {OpenReview.net},
   Year = {2020},
   Month = {January},
   Abstract = {While generative adversarial networks (GANs) have
             revolutionized machine learning, a number of open questions
             remain to fully understand them and exploit their power. One
             of these questions is how to efficiently achieve proper
             diversity and sampling of the multi-mode data space. To
             address this, we introduce BasisGAN, a stochastic
             conditional multi-mode image generator. By exploiting the
             observation that a convolutional filter can be well
             approximated as a linear combination of a small set of basis
              elements, we learn a plug-and-play basis generator to
              stochastically generate basis elements, with just a few
              hundred parameters, to fully embed stochasticity into
             convolutional filters. By sampling basis elements instead of
             filters, we dramatically reduce the cost of modeling the
             parameter space with no sacrifice on either image diversity
             or fidelity. To illustrate this proposed plug-and-play
             framework, we construct variants of BasisGAN based on
             state-of-the-art conditional image generation networks, and
             train the networks by simply plugging in a basis generator,
             without additional auxiliary components, hyperparameters, or
             training objectives. The experimental success is
             complemented with theoretical results indicating how the
             perturbations introduced by the proposed sampling of basis
             elements can propagate to the appearance of generated
             images.},
   Key = {fds360273}
}

@article{fds360271,
   Author = {Li, H and Lindenbaum, O and Cheng, X and Cloninger,
             A},
   Title = {Variational Diffusion Autoencoders with Random Walk
             Sampling},
   Journal = {Lecture Notes in Computer Science (including subseries
             Lecture Notes in Artificial Intelligence and Lecture Notes
             in Bioinformatics)},
   Volume = {12368 LNCS},
   Pages = {362-378},
   Year = {2020},
   Month = {January},
   ISBN = {9783030585914},
   url = {http://dx.doi.org/10.1007/978-3-030-58592-1_22},
   Abstract = {Variational autoencoders (VAEs) and generative adversarial
             networks (GANs) enjoy an intuitive connection to manifold
             learning: in training the decoder/generator is optimized to
             approximate a homeomorphism between the data distribution
             and the sampling space. This is a construction that strives
             to define the data manifold. A major obstacle to VAEs and
             GANs, however, is choosing a suitable prior that matches the
             data topology. Well-known consequences of poorly picked
             priors are posterior and mode collapse. To our knowledge, no
             existing method sidesteps this user choice. Conversely,
             diffusion maps automatically infer the data topology and
             enjoy a rigorous connection to manifold learning, but do not
             scale easily or provide the inverse homeomorphism (i.e.
             decoder/generator). We propose a method (https://github.com/lihenryhfl/vdae)
             that combines these approaches into a generative model that
             inherits the asymptotic guarantees of diffusion maps while
             preserving the scalability of deep models. We prove
             approximation theoretic results for the dimension dependence
             of our proposed method. Finally, we demonstrate the
             effectiveness of our method with various real and synthetic
             datasets.},
   Doi = {10.1007/978-3-030-58592-1_22},
   Key = {fds360271}
}

@article{fds360272,
   Author = {Cheng, X and Mishne, G},
   Title = {Spectral Embedding Norm: Looking Deep into the Spectrum of
             the Graph Laplacian.},
   Journal = {SIAM journal on imaging sciences},
   Volume = {13},
   Number = {2},
   Pages = {1015-1048},
   Year = {2020},
   Month = {January},
   url = {http://dx.doi.org/10.1137/18m1283160},
   Abstract = {The extraction of clusters from a dataset which includes
             multiple clusters and a significant background component is
             a non-trivial task of practical importance. In image
             analysis this manifests for example in anomaly detection and
             target detection. The traditional spectral clustering
              algorithm, which relies on the leading K eigenvectors
              to detect K clusters, fails in such cases. In this
              paper we propose the spectral embedding norm which
              sums the squared values of the first I normalized
              eigenvectors, where I can be significantly larger
              than K. We prove that this quantity can be used to
              separate clusters from the background in unbalanced
              settings, including extreme cases such as outlier detection.
              The performance of the algorithm is not sensitive to the
              choice of I, and we demonstrate its application on
             synthetic and real-world remote sensing and neuroimaging
             datasets.},
   Doi = {10.1137/18m1283160},
   Key = {fds360272}
}
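
A minimal sketch of the quantity defined above: the per-node sum of squared
entries of the first I eigenvectors of a symmetrically normalized affinity matrix
(the paper's precise normalization and theory are more general):

    import numpy as np

    def spectral_embedding_norm(W, I=20):
        # W: symmetric nonnegative affinity matrix.
        deg = W.sum(axis=1)
        A = W / np.sqrt(np.outer(deg, deg))        # normalized affinity
        evals, evecs = np.linalg.eigh(A)
        V = evecs[:, np.argsort(-evals)[:I]]       # leading I eigenvectors
        return (V ** 2).sum(axis=1)                # large on clusters, small on background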

@article{fds374298,
   Author = {Alaifari, R and Cheng, X and Pierce, LB and Steinerberger,
             S},
   Title = {On matrix rearrangement inequalities},
   Journal = {Proceedings of the American Mathematical
             Society},
   Volume = {148},
   Number = {5},
   Pages = {1835-1848},
   Year = {2020},
   Month = {January},
   url = {http://dx.doi.org/10.1090/proc/14831},
   Abstract = {Given two symmetric and positive semidefinite square
             matrices A,B, is it true that any matrix given as the
             product of m copies of A and n copies of B in a particular
             sequence must be dominated in the spectral norm by the
             ordered matrix product AmBn? For example, is ∥ AABAABABB
             ∥ ≤ ∥AAAAABBBB∥? Drury [Electron J. Linear Algebra
             18 (2009), pp. 13 20] has characterized precisely which
             disordered words have the property that an inequality of
             this type holds for all matrices A,B. However, the
             1-parameter family of counterexamples Drury constructs for
             these characterizations is comprised of 3×3 matrices, and
             thus as stated the characterization applies only for N × N
             matrices with N ≤ 3. In contrast, we prove that for 2 × 2
             matrices, the general rearrangement inequality holds for all
             disordered words. We also show that for larger N ×N
             matrices, the general rearrangement inequality holds for all
             disordered words for most A,B (in a sense of full measure)
             that are sufficiently small perturbations of the
             identity.},
   Doi = {10.1090/proc/14831},
   Key = {fds374298}
}
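
The 2×2 statement in the abstract is easy to check numerically; the snippet below
compares the example word against its ordered counterpart for random symmetric
positive semidefinite matrices:

    import numpy as np

    rng = np.random.default_rng(0)

    def rand_psd():
        M = rng.normal(size=(2, 2))
        return M @ M.T                             # symmetric positive semidefinite

    def word_norm(word, A, B):
        P = np.eye(2)
        for c in word:
            P = P @ (A if c == 'A' else B)
        return np.linalg.norm(P, 2)                # spectral norm of the word

    for _ in range(1000):
        A, B = rand_psd(), rand_psd()
        # Disordered word with 5 A's and 4 B's vs. the ordered word A^5 B^4.
        assert word_norm('AABAABABB', A, B) <= word_norm('AAAAABBBB', A, B) + 1e-9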

@article{fds374299,
   Author = {Xu, Z and Li, Y and Cheng, X},
   Title = {Butterfly-Net2: Simplified Butterfly-Net and Fourier
             Transform Initialization},
   Journal = {Proceedings of Machine Learning Research},
   Volume = {107},
   Pages = {431-450},
   Year = {2020},
   Month = {January},
   Abstract = {Structured CNNs designed using prior information about the
              problem potentially improve efficiency over conventional
              CNNs in various tasks, such as solving PDEs and inverse
              problems in signal processing. This paper introduces BNet2,
              a simplified Butterfly-Net in line with the conventional
              CNN. Moreover, a Fourier transform initialization is
             proposed for both BNet2 and CNN with guaranteed
             approximation power to represent the Fourier transform
             operator. Experimentally, BNet2 and the Fourier transform
             initialization strategy are tested on various tasks,
             including approximating Fourier transform operator,
             end-to-end solvers of linear and nonlinear PDEs, and
              denoising and deblurring of 1D signals. On all tasks, under
              the same initialization, BNet2 achieves accuracy similar to
              CNN with fewer parameters, and Fourier-transform-initialized
              BNet2 and CNN consistently improve the training and testing
              accuracy over the randomly initialized CNN.},
   Key = {fds374299}
}

@article{fds347406,
   Author = {Cheng, X and Cloninger, A and Coifman, RR},
   Title = {Two-sample statistics based on anisotropic
             kernels},
   Journal = {Information and Inference: A Journal of the
             IMA},
   Publisher = {Oxford University Press (OUP)},
   Year = {2019},
   Month = {December},
   url = {http://dx.doi.org/10.1093/imaiai/iaz018},
   Abstract = {The paper introduces a new kernel-based Maximum Mean
              Discrepancy (MMD)
             statistic for measuring the distance between two
             distributions given finitely many multivariate samples. When
             the distributions are locally low-dimensional, the proposed
             test can be made more powerful to distinguish certain
             alternatives by incorporating local covariance matrices and
             constructing an anisotropic kernel. The kernel matrix is
             asymmetric; it computes the affinity between $n$ data points
             and a set of $n_R$ reference points, where $n_R$ can be
             drastically smaller than $n$. While the proposed statistic
             can be viewed as a special class of Reproducing Kernel
              Hilbert Space MMD, the consistency of the test is proved,
              under mild assumptions on the kernel, as long as $\|p-q\|
             \sqrt{n} \to \infty $, and a finite-sample lower bound of
             the testing power is obtained. Applications to flow
             cytometry and diffusion MRI datasets are demonstrated, which
             motivate the proposed approach to compare
              distributions.},
   Doi = {10.1093/imaiai/iaz018},
   Key = {fds347406}
}
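
A minimal sketch in the spirit of the construction above: affinities are computed
between all pooled samples and a small set of n_R reference points, with a local
covariance estimated at each reference point (the reference selection, kernel
normalization, and test calibration in the paper are more careful than this
illustration):

    import numpy as np

    def anisotropic_stat(X, Y, n_ref=50, n_local=30, seed=0):
        rng = np.random.default_rng(seed)
        Z = np.vstack([X, Y])
        refs = Z[rng.choice(len(Z), n_ref, replace=False)]
        cols = []
        for r in refs:
            nbr = Z[np.argsort(((Z - r) ** 2).sum(axis=1))[:n_local]]
            C = np.cov(nbr.T) + 1e-3 * np.eye(Z.shape[1])   # local covariance at r
            Ci = np.linalg.inv(C)
            D = Z - r
            cols.append(np.exp(-0.5 * np.einsum('ij,jk,ik->i', D, Ci, D)))
        F = np.stack(cols, axis=1)                  # (n, n_R) asymmetric affinity matrix
        witness = F[:len(X)].mean(axis=0) - F[len(X):].mean(axis=0)
        return (witness ** 2).sum()                 # two-sample statistic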

@article{fds347407,
   Author = {Cheng, X and Qiu, Q and Calderbank, R and Sapiro,
             G},
   Title = {RotDCF: Decomposition of convolutional filters for
             rotation-equivariant deep networks},
   Year = {2019},
   Month = {May},
   Abstract = {Explicit encoding of group actions in deep features makes it
             possible for convolutional neural networks (CNNs) to handle
             global deformations of images, which is critical to success
             in many vision tasks. This paper proposes to decompose the
             convolutional filters over joint steerable bases across the
             space and the group geometry simultaneously, namely a
             rotation-equivariant CNN with decomposed convolutional
             filters (RotDCF). This decomposition facilitates computing
             the joint convolution, which is proved to be necessary for
             the group equivariance. It significantly reduces the model
             size and computational complexity while preserving
              performance, and truncation of the basis expansion serves
              implicitly to regularize the filters. On datasets involving
             in-plane and out-of-plane object rotations, RotDCF deep
             features demonstrate greater robustness and interpretability
             than regular CNNs. The stability of the equivariant
             representation to input variations is also proved
             theoretically. The RotDCF framework can be extended to
             groups other than rotations, providing a general approach
             which achieves both group equivariance and representation
             stability at a reduced model size.},
   Key = {fds347407}
}

@article{fds339535,
   Author = {Cheng, X and Rachh, M and Steinerberger, S},
   Title = {On the diffusion geometry of graph Laplacians and
             applications},
   Journal = {Applied and Computational Harmonic Analysis},
   Volume = {46},
   Number = {3},
   Pages = {674-688},
   Publisher = {Elsevier BV},
   Year = {2019},
   Month = {May},
   url = {http://dx.doi.org/10.1016/j.acha.2018.04.001},
   Doi = {10.1016/j.acha.2018.04.001},
   Key = {fds339535}
}

@article{fds344623,
   Author = {Yan, B and Sarkar, P and Cheng, X},
   Title = {Provable estimation of the number of blocks in block
             models},
   Journal = {Proceedings of the Twenty-First International Conference on
             Artificial Intelligence and Statistics (AISTATS'18)},
   Volume = {84},
   Pages = {1185-1194},
   Publisher = {PMLR},
   Year = {2018},
   Month = {April},
   Abstract = {Community detection is a fundamental unsupervised learning
             problem for unlabeled networks which has a broad range of
             applications. Many community detection algorithms assume
             that the number of clusters r is known apriori. In this
             paper, we propose an approach based on semi-definite
             relaxations, which does not require prior knowledge of model
             parameters like many existing convex relaxation methods and
             recovers the number of clusters and the clustering matrix
             exactly under a broad parameter regime, with probability
             tending to one. On a variety of simulated and real data
             experiments, we show that the proposed method often
             outperforms state-of-the-art techniques for estimating the
             number of clusters.},
   Key = {fds344623}
}

@article{fds330801,
   Author = {Cheng, X and Mishne, G and Steinerberger, S},
   Title = {The geometry of nodal sets and outlier detection},
   Journal = {Journal of Number Theory},
   Volume = {185},
   Pages = {48-64},
   Publisher = {Elsevier BV},
   Year = {2018},
   Month = {April},
   url = {http://dx.doi.org/10.1016/j.jnt.2017.09.021},
   Doi = {10.1016/j.jnt.2017.09.021},
   Key = {fds330801}
}

@article{fds339536,
   Author = {Qiu, Q and Cheng, X and Calderbank, AR and Sapiro,
             G},
   Title = {DCFNet: Deep Neural Network with Decomposed Convolutional
             Filters.},
   Journal = {ICML},
   Volume = {80},
   Pages = {4195-4204},
   Publisher = {PMLR},
   Editor = {Dy, JG and Krause, A},
   Year = {2018},
   Key = {fds339536}
}

@article{fds330800,
   Author = {Lu, J and Lu, Y and Wang, X and Li, X and Linderman, GC and Wu, C and Cheng,
             X and Mu, L and Zhang, H and Liu, J and Su, M and Zhao, H and Spatz, ES and Spertus, JA and Masoudi, FA and Krumholz, HM and Jiang,
             L},
   Title = {Prevalence, awareness, treatment, and control of
             hypertension in China: data from 1·7 million adults in a
             population-based screening study (China PEACE Million
             Persons Project)},
   Journal = {The Lancet},
   Volume = {390},
   Number = {10112},
   Pages = {2549-2558},
   Publisher = {Elsevier BV},
   Year = {2017},
   Month = {December},
   url = {http://dx.doi.org/10.1016/s0140-6736(17)32478-9},
   Doi = {10.1016/s0140-6736(17)32478-9},
   Key = {fds330800}
}

@article{fds330802,
   Author = {Pragier, G and Greenberg, I and Cheng, X and Shkolnisky,
             Y},
   Title = {A Graph Partitioning Approach to Simultaneous Angular
             Reconstitution},
   Journal = {IEEE Transactions on Computational Imaging},
   Volume = {2},
   Number = {3},
   Pages = {323-334},
   Publisher = {Institute of Electrical and Electronics Engineers
             (IEEE)},
   Year = {2016},
   Month = {September},
   url = {http://dx.doi.org/10.1109/tci.2016.2557076},
   Doi = {10.1109/tci.2016.2557076},
   Key = {fds330802}
}

@article{fds330803,
   Author = {Zhang, T and Cheng, X and Singer, A},
   Title = {Marčenko–Pastur law for Tyler’s M-estimator},
   Journal = {Journal of Multivariate Analysis},
   Volume = {149},
   Pages = {114-123},
   Publisher = {Elsevier BV},
   Year = {2016},
   Month = {July},
   url = {http://dx.doi.org/10.1016/j.jmva.2016.03.010},
   Doi = {10.1016/j.jmva.2016.03.010},
   Key = {fds330803}
}

@article{fds330805,
   Author = {Cheng, X and Shaham, U and Dror, O and Jaffe, A and Nadler, B and Chang, J and Kluger, Y},
   Title = {A Deep Learning Approach to Unsupervised Ensemble
             Learning},
   Journal = {Proceedings of The 33rd International Conference on Machine
             Learning},
   Volume = {48},
   Pages = {30-39},
   Publisher = {PMLR},
   Year = {2016},
   Month = {June},
   Key = {fds330805}
}

@article{fds330804,
   Author = {Cheng, X and Chen, X and Mallat, S},
   Title = {Deep Haar scattering networks},
   Journal = {Information and Inference},
   Volume = {5},
   Number = {2},
   Pages = {105-133},
   Publisher = {Oxford University Press (OUP)},
   Year = {2016},
   Month = {June},
   url = {http://dx.doi.org/10.1093/imaiai/iaw007},
   Doi = {10.1093/imaiai/iaw007},
   Key = {fds330804}
}

@article{fds330806,
   Author = {Boumal, N and Cheng, X},
   Title = {Concentration of the Kirchhoff index for
             Erdős–Rényi graphs},
   Journal = {Systems & Control Letters},
   Volume = {74},
   Pages = {74-80},
   Publisher = {Elsevier BV},
   Year = {2014},
   Month = {December},
   url = {http://dx.doi.org/10.1016/j.sysconle.2014.10.006},
   Doi = {10.1016/j.sysconle.2014.10.006},
   Key = {fds330806}
}

@article{fds330807,
   Author = {Chen, X and Cheng, X and Mallat, S},
   Title = {Unsupervised Deep Haar Scattering on Graphs.},
   Journal = {Advances in Neural Information Processing Systems
             27},
   Pages = {1709-1717},
   Editor = {Ghahramani, Z and Welling, M and Cortes, C and Lawrence, ND and Weinberger, KQ},
   Year = {2014},
   Key = {fds330807}
}

@article{fds330808,
   Author = {Cheng, Xiuyuan and Singer, Amit},
   Title = {The Spectrum of Random Inner-product Kernel
             Matrices},
   Journal = {Random Matrices: Theory and Applications},
   Volume = {02},
   Number = {04},
   Pages = {1350010-1350010},
   Publisher = {World Scientific Pub Co Pte Lt},
   Year = {2013},
   Month = {October},
   url = {http://dx.doi.org/10.1142/S201032631350010X},
   Doi = {10.1142/S201032631350010X},
   Key = {fds330808}
}

@article{fds330809,
   Author = {E, W and Zhou, X and Cheng, X},
   Title = {Subcritical bifurcation in spatially extended
             systems},
   Journal = {Nonlinearity},
   Volume = {25},
   Number = {3},
   Pages = {761-779},
   Publisher = {IOP Publishing},
   Year = {2012},
   Month = {March},
   url = {http://dx.doi.org/10.1088/0951-7715/25/3/761},
   Doi = {10.1088/0951-7715/25/3/761},
   Key = {fds330809}
}

@article{fds330810,
   Author = {Cheng, X and Lin, L and E, W and Zhang, P and Shi, A-C},
   Title = {Nucleation of Ordered Phases in Block Copolymers},
   Journal = {Physical Review Letters},
   Volume = {104},
   Number = {14},
   Publisher = {American Physical Society (APS)},
   Year = {2010},
   Month = {April},
   url = {http://dx.doi.org/10.1103/physrevlett.104.148301},
   Doi = {10.1103/physrevlett.104.148301},
   Key = {fds330810}
}

@article{fds330811,
   Author = {Lin, L and Cheng, X and E, W and Shi, A-C and Zhang,
             P},
   Title = {A numerical method for the study of nucleation of ordered
             phases},
   Journal = {Journal of Computational Physics},
   Volume = {229},
   Number = {5},
   Pages = {1797-1809},
   Publisher = {Elsevier BV},
   Year = {2010},
   Month = {March},
   url = {http://dx.doi.org/10.1016/j.jcp.2009.11.009},
   Doi = {10.1016/j.jcp.2009.11.009},
   Key = {fds330811}
}

 
