%% Papers Published
@article{fds374290,
Author = {Repasky, M and Cheng, X and Xie, Y},
Title = {Neural Stein Critics with Staged L2-Regularization},
Journal = {IEEE Transactions on Information Theory},
Volume = {69},
Number = {11},
Pages = {7246-7275},
Year = {2023},
Month = {November},
url = {http://dx.doi.org/10.1109/TIT.2023.3299258},
Abstract = {Learning to differentiate model distributions from observed
data is a fundamental problem in statistics and machine
learning, and high-dimensional data remains a challenging
setting for such problems. Metrics that quantify the
disparity in probability distributions, such as the Stein
discrepancy, play an important role in high-dimensional
statistical testing. In this paper, we investigate the role
of L2 regularization in training a neural network Stein
critic so as to distinguish between data sampled from an
unknown probability distribution and a nominal model
distribution. Making a connection to the Neural Tangent
Kernel (NTK) theory, we develop a novel staging procedure
for the weight of regularization over training time, which
leverages the advantages of highly-regularized training at
early times. Theoretically, we prove that the training
dynamics are approximated by kernel optimization, namely
'lazy training', when the L2 regularization weight is large,
and that training on n samples converges at a rate of
O(n^{-1/2}) up
to a log factor. The result guarantees learning the optimal
critic assuming sufficient alignment with the leading
eigen-modes of the zero-time NTK. The benefit of the staged
L2 regularization is demonstrated on simulated
high-dimensional data and an application to evaluating generative
models of image data.},
Doi = {10.1109/TIT.2023.3299258},
Key = {fds374290}
}
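%% Editor's sketch: the staged L2 weight described in the abstract above, in a
%% minimal PyTorch training loop. The Gaussian nominal model q = N(0, I) (so
%% grad log q(x) = -x), the network size, and the two-level schedule lam(t) are
%% illustrative assumptions, not the paper's settings.

import torch
import torch.nn as nn

d = 2
critic = nn.Sequential(nn.Linear(d, 64), nn.Tanh(), nn.Linear(64, d))

def stein_operator(f, x, score_q):
    # T_q f(x) = <grad log q(x), f(x)> + div f(x); divergence via autograd.
    x = x.clone().requires_grad_(True)
    fx = f(x)
    div = 0.0
    for i in range(x.shape[1]):
        div = div + torch.autograd.grad(fx[:, i].sum(), x, create_graph=True)[0][:, i]
    return (score_q(x) * fx).sum(dim=1) + div

def lam(t, T, lam_hi=10.0, lam_lo=0.1):
    # Staged weight: heavy regularization early (the near-kernel "lazy" regime),
    # then decayed for the later stage of training.
    return lam_hi if t < T // 3 else lam_lo

opt = torch.optim.Adam(critic.parameters(), lr=1e-3)
X = torch.randn(512, d) + 0.5          # samples from the unknown distribution p
T = 300
for t in range(T):
    obj = (stein_operator(critic, X, lambda x: -x).mean()
           - 0.5 * lam(t, T) * critic(X).pow(2).sum(dim=1).mean())
    loss = -obj                        # maximize the regularized Stein objective
    opt.zero_grad(); loss.backward(); opt.step()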
@article{fds374292,
Author = {Landa, B and Cheng, X},
Title = {Robust Inference of Manifold Density and Geometry by Doubly
Stochastic Scaling},
Journal = {SIAM Journal on Mathematics of Data Science},
Volume = {5},
Number = {3},
Pages = {589-614},
Publisher = {Society for Industrial & Applied Mathematics
(SIAM)},
Year = {2023},
Month = {September},
url = {http://dx.doi.org/10.1137/22m1516968},
Doi = {10.1137/22m1516968},
Key = {fds374292}
}
@article{fds374293,
Author = {Lee, J and Xie, Y and Cheng, X},
Title = {Training Neural Networks for Sequential Change-Point
Detection},
Journal = {ICASSP, IEEE International Conference on Acoustics, Speech
and Signal Processing - Proceedings},
Volume = {2023-June},
Year = {2023},
Month = {January},
ISBN = {9781728163277},
url = {http://dx.doi.org/10.1109/ICASSP49357.2023.10095005},
Abstract = {Detecting an abrupt distributional shift of a data stream,
known as change-point detection, is a fundamental problem in
statistics and machine learning. We introduce a novel
approach for online change-point detection using neural
networks. Specifically, we train neural networks to compute
the cumulative sum of a detection statistic sequentially,
which exhibits a significant change when a change-point
occurs. We demonstrate the superiority and potential of the
proposed method in detecting change-points using both
synthetic and real-world data.},
Doi = {10.1109/ICASSP49357.2023.10095005},
Key = {fds374293}
}
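%% Editor's sketch: the CUSUM-style recursion underlying the approach above. A
%% per-sample statistic g (here a fixed Gaussian log-likelihood ratio; in the
%% paper, a trained neural network) is accumulated sequentially and a change is
%% declared when the running sum crosses a threshold b. All values are toy choices.

import numpy as np

def cusum_detect(xs, g, b):
    S, path = 0.0, []
    for x in xs:
        S = max(0.0, S + g(x))         # cumulative sum of the statistic, clipped at 0
        path.append(S)
        if S > b:                      # threshold crossing = declared change-point
            return len(path) - 1, np.array(path)
    return None, np.array(path)

rng = np.random.default_rng(0)
xs = np.concatenate([rng.normal(0, 1, 200), rng.normal(1, 1, 200)])  # change at t = 200
g = lambda x: x - 0.5                  # log-likelihood ratio of N(1,1) vs N(0,1)
t_hat, _ = cusum_detect(xs, g, b=20.0)
print(t_hat)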
@article{fds368016,
Author = {Cheng, X and Wu, N},
Title = {Eigen-convergence of Gaussian kernelized graph Laplacian by
manifold heat interpolation},
Journal = {Applied and Computational Harmonic Analysis},
Volume = {61},
Pages = {132-190},
Year = {2022},
Month = {November},
url = {http://dx.doi.org/10.1016/j.acha.2022.06.003},
Abstract = {We study the spectral convergence of graph Laplacians to the
Laplace-Beltrami operator when the kernelized graph affinity
matrix is constructed from N random samples on a
d-dimensional manifold in an ambient Euclidean space. By
analyzing Dirichlet form convergence and constructing
candidate approximate eigenfunctions via convolution with
manifold heat kernel, we prove eigen-convergence with rates
as N increases. The best eigenvalue convergence rate is
$N^{-1/(d/2+2)}$ (when the kernel bandwidth parameter
$\epsilon \sim (\log N/N)^{1/(d/2+2)}$) and the best eigenvector
2-norm convergence rate is $N^{-1/(d/2+3)}$ (when
$\epsilon \sim (\log N/N)^{1/(d/2+3)}$). These rates hold up to a
$\log N$-factor for finitely many
low-lying eigenvalues of both un-normalized and normalized
graph Laplacians. When data density is non-uniform, we prove
the same rates for the density-corrected graph Laplacian,
and we also establish new operator point-wise convergence
rate and Dirichlet form convergence rate as intermediate
results. Numerical results are provided to support the
theory.},
Doi = {10.1016/j.acha.2022.06.003},
Key = {fds368016}
}
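%% Editor's sketch: a Gaussian kernelized graph Laplacian on manifold data with
%% the abstract's eigenvalue-optimal bandwidth scaling eps ~ (log N / N)^{1/(d/2+2)}.
%% The unit circle (d = 1), the kernel normalization exp(-r^2/(4 eps)), and the
%% 1/(N eps) scaling are illustrative conventions, not the paper's exact constants.

import numpy as np
from scipy.spatial.distance import pdist, squareform

rng = np.random.default_rng(0)
N, d = 1000, 1
theta = rng.uniform(0.0, 2.0 * np.pi, N)
X = np.stack([np.cos(theta), np.sin(theta)], axis=1)   # 1D manifold in ambient R^2

eps = (np.log(N) / N) ** (1.0 / (d / 2.0 + 2.0))       # bandwidth at the optimal rate
W = np.exp(-squareform(pdist(X, 'sqeuclidean')) / (4.0 * eps))
D = W.sum(axis=1)
L_un = (np.diag(D) - W) / (N * eps)                    # un-normalized graph Laplacian

# Low-lying eigenvalues approximate the circle's Laplace-Beltrami spectrum
# (0, 1, 1, 4, 4, ...) up to an overall constant set by the kernel and density.
print(np.linalg.eigvalsh(L_un)[:5])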
@article{fds367221,
Author = {Tan, Y and Zhang, Y and Cheng, X and Zhou, X-H},
Title = {Statistical inference using GLEaM model with spatial
heterogeneity and correlation between regions.},
Journal = {Scientific reports},
Volume = {12},
Number = {1},
Pages = {16630},
Year = {2022},
Month = {October},
url = {http://dx.doi.org/10.1038/s41598-022-18775-8},
Abstract = {A better understanding of various patterns in the
coronavirus disease 2019 (COVID-19) spread in different
parts of the world is crucial to its prevention and control.
Motivated by the previously developed Global Epidemic and
Mobility (GLEaM) model, this paper proposes a new stochastic
dynamic model to depict the evolution of COVID-19. The model
allows spatial and temporal heterogeneity of transmission
parameters and involves transportation between regions.
Based on the proposed model, this paper also designs a
two-step procedure for parameter inference, which utilizes
the correlation between regions through a prior distribution
that imposes graph Laplacian regularization on transmission
parameters. Experiments on simulated data and real-world
data in China and Europe indicate that the proposed model
achieves higher accuracy in predicting the newly confirmed
cases than baseline models.},
Doi = {10.1038/s41598-022-18775-8},
Key = {fds367221}
}
@article{fds368019,
Author = {Cheng, X and Cloninger, A},
Title = {Classification logit two-sample testing by neural networks
for differentiating near manifold densities.},
Journal = {IEEE transactions on information theory},
Volume = {68},
Number = {10},
Pages = {6631-6662},
Year = {2022},
Month = {October},
url = {http://dx.doi.org/10.1109/tit.2022.3175691},
Abstract = {The recent success of generative adversarial networks and
variational learning suggests that training a classification
network may work well in addressing the classical two-sample
problem, which asks to differentiate two densities given
finite samples from each one. Network-based methods have the
computational advantage that the algorithm scales to large
datasets. This paper considers using the classification
logit function, which is provided by a trained
classification neural network and evaluated on the testing
set split of the two datasets, to compute a two-sample
statistic. To analyze the approximation and estimation error
of the logit function to differentiate near-manifold
densities, we introduce a new result of near-manifold
integral approximation by neural networks. We then show that
the logit function provably differentiates two
sub-exponential densities given that the network is
sufficiently parametrized, and for densities on or near a
manifold, the needed network complexity scales only with the
intrinsic dimensionality. In experiments, the
network logit test demonstrates better performance than
previous network-based tests using classification accuracy,
and also compares favorably to certain kernel maximum mean
discrepancy tests on synthetic datasets and hand-written
digit datasets.},
Doi = {10.1109/tit.2022.3175691},
Key = {fds368019}
}
@article{fds368017,
Author = {Cheng, X and Wu, H-T},
Title = {Convergence of graph Laplacian with kNN self-tuned
kernels},
Journal = {Information and Inference: A Journal of the
IMA},
Volume = {11},
Number = {3},
Pages = {889-957},
Publisher = {Oxford University Press (OUP)},
Year = {2022},
Month = {September},
url = {http://dx.doi.org/10.1093/imaiai/iaab019},
Abstract = {Kernelized Gram matrix $W$ constructed from data points
$\{x_i\}_{i=1}^N$ as $W_{ij}= k_0( \frac{ \| x_i - x_j \|^2}
{\sigma ^2} ) $ is
widely used in graph-based geometric data analysis and
unsupervised learning. An important question is how to
choose the kernel bandwidth $\sigma $, and a common practice
called self-tuned kernel adaptively sets a $\sigma _i$ at
each point $x_i$ by the $k$-nearest neighbor (kNN) distance.
When $x_i$s are sampled from a $d$-dimensional manifold
embedded in a possibly high-dimensional space, unlike with
fixed-bandwidth kernels, theoretical results of graph
Laplacian convergence with self-tuned kernels have been
incomplete. This paper proves the convergence of graph
Laplacian operator $L_N$ to manifold (weighted-)Laplacian
for a new family of kNN self-tuned kernels $W^{(\alpha
)}_{ij} = k_0( \frac{ \| x_i - x_j \|^2}{ \epsilon \hat{\rho
}(x_i) \hat{\rho }(x_j)})/\hat{\rho }(x_i)^\alpha \hat{\rho
}(x_j)^\alpha $, where $\hat{\rho }$ is the estimated
bandwidth function by kNN and the limiting operator is also
parametrized by $\alpha $. When $\alpha = 1$, the limiting
operator is the weighted manifold Laplacian $\varDelta _p$.
Specifically, we prove the point-wise convergence of $L_N f
$ and convergence of the graph Dirichlet form with rates.
Our analysis is based on first establishing a $C^0$
consistency for $\hat{\rho }$ which bounds the relative
estimation error $|\hat{\rho } - \bar{\rho }|/\bar{\rho }$
uniformly with high probability, where $\bar{\rho } =
p^{-1/d}$ and $p$ is the data density function. Our
theoretical results reveal the advantage of the self-tuned
kernel over the fixed-bandwidth kernel via smaller variance
error in low-density regions. In the algorithm, no prior
knowledge of $d$ or data density is needed. The theoretical
results are supported by numerical experiments on simulated
data and hand-written digit image data.},
Doi = {10.1093/imaiai/iaab019},
Key = {fds368017}
}
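%% Editor's sketch: the kNN self-tuned kernel family W^{(alpha)} defined in the
%% abstract, with rho_hat the kNN-distance bandwidth. The Gaussian k0, the toy
%% data, and the parameter values are illustrative assumptions.

import numpy as np
from scipy.spatial.distance import pdist, squareform

rng = np.random.default_rng(0)
X = rng.standard_normal((500, 3))
k, eps, alpha = 16, 0.05, 1.0

D2 = squareform(pdist(X, 'sqeuclidean'))
rho = np.sqrt(np.sort(D2, axis=1)[:, k])        # rho_hat(x_i) = distance to the kNN
K0 = np.exp(-D2 / (eps * np.outer(rho, rho)))   # k0(||xi - xj||^2 / (eps rho_i rho_j))
W = K0 / np.outer(rho ** alpha, rho ** alpha)   # divide by rho_i^alpha rho_j^alpha
L = np.diag(W.sum(axis=1)) - W                  # graph Laplacian with self-tuned kernel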
@article{fds374296,
Author = {Xu, C and Cheng, X and Xie, Y},
Title = {Invertible Neural Networks for Graph Prediction},
Journal = {IEEE Journal on Selected Areas in Information
Theory},
Volume = {3},
Number = {3},
Pages = {454-467},
Publisher = {Institute of Electrical and Electronics Engineers
(IEEE)},
Year = {2022},
Month = {September},
url = {http://dx.doi.org/10.1109/JSAIT.2022.3221864},
Abstract = {Graph prediction problems are prevalent in data analysis and
machine learning. The inverse prediction problem, namely to
infer input data from given output labels, is of emerging
interest in various applications. In this work, we develop
invertible graph neural network (iGNN), a deep generative
model to tackle the inverse prediction problem on graphs by
casting it as a conditional generative task. The proposed
model consists of an invertible sub-network that maps
one-to-one from data to an intermediate encoded feature,
which allows forward prediction by a linear classification
sub-network as well as efficient generation from output
labels via a parametric mixture model. The invertibility of
the encoding sub-network is ensured by a Wasserstein-2
regularization which allows free-form layers in the residual
blocks. The model scales to large graphs via a factorized
parametric mixture model of the encoded feature and is
computationally efficient through the use of GNN layers. The
existence of invertible flow mapping is backed by theories
of optimal transport and diffusion process, and we prove the
expressiveness of graph convolution layers to approximate
the theoretical flows of graph data. The proposed iGNN model
is experimentally examined on synthetic data, including the
example on large graphs, and the empirical advantage is also
demonstrated on real-application datasets of solar ramping
event data and traffic flow anomaly detection.},
Doi = {10.1109/JSAIT.2022.3221864},
Key = {fds374296}
}
@article{fds368018,
Author = {Zhu, W and Qiu, Q and Calderbank, R and Sapiro, G and Cheng,
X},
Title = {Scaling-Translation-Equivariant Networks with Decomposed
Convolutional Filters},
Journal = {Journal of Machine Learning Research},
Volume = {23},
Year = {2022},
Month = {January},
Abstract = {Encoding the scale information explicitly into the
representation learned by a convolutional neural network
(CNN) is beneficial for many computer vision tasks
especially when dealing with multiscale inputs. We study, in
this paper, a scaling-translation-equivariant
(ST-equivariant) CNN with joint convolutions across the space
and the scaling group, which is shown to be both sufficient
and necessary to achieve equivariance for the regular
representation of the scaling-translation group ST. To
reduce the model complexity and computational burden, we
decompose the convolutional filters under two pre-fixed
separable bases and truncate the expansion to low-frequency
components. A further benefit of the truncated filter
expansion is the improved deformation robustness of the
equivariant representation, a property which is
theoretically analyzed and empirically verified. Numerical
experiments demonstrate that the proposed
scaling-translation-equivariant network with decomposed
convolutional filters (ScDCFNet) achieves significantly
improved performance in multiscale image classification and
better interpretability than regular CNNs at a reduced model
size.},
Key = {fds368018}
}
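%% Editor's sketch: the decomposed-filter idea described above, i.e. expanding a
%% filter bank on a pre-fixed separable basis truncated to low-frequency modes.
%% The cosine basis and the random filters are illustrative stand-ins for the
%% paper's bases and learned filters.

import numpy as np

rng = np.random.default_rng(0)
n_filt, s = 64, 5                                # 64 filters of spatial size 5 x 5
filters = rng.standard_normal((n_filt, s * s))

K = 3                                            # keep only the first K 1D modes
t = (np.arange(s) + 0.5) / s
modes = np.stack([np.cos(np.pi * k * t) for k in range(K)])              # (K, s)
basis = np.stack([np.outer(u, v).ravel() for u in modes for v in modes]) # separable

coeff, *_ = np.linalg.lstsq(basis.T, filters.T, rcond=None)  # expansion coefficients
recon = (basis.T @ coeff).T
err = np.linalg.norm(filters - recon) / np.linalg.norm(filters)
print(f"{K * K} coefficients per filter instead of {s * s} weights; rel. error {err:.2f}")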
@article{fds370994,
Author = {Zhu, S and Wang, H and Dong, Z and Cheng, X and Xie,
Y},
Title = {Neural Spectral Marked Point Processes},
Journal = {ICLR 2022 - 10th International Conference on Learning
Representations},
Year = {2022},
Month = {January},
Abstract = {Self- and mutually-exciting point processes are popular
models in machine learning and statistics for dependent
discrete event data. To date, most existing models assume
stationary kernels (including the classical Hawkes
processes) and simple parametric models. Modern applications
with complex event data require more general point process
models that can incorporate contextual information of the
events, called marks, besides the temporal and location
information. Moreover, such applications often require
non-stationary models to capture more complex
spatio-temporal dependence. To tackle these challenges, a
key question is to devise a versatile influence kernel in
the point process model. In this paper, we introduce a novel
and general neural network-based non-stationary influence
kernel with high expressiveness for handling complex
discrete events data while providing theoretical performance
guarantees. We demonstrate the superior performance of our
proposed method compared with the state-of-the-art on
synthetic and real data.},
Key = {fds370994}
}
@article{fds374297,
Author = {Chen, Z and Li, Y and Cheng, X},
Title = {SpecNet2: Orthogonalization-free Spectral Embedding by
Neural Networks},
Journal = {Proceedings of Machine Learning Research},
Volume = {190},
Pages = {287-302},
Year = {2022},
Month = {January},
Abstract = {Spectral methods which represent data points by eigenvectors
of kernel matrices or graph Laplacian matrices have been a
primary tool in unsupervised data analysis. In many
application scenarios, parametrizing the spectral embedding
by a neural network that can be trained over batches of data
samples gives a promising way to achieve automatic
out-of-sample extension as well as computational
scalability. Such an approach was taken in the original
paper of SpectralNet (Shaham et al. 2018), which we call
SpecNet1. The current paper introduces a new neural network
approach, named SpecNet2, to compute spectral embedding,
which optimizes an objective equivalent to the eigen-problem
and removes the orthogonalization layer in SpecNet1.
SpecNet2 also allows separating the sampling of rows and
columns of the graph affinity matrix by tracking the
neighbors of each data point through the gradient formula.
Theoretically, we show that any local minimizer of the new
orthogonalization-free objective reveals the leading
eigenvectors. Furthermore, global convergence for this new
orthogonalization-free objective using a batch-based
gradient descent method is proved. Numerical experiments
demonstrate the improved performance and computational
efficiency of SpecNet2 on simulated data and image
datasets.},
Key = {fds374297}
}
@article{fds360266,
Author = {Zhao, J and Jaffe, A and Li, H and Lindenbaum, O and Sefik, E and Jackson,
R and Cheng, X and Flavell, RA and Kluger, Y},
Title = {Detection of differentially abundant cell subpopulations in
scRNA-seq data.},
Journal = {Proceedings of the National Academy of Sciences of the
United States of America},
Volume = {118},
Number = {22},
Pages = {e2100293118},
Year = {2021},
Month = {June},
url = {http://dx.doi.org/10.1073/pnas.2100293118},
Abstract = {Comprehensive and accurate comparisons of transcriptomic
distributions of cells from samples taken from two different
biological states, such as healthy versus diseased
individuals, are an emerging challenge in single-cell RNA
sequencing (scRNA-seq) analysis. Current methods for
detecting differentially abundant (DA) subpopulations
between samples rely heavily on initial clustering of all
cells in both samples. Often, this clustering step is
inadequate since the DA subpopulations may not align with a
clear cluster structure, and important differences between
the two biological states can be missed. Here, we introduce
DA-seq, a targeted approach for identifying DA
subpopulations not restricted to clusters. DA-seq is a
multiscale method that quantifies a local DA measure for
each cell, which is computed from its $k$ nearest
neighboring cells across a range of $k$ values. Based
on this measure, DA-seq delineates contiguous significant DA
subpopulations in the transcriptomic space. We apply DA-seq
to several scRNA-seq datasets and highlight its improved
ability to detect differences between distinct phenotypes in
severe versus mildly ill COVID-19 patients, melanomas
subjected to immune checkpoint therapy comparing responders
to nonresponders, embryonic development at two time points,
and young versus aging brain tissue. DA-seq enabled us to
detect differences between these phenotypes. Importantly, we
find that DA-seq not only recovers the DA cell types as
discovered in the original studies but also reveals
additional DA subpopulations that were not described before.
Analysis of these subpopulations yields biological insights
that would otherwise be undetected using conventional
computational approaches.},
Doi = {10.1073/pnas.2100293118},
Key = {fds360266}
}
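%% Editor's sketch: a multiscale local differential-abundance score in the spirit
%% of the DA-seq measure described above (for each cell, the centered fraction of
%% sample-1 cells among its k nearest neighbors, over a range of k). This is an
%% illustration, not the packaged DA-seq pipeline.

import numpy as np
from sklearn.neighbors import NearestNeighbors

def da_score(Z, labels, k_values):
    # Z: (n, p) embedding of cells pooled from both samples; labels: 0/1 sample of origin.
    p1 = labels.mean()
    nbrs = NearestNeighbors(n_neighbors=max(k_values) + 1).fit(Z)
    _, idx = nbrs.kneighbors(Z)                  # idx[:, 0] is the point itself
    return np.stack([labels[idx[:, 1:k + 1]].mean(axis=1) - p1 for k in k_values], axis=1)

rng = np.random.default_rng(0)
Z = rng.standard_normal((1000, 2))
lab = rng.integers(0, 2, 1000).astype(float)
S = da_score(Z, lab, k_values=[50, 100, 200])    # large |S| flags candidate DA cells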
@article{fds360267,
Author = {Zhang, Y and Cheng, X and Reeves, G},
Title = {Convergence of Gaussian-smoothed optimal transport distance
with sub-gamma distributions and dependent
samples},
Journal = {Proceedings of Machine Learning Research},
Volume = {130},
Pages = {2422-2430},
Year = {2021},
Month = {January},
Abstract = {The Gaussian-smoothed optimal transport (GOT) framework,
recently proposed by Goldfeld et al., scales to high
dimensions in estimation and provides an alternative to
entropy regularization. This paper provides convergence
guarantees for estimating the GOT distance under more
general settings. For the Gaussian-smoothed p-Wasserstein
distance in d dimensions, our results require only the
existence of a moment greater than d + 2p. For the special
case of sub-gamma distributions, we quantify the dependence
on the dimension d and establish a phase transition with
respect to the scale parameter. We also prove convergence
for dependent samples, only requiring a condition on the
pairwise dependence of the samples measured by the
covariance of the feature map of a kernel space. A key step
in our analysis is to show that the GOT distance is
dominated by a family of kernel maximum mean discrepancy
(MMD) distances with a kernel that depends on the cost
function as well as the amount of Gaussian smoothing. This
insight provides further interpretability for the GOT
framework and also introduces a class of kernel MMD
distances with desirable properties. The theoretical results
are supported by numerical experiments.},
Key = {fds360267}
}
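%% Editor's sketch: a plug-in Monte Carlo estimate of the Gaussian-smoothed
%% 2-Wasserstein distance: add N(0, sigma^2 I) noise to both samples and solve
%% exact OT between the smoothed empirical measures (equal sizes and uniform
%% weights, so OT reduces to an assignment problem). Toy data throughout.

import numpy as np
from scipy.optimize import linear_sum_assignment

def got2(X, Y, sigma, rng):
    Xs = X + sigma * rng.standard_normal(X.shape)
    Ys = Y + sigma * rng.standard_normal(Y.shape)
    C = ((Xs[:, None, :] - Ys[None, :, :]) ** 2).sum(-1)   # squared-Euclidean cost
    r, c = linear_sum_assignment(C)                        # exact OT for uniform weights
    return np.sqrt(C[r, c].mean())

rng = np.random.default_rng(0)
X = rng.standard_normal((300, 5))
Y = rng.standard_normal((300, 5)) + 0.3
print(got2(X, Y, sigma=1.0, rng=rng))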
@article{fds368020,
Author = {Miao, Z and Wang, Z and Cheng, X and Qiu, Q},
Title = {Spatiotemporal Joint Filter Decomposition in 3D
Convolutional Neural Networks},
Journal = {Advances in Neural Information Processing
Systems},
Volume = {5},
Pages = {3376-3388},
Year = {2021},
Month = {January},
ISBN = {9781713845393},
Abstract = {In this paper, we introduce spatiotemporal joint filter
decomposition to decouple spatial and temporal learning,
while preserving spatiotemporal dependency in a video. A 3D
convolutional filter is now jointly decomposed over a set of
spatial and temporal filter atoms respectively. In this way,
a 3D convolutional layer becomes three: a temporal atom
layer, a spatial atom layer, and a joint coefficient layer,
all three remaining convolutional. One obvious arithmetic
manipulation allowed in our joint decomposition is to swap
spatial or temporal atoms with a set of atoms that have the
same number but different sizes, while keeping the rest
unchanged. For example, as shown later, we can now achieve
tempo-invariance by simply dilating temporal atoms only. To
illustrate this useful atom-swapping property, we further
demonstrate how such a decomposition permits the direct
learning of 3D CNNs with full-size videos through iterations
of two consecutive sub-stages of learning: In the temporal
stage, full-temporal downsampled-spatial data are used to
learn temporal atoms and joint coefficients while fixing
spatial atoms. In the spatial stage, full-spatial
downsampled-temporal data are used for spatial atoms and
joint coefficients while fixing temporal atoms. We show
empirically on multiple action recognition datasets that
the decoupled spatiotemporal learning significantly reduces
the model memory footprints, and allows deep 3D CNNs to
model high-spatial long-temporal dependency with limited
computational resources while delivering comparable
performance.},
Key = {fds368020}
}
@article{fds368021,
Author = {Cheng, X and Xie, Y},
Title = {Neural Tangent Kernel Maximum Mean Discrepancy},
Journal = {Advances in Neural Information Processing
Systems},
Volume = {9},
Pages = {6658-6670},
Year = {2021},
Month = {January},
ISBN = {9781713845393},
Abstract = {We present a novel neural network Maximum Mean Discrepancy
(MMD) statistic by identifying a new connection between
neural tangent kernel (NTK) and MMD. This connection enables
us to develop a computationally efficient and
memory-efficient approach to compute the MMD statistic and
perform NTK-based two-sample tests, addressing the
long-standing challenge of the memory and computational
complexity of the MMD statistic, which is essential for
online implementations that assimilate new samples.
Theoretically, such a connection allows us to understand the
NTK test statistic properties, such as the Type-I error and
testing power for performing the two-sample test, by
adapting existing theories for kernel MMD. Numerical
experiments on synthetic and real-world datasets validate
the theory and demonstrate the effectiveness of the proposed
NTK-MMD statistic.},
Key = {fds368021}
}
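%% Editor's sketch: the standard unbiased quadratic-time MMD^2 statistic with a
%% pluggable kernel. The paper's NTK kernel would be substituted for the RBF
%% stand-in used here, which is an assumption for illustration.

import numpy as np

def mmd2_unbiased(X, Y, kernel):
    Kxx, Kyy, Kxy = kernel(X, X), kernel(Y, Y), kernel(X, Y)
    n, m = len(X), len(Y)
    np.fill_diagonal(Kxx, 0.0)                   # drop diagonal terms (unbiasedness)
    np.fill_diagonal(Kyy, 0.0)
    return Kxx.sum() / (n * (n - 1)) + Kyy.sum() / (m * (m - 1)) - 2.0 * Kxy.mean()

def rbf(X, Y, gamma=0.5):
    D2 = ((X[:, None, :] - Y[None, :, :]) ** 2).sum(-1)
    return np.exp(-gamma * D2)

rng = np.random.default_rng(0)
X = rng.standard_normal((200, 3))
Y = rng.standard_normal((200, 3)) + 0.2
print(mmd2_unbiased(X, Y, rbf))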
@article{fds370995,
Author = {Cheng, X and Miao, Z and Qiu, Q},
Title = {Graph Convolution with Low-rank Learnable Local
Filters},
Journal = {ICLR 2021 - 9th International Conference on Learning
Representations},
Year = {2021},
Month = {January},
Abstract = {Geometric variations like rotation, scaling, and viewpoint
changes pose a significant challenge to visual
understanding. One common solution is to directly model
certain intrinsic structures, e.g., using landmarks.
However, it then becomes non-trivial to build effective deep
models, especially when the underlying non-Euclidean grid is
irregular and coarse. Recent deep models using graph
convolutions provide an appropriate framework to handle such
non-Euclidean data, but many of them, particularly those
based on global graph Laplacians, lack expressiveness to
capture local features required for representation of
signals lying on the non-Euclidean grid. The current paper
introduces a new type of graph convolution with learnable
low-rank local filters, which is provably more expressive
than previous spectral graph convolution methods. The model
also provides a unified framework for both spectral and
spatial graph convolutions. To improve model robustness,
regularization by local graph Laplacians is introduced. The
representation stability against input graph data
perturbation is theoretically proved, making use of the
graph filter locality and the local graph regularization.
Experiments on spherical mesh data, real-world facial
expression recognition/skeleton-based action recognition
data, and data with simulated graph noise show the empirical
advantage of the proposed model.},
Key = {fds370995}
}
@article{fds360268,
Author = {Li, Y and Cheng, X and Lu, J},
Title = {Butterfly-net: Optimal function representation based on
convolutional neural networks},
Journal = {Communications in Computational Physics},
Volume = {28},
Number = {5},
Pages = {1838-1885},
Year = {2020},
Month = {November},
url = {http://dx.doi.org/10.4208/CICP.OA-2020-0214},
Abstract = {Deep networks, especially convolutional neural networks
(CNNs), have been successfully applied in various areas of
machine learning as well as to challenging problems in other
scientific and engineering fields. This paper introduces
Butterfly-net, a low-complexity CNN with structured and
sparse cross-channel connections, together with a Butterfly
initialization strategy for a family of networks.
Theoretical analysis of the approximation power of
Butterfly-net to the Fourier representation of input data
shows that the error decays exponentially as the depth
increases. Combining Butterfly-net with a fully connected
neural network, a large class of problems are proved to be
well approximated with network complexity depending on the
effective frequency bandwidth instead of the input
dimension. Regular CNN is covered as a special case in our
analysis. Numerical experiments validate the analytical
results on the approximation of Fourier kernels and energy
functionals of Poisson's equations. Moreover, all
experiments support that training from Butterfly
initialization outperforms training from random
initialization. Also, adding the remaining cross-channel
connections, although it significantly increases the
parameter count, does little to improve the post-training
accuracy and makes the model more sensitive to the data
distribution.},
Doi = {10.4208/CICP.OA-2020-0214},
Key = {fds360268}
}
@article{fds360269,
Author = {Mhaskar, HN and Cheng, X and Cloninger, A},
Title = {A Witness Function Based Construction of Discriminative
Models Using Hermite Polynomials},
Journal = {Frontiers in Applied Mathematics and Statistics},
Volume = {6},
Year = {2020},
Month = {August},
url = {http://dx.doi.org/10.3389/fams.2020.00031},
Abstract = {In machine learning, we are given a dataset of the form
(Formula presented.), drawn as i.i.d. samples from an
unknown probability distribution μ; the marginal
distribution for the xj's being μ*, and the marginals of
the kth class (Formula presented.) possibly overlapping. We
address the problem of detecting, with a high degree of
certainty, for which x we have (Formula presented.) for all
i ≠ k. We propose that rather than using a positive kernel
such as the Gaussian for estimation of these measures, using
a non-positive kernel that preserves a large number of
moments of these measures yields an optimal approximation.
We use multi-variate Hermite polynomials for this purpose,
and prove optimal and local approximation results in a
supremum norm in a probabilistic sense. Together with a
permutation test developed with the same kernel, we prove
that the kernel estimator serves as a “witness function”
in classification problems. Thus, if the value of this
estimator at a point x exceeds a certain threshold, then the
point is reliably in a certain class. This approach can be
used to modify pretrained algorithms, such as neural
networks or nonlinear dimension reduction techniques, to
identify in-class vs out-of-class regions for the purposes
of generative models, classification uncertainty, or finding
robust centroids. This fact is demonstrated in a number of
real-world data sets including MNIST, CIFAR10, Science News
documents, and LaLonde data sets.},
Doi = {10.3389/fams.2020.00031},
Key = {fds360269}
}
@article{fds360273,
Author = {Wang, Z and Cheng, X and Sapiro, G and Qiu, Q},
Title = {Stochastic Conditional Generative Networks with Basis
Decomposition},
Journal = {8th International Conference on Learning Representations,
ICLR 2020},
Publisher = {OpenReview.net},
Year = {2020},
Month = {January},
Abstract = {While generative adversarial networks (GANs) have
revolutionized machine learning, a number of open questions
remain to fully understand them and exploit their power. One
of these questions is how to efficiently achieve proper
diversity and sampling of the multi-mode data space. To
address this, we introduce BasisGAN, a stochastic
conditional multi-mode image generator. By exploiting the
observation that a convolutional filter can be well
approximated as a linear combination of a small set of basis
elements, we learn a plug-and-played basis generator to
stochastically generate basis elements, with just a few
hundred of parameters, to fully embed stochasticity into
convolutional filters. By sampling basis elements instead of
filters, we dramatically reduce the cost of modeling the
parameter space with no sacrifice on either image diversity
or fidelity. To illustrate this proposed plug-and-play
framework, we construct variants of BasisGAN based on
state-of-the-art conditional image generation networks, and
train the networks by simply plugging in a basis generator,
without additional auxiliary components, hyperparameters, or
training objectives. The experimental success is
complemented with theoretical results indicating how the
perturbations introduced by the proposed sampling of basis
elements can propagate to the appearance of generated
images.},
Key = {fds360273}
}
@article{fds360271,
Author = {Li, H and Lindenbaum, O and Cheng, X and Cloninger,
A},
Title = {Variational Diffusion Autoencoders with Random Walk
Sampling},
Journal = {Lecture Notes in Computer Science (including subseries
Lecture Notes in Artificial Intelligence and Lecture Notes
in Bioinformatics)},
Volume = {12368 LNCS},
Pages = {362-378},
Year = {2020},
Month = {January},
ISBN = {9783030585914},
url = {http://dx.doi.org/10.1007/978-3-030-58592-1_22},
Abstract = {Variational autoencoders (VAEs) and generative adversarial
networks (GANs) enjoy an intuitive connection to manifold
learning: in training the decoder/generator is optimized to
approximate a homeomorphism between the data distribution
and the sampling space. This is a construction that strives
to define the data manifold. A major obstacle to VAEs and
GANs, however, is choosing a suitable prior that matches the
data topology. Well-known consequences of poorly picked
priors are posterior and mode collapse. To our knowledge, no
existing method sidesteps this user choice. Conversely,
diffusion maps automatically infer the data topology and
enjoy a rigorous connection to manifold learning, but do not
scale easily or provide the inverse homeomorphism (i.e.
decoder/generator). We propose a method (https://github.com/lihenryhfl/vdae)
that combines these approaches into a generative model that
inherits the asymptotic guarantees of diffusion maps while
preserving the scalability of deep models. We prove
approximation theoretic results for the dimension dependence
of our proposed method. Finally, we demonstrate the
effectiveness of our method with various real and synthetic
datasets.},
Doi = {10.1007/978-3-030-58592-1_22},
Key = {fds360271}
}
@article{fds360272,
Author = {Cheng, X and Mishne, G},
Title = {Spectral Embedding Norm: Looking Deep into the Spectrum of
the Graph Laplacian.},
Journal = {SIAM journal on imaging sciences},
Volume = {13},
Number = {2},
Pages = {1015-1048},
Year = {2020},
Month = {January},
url = {http://dx.doi.org/10.1137/18m1283160},
Abstract = {The extraction of clusters from a dataset which includes
multiple clusters and a significant background component is
a non-trivial task of practical importance. In image
analysis this manifests for example in anomaly detection and
target detection. The traditional spectral clustering
algorithm, which relies on the leading $K$ eigenvectors
to detect $K$ clusters, fails in such cases. In this
paper we propose the \emph{spectral embedding norm}, which
sums the squared values of the first $I$ normalized
eigenvectors, where $I$ can be significantly larger
than $K$. We prove that this quantity can be used to
separate clusters from the background in unbalanced
settings, including extreme cases such as outlier detection.
The performance of the algorithm is not sensitive to the
choice of $I$, and we demonstrate its application on
synthetic and real-world remote sensing and neuroimaging
datasets.},
Doi = {10.1137/18m1283160},
Key = {fds360272}
}
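%% Editor's sketch: the spectral embedding norm from the abstract, i.e. for each
%% point the sum of squared entries of the first I normalized eigenvectors, with
%% I much larger than the number of clusters. The symmetric normalization and the
%% toy cluster-plus-background data are illustrative choices.

import numpy as np
from scipy.spatial.distance import pdist, squareform

rng = np.random.default_rng(0)
X = np.vstack([rng.standard_normal((60, 2)) * 0.1 + [2, 2],   # small cluster
               rng.standard_normal((400, 2))])                # large background
W = np.exp(-squareform(pdist(X, 'sqeuclidean')) / 0.5)
d_inv_sqrt = 1.0 / np.sqrt(W.sum(axis=1))
A = d_inv_sqrt[:, None] * W * d_inv_sqrt[None, :]             # normalized affinity
vals, vecs = np.linalg.eigh(A)                                # ascending eigenvalues

I = 20
norm_I = (vecs[:, -I:] ** 2).sum(axis=1)   # spectral embedding norm with I >> K
# Cluster points receive markedly larger values than background points.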
@article{fds374298,
Author = {Alaifari, R and Cheng, X and Pierce, LB and Steinerberger,
S},
Title = {On matrix rearrangement inequalities},
Journal = {Proceedings of the American Mathematical
Society},
Volume = {148},
Number = {5},
Pages = {1835-1848},
Year = {2020},
Month = {January},
url = {http://dx.doi.org/10.1090/proc/14831},
Abstract = {Given two symmetric and positive semidefinite square
matrices A,B, is it true that any matrix given as the
product of m copies of A and n copies of B in a particular
sequence must be dominated in the spectral norm by the
ordered matrix product $A^mB^n$? For example, is
$\|AABAABABB\| \leq \|AAAAABBBB\|$? Drury [Electron. J. Linear
Algebra 18 (2009), pp. 13–20] has characterized precisely which
disordered words have the property that an inequality of
this type holds for all matrices A,B. However, the
1-parameter family of counterexamples Drury constructs for
these characterizations consists of 3 × 3 matrices, and
thus as stated the characterization applies only for N × N
matrices with N ≤ 3. In contrast, we prove that for 2 × 2
matrices, the general rearrangement inequality holds for all
disordered words. We also show that for larger N × N
matrices, the general rearrangement inequality holds for all
disordered words for most A,B (in a sense of full measure)
that are sufficiently small perturbations of the
identity.},
Doi = {10.1090/proc/14831},
Key = {fds374298}
}
@article{fds374299,
Author = {Xu, Z and Li, Y and Cheng, X},
Title = {Butterfly-Net2: Simplified Butterfly-Net and Fourier
Transform Initialization},
Journal = {Proceedings of Machine Learning Research},
Volume = {107},
Pages = {431-450},
Year = {2020},
Month = {January},
Abstract = {Structured CNNs designed using prior information about the
problem can improve efficiency over conventional CNNs in
various tasks, such as solving PDEs and inverse problems in
signal processing. This paper introduces BNet2, a
simplified Butterfly-Net in line with the conventional
CNN. Moreover, a Fourier transform initialization is
proposed for both BNet2 and CNN with guaranteed
approximation power to represent the Fourier transform
operator. Experimentally, BNet2 and the Fourier transform
initialization strategy are tested on various tasks,
including approximating Fourier transform operator,
end-to-end solvers of linear and nonlinear PDEs, and
denoising and deblurring of 1D signals. On all tasks, under
the same initialization, BNet2 achieves similar accuracy as
CNN but has fewer parameters, and the Fourier transform
initialized BNet2 and CNN consistently improve the training
and testing accuracy over the randomly initialized
CNN.},
Key = {fds374299}
}
@article{fds347406,
Author = {Cheng, X and Cloninger, A and Coifman, RR},
Title = {Two-sample statistics based on anisotropic
kernels},
Journal = {Information and Inference: A Journal of the
IMA},
Publisher = {Oxford University Press (OUP)},
Year = {2019},
Month = {December},
url = {http://dx.doi.org/10.1093/imaiai/iaz018},
Abstract = {<jats:title>Abstract</jats:title> <jats:p>The paper
introduces a new kernel-based Maximum Mean Discrepancy (MMD)
statistic for measuring the distance between two
distributions given finitely many multivariate samples. When
the distributions are locally low-dimensional, the proposed
test can be made more powerful to distinguish certain
alternatives by incorporating local covariance matrices and
constructing an anisotropic kernel. The kernel matrix is
asymmetric; it computes the affinity between $n$ data points
and a set of $n_R$ reference points, where $n_R$ can be
drastically smaller than $n$. While the proposed statistic
can be viewed as a special class of Reproducing Kernel
Hilbert Space MMD, the consistency of the test is proved,
under mild assumptions on the kernel, as long as $\|p-q\|
\sqrt{n} \to \infty $, and a finite-sample lower bound of
the testing power is obtained. Applications to flow
cytometry and diffusion MRI datasets are demonstrated, which
motivate the proposed approach to compare
distributions.</jats:p>},
Doi = {10.1093/imaiai/iaz018},
Key = {fds347406}
}
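%% Editor's sketch: a statistic in the spirit of the anisotropic-kernel MMD
%% above, computing an asymmetric affinity between the n data points and a much
%% smaller set of n_R reference points with a local inverse covariance at each
%% reference. Identity covariances and toy data are simplifying assumptions, not
%% the paper's construction of local covariances.

import numpy as np

def anisotropic_stat(X, Y, R, inv_covs):
    def feat(Z):                                 # (len(Z), n_R) affinities to references
        diff = Z[:, None, :] - R[None, :, :]
        d2 = np.einsum('ijk,jkl,ijl->ij', diff, inv_covs, diff)  # local Mahalanobis
        return np.exp(-0.5 * d2)
    return np.linalg.norm(feat(X).mean(axis=0) - feat(Y).mean(axis=0))

rng = np.random.default_rng(0)
X = rng.standard_normal((1000, 2))
Y = rng.standard_normal((1000, 2)) + 0.2
R = rng.standard_normal((50, 2))                 # n_R = 50 << n reference points
inv_covs = np.stack([np.eye(2)] * 50)            # local inverse covariances
print(anisotropic_stat(X, Y, R, inv_covs))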
@article{fds347407,
Author = {Cheng, X and Qiu, Q and Calderbank, R and Sapiro,
G},
Title = {RotDCF: Decomposition of convolutional filters for
rotation-equivariant deep networks},
Year = {2019},
Month = {May},
Abstract = {Explicit encoding of group actions in deep features makes it
possible for convolutional neural networks (CNNs) to handle
global deformations of images, which is critical to success
in many vision tasks. This paper proposes to decompose the
convolutional filters over joint steerable bases across the
space and the group geometry simultaneously, namely a
rotation-equivariant CNN with decomposed convolutional
filters (RotDCF). This decomposition facilitates computing
the joint convolution, which is proved to be necessary for
the group equivariance. It significantly reduces the model
size and computational complexity while preserving
performance, and truncation of the bases expansion serves
implicitly to regularize the filters. On datasets involving
in-plane and out-of-plane object rotations, RotDCF deep
features demonstrate greater robustness and interpretability
than regular CNNs. The stability of the equivariant
representation to input variations is also proved
theoretically. The RotDCF framework can be extended to
groups other than rotations, providing a general approach
which achieves both group equivariance and representation
stability at a reduced model size.},
Key = {fds347407}
}
@article{fds339535,
Author = {Cheng, X and Rachh, M and Steinerberger, S},
Title = {On the diffusion geometry of graph Laplacians and
applications},
Journal = {Applied and Computational Harmonic Analysis},
Volume = {46},
Number = {3},
Pages = {674-688},
Publisher = {Elsevier BV},
Year = {2019},
Month = {May},
url = {http://dx.doi.org/10.1016/j.acha.2018.04.001},
Doi = {10.1016/j.acha.2018.04.001},
Key = {fds339535}
}
@article{fds344623,
Author = {Yan, B and Sarkar, P and Cheng, X},
Title = {Provable estimation of the number of blocks in block
models},
Journal = {Proceedings of the Twenty-First International Conference on
Artificial Intelligence and Statistics (AISTATS'18)},
Volume = {84},
Pages = {1185-1194},
Publisher = {PMLR},
Year = {2018},
Month = {April},
Abstract = {Community detection is a fundamental unsupervised learning
problem for unlabeled networks which has a broad range of
applications. Many community detection algorithms assume
that the number of clusters r is known a priori. In this
paper, we propose an approach based on semi-definite
relaxations, which does not require prior knowledge of model
parameters like many existing convex relaxation methods and
recovers the number of clusters and the clustering matrix
exactly under a broad parameter regime, with probability
tending to one. On a variety of simulated and real data
experiments, we show that the proposed method often
outperforms state-of-the-art techniques for estimating the
number of clusters.},
Key = {fds344623}
}
@article{fds330801,
Author = {Cheng, X and Mishne, G and Steinerberger, S},
Title = {The geometry of nodal sets and outlier detection},
Journal = {Journal of Number Theory},
Volume = {185},
Pages = {48-64},
Publisher = {Elsevier BV},
Year = {2018},
Month = {April},
url = {http://dx.doi.org/10.1016/j.jnt.2017.09.021},
Doi = {10.1016/j.jnt.2017.09.021},
Key = {fds330801}
}
@article{fds339536,
Author = {Qiu, Q and Cheng, X and Calderbank, AR and Sapiro,
G},
Title = {DCFNet: Deep Neural Network with Decomposed Convolutional
Filters.},
Journal = {ICML},
Volume = {80},
Pages = {4195-4204},
Publisher = {PMLR},
Editor = {Dy, JG and Krause, A},
Year = {2018},
Key = {fds339536}
}
@article{fds330800,
Author = {Lu, J and Lu, Y and Wang, X and Li, X and Linderman, GC and Wu, C and Cheng,
X and Mu, L and Zhang, H and Liu, J and Su, M and Zhao, H and Spatz, ES and Spertus, JA and Masoudi, FA and Krumholz, HM and Jiang,
L},
Title = {Prevalence, awareness, treatment, and control of
hypertension in China: data from 1·7 million adults in a
population-based screening study (China PEACE Million
Persons Project)},
Journal = {The Lancet},
Volume = {390},
Number = {10112},
Pages = {2549-2558},
Publisher = {Elsevier BV},
Year = {2017},
Month = {December},
url = {http://dx.doi.org/10.1016/s0140-6736(17)32478-9},
Doi = {10.1016/s0140-6736(17)32478-9},
Key = {fds330800}
}
@article{fds330802,
Author = {Pragier, G and Greenberg, I and Cheng, X and Shkolnisky,
Y},
Title = {A Graph Partitioning Approach to Simultaneous Angular
Reconstitution},
Journal = {IEEE Transactions on Computational Imaging},
Volume = {2},
Number = {3},
Pages = {323-334},
Publisher = {Institute of Electrical and Electronics Engineers
(IEEE)},
Year = {2016},
Month = {September},
url = {http://dx.doi.org/10.1109/tci.2016.2557076},
Doi = {10.1109/tci.2016.2557076},
Key = {fds330802}
}
@article{fds330803,
Author = {Zhang, T and Cheng, X and Singer, A},
Title = {Marčenko–Pastur law for Tyler’s M-estimator},
Journal = {Journal of Multivariate Analysis},
Volume = {149},
Pages = {114-123},
Publisher = {Elsevier BV},
Year = {2016},
Month = {July},
url = {http://dx.doi.org/10.1016/j.jmva.2016.03.010},
Doi = {10.1016/j.jmva.2016.03.010},
Key = {fds330803}
}
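%% Editor's sketch: Tyler's M-estimator via its classical fixed-point iteration,
%% the random matrix whose spectrum the paper's title refers to. The sample
%% sizes and the trace normalization are illustrative choices.

import numpy as np

def tyler(X, n_iter=100):
    n, p = X.shape                               # requires n > p
    S = np.eye(p)
    for _ in range(n_iter):
        q = np.einsum('ij,jk,ik->i', X, np.linalg.inv(S), X)  # x_i^T S^{-1} x_i
        S = (p / n) * (X / q[:, None]).T @ X     # Tyler fixed-point update
        S = S * (p / np.trace(S))                # fix the scale: trace(S) = p
    return S

rng = np.random.default_rng(0)
X = rng.standard_normal((2000, 50))
evals = np.linalg.eigvalsh(tyler(X))             # spectrum close to Marchenko-Pastur
print(evals.min(), evals.max())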
@article{fds330805,
Author = {Cheng, X and Shaham, U and Dror, O and Jaffe, A and Nadler, B and Chang, J and Kluger, Y},
Title = {A Deep Learning Approach to Unsupervised Ensemble
Learning},
Journal = {Proceedings of The 33rd International Conference on Machine
Learning},
Volume = {48},
Pages = {30-39},
Publisher = {PMLR},
Year = {2016},
Month = {June},
Key = {fds330805}
}
@article{fds330804,
Author = {Cheng, X and Chen, X and Mallat, S},
Title = {Deep Haar scattering networks},
Journal = {Information and Inference},
Volume = {5},
Number = {2},
Pages = {105-133},
Publisher = {Oxford University Press (OUP)},
Year = {2016},
Month = {June},
url = {http://dx.doi.org/10.1093/imaiai/iaw007},
Doi = {10.1093/imaiai/iaw007},
Key = {fds330804}
}
@article{fds330806,
Author = {Boumal, N and Cheng, X},
Title = {Concentration of the Kirchhoff index for
Erdős–Rényi graphs},
Journal = {Systems & Control Letters},
Volume = {74},
Pages = {74-80},
Publisher = {Elsevier BV},
Year = {2014},
Month = {December},
url = {http://dx.doi.org/10.1016/j.sysconle.2014.10.006},
Doi = {10.1016/j.sysconle.2014.10.006},
Key = {fds330806}
}
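%% Editor's sketch: the Kirchhoff index (sum of pairwise effective resistances)
%% of a G(n, p) graph, computed from the identity Kf(G) = n * sum of reciprocal
%% nonzero Laplacian eigenvalues; repeated draws illustrate the concentration
%% the paper quantifies. Parameter values are toy choices.

import numpy as np

def kirchhoff_index(A):
    n = A.shape[0]
    L = np.diag(A.sum(axis=1)) - A
    vals = np.linalg.eigvalsh(L)
    return n * (1.0 / vals[1:]).sum()            # drop the zero eigenvalue (connected G)

rng = np.random.default_rng(0)
n, p = 400, 0.1
samples = []
for _ in range(5):
    A = np.triu((rng.random((n, n)) < p).astype(float), 1)
    A = A + A.T                                  # symmetric ER adjacency, no self-loops
    samples.append(kirchhoff_index(A))
print(samples)                                   # values cluster tightly around the mean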
@article{fds330807,
Author = {Chen, X and Cheng, X and Mallat, S},
Title = {Unsupervised Deep Haar Scattering on Graphs.},
Journal = {Advances in Neural Information Processing Systems
27},
Pages = {1709-1717},
Editor = {Ghahramani, Z and Welling, M and Cortes, C and Lawrence, ND and Weinberger, KQ},
Year = {2014},
Key = {fds330807}
}
@article{fds330808,
Author = {Cheng, X and Singer, A},
Title = {The Spectrum of Random Inner-product Kernel
Matrices},
Journal = {Random Matrices: Theory and Applications},
Volume = {02},
Number = {04},
Pages = {1350010},
Publisher = {World Scientific Pub Co Pte Lt},
Year = {2013},
Month = {October},
url = {http://dx.doi.org/10.1142/S201032631350010X},
Doi = {10.1142/S201032631350010X},
Key = {fds330808}
}
@article{fds330809,
Author = {E, W and Zhou, X and Cheng, X},
Title = {Subcritical bifurcation in spatially extended
systems},
Journal = {Nonlinearity},
Volume = {25},
Number = {3},
Pages = {761-779},
Publisher = {IOP Publishing},
Year = {2012},
Month = {March},
url = {http://dx.doi.org/10.1088/0951-7715/25/3/761},
Doi = {10.1088/0951-7715/25/3/761},
Key = {fds330809}
}
@article{fds330810,
Author = {Cheng, X and Lin, L and E, W and Zhang, P and Shi, A-C},
Title = {Nucleation of Ordered Phases in Block Copolymers},
Journal = {Physical Review Letters},
Volume = {104},
Number = {14},
Publisher = {American Physical Society (APS)},
Year = {2010},
Month = {April},
url = {http://dx.doi.org/10.1103/physrevlett.104.148301},
Doi = {10.1103/physrevlett.104.148301},
Key = {fds330810}
}
@article{fds330811,
Author = {Lin, L and Cheng, X and E, W and Shi, A-C and Zhang,
P},
Title = {A numerical method for the study of nucleation of ordered
phases},
Journal = {Journal of Computational Physics},
Volume = {229},
Number = {5},
Pages = {1797-1809},
Publisher = {Elsevier BV},
Year = {2010},
Month = {March},
url = {http://dx.doi.org/10.1016/j.jcp.2009.11.009},
Doi = {10.1016/j.jcp.2009.11.009},
Key = {fds330811}
}
|