%% Books
@book{fds338546,
Author = {Gelman, A and Carlin, JB and Stern, HS and Dunson, DB and Vehtari, A and Rubin, DB},
Title = {Bayesian data analysis, third edition},
Pages = {1-646},
Year = {2013},
Month = {January},
ISBN = {9781439840955},
Key = {fds338546}
}
%% Papers Published
@article{fds258005,
Author = {Dunson, WA and Paradise, CJ and Dunson, DB},
Title = {Inhibitory effect of low salinity on growth and reproduction
of the estuarine sheepshead minnow, Cyprinodon
variegatus},
Journal = {Copeia},
Volume = {1998},
Number = {1},
Pages = {235-239},
Publisher = {JSTOR},
Year = {1998},
Month = {February},
url = {http://dx.doi.org/10.2307/1447727},
Doi = {10.2307/1447727},
Key = {fds258005}
}
@article{fds258006,
Author = {Dunson, DB},
Title = {Dose-dependent number of implants and implications in
developmental toxicity.},
Journal = {Biometrics},
Volume = {54},
Number = {2},
Pages = {558-569},
Year = {1998},
Month = {June},
ISSN = {0006-341X},
url = {http://dx.doi.org/10.2307/3109763},
Abstract = {This paper proposes a method for assessing risk in
developmental toxicity studies with exposure prior to
implantation. The method accounts for a dose-dependent trend in the
number of implantation sites per dam, which is a common
problem in studies with exposure prior to implantation.
Toxins may have the effect of interfering with the early
reproductive process, which can prevent implantation in the
uterine wall. An imputation procedure is presented for
estimating the number of potential fetuses by sampling from
the empirical distribution of the number of implants per
litter in the control group. The marginal death outcomes and
the joint malformation and survival outcomes for each
potential fetus can be estimated using multiple imputation
or the chained data augmentation algorithm. Logit models can
then be fit and used to estimate the effect of dose on
reducing the probability of a normal birth. These models
accommodate multiple covariate effects and can be applied to
low-dose extrapolation. A simulation study is done to
evaluate the properties of model-based estimators of the
mean response and the virtually safe dose level (VSD). It
was found that both estimates were good approximations of
the underlying dose effect. A dominant lethal assay data set
(Luning et al., 1966, Mutation Research 3, 444-451) is
analyzed, and the results are compared with those of Rai and
Van Ryzin.},
Doi = {10.2307/3109763},
Key = {fds258006}
}
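%% Illustrative code: the abstract above describes imputing the number of
%% potential fetuses by sampling from the empirical distribution of implants
%% per litter in the control group. A minimal sketch of that resampling idea
%% (hypothetical counts; not the paper's exact algorithm):
import numpy as np

rng = np.random.default_rng(0)
control_implants = np.array([12, 14, 13, 15, 11, 13, 14, 12])  # hypothetical control litters
exposed_observed = np.array([9, 7, 10, 8])                     # hypothetical exposed litters

def impute_potential(observed, control, rng, n_imputations=100):
    # Draw potential litter sizes from the control empirical distribution,
    # never below the observed implant count for that dam.
    draws = rng.choice(control, size=(n_imputations, observed.size), replace=True)
    return np.maximum(draws, observed)

potential = impute_potential(exposed_observed, control_implants, rng)
print((potential - exposed_observed).mean(axis=0))  # imputed pre-implantation losses per dam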
@article{fds258008,
Author = {Dunson, DB and Weinberg, CR and Perreault, SD and Chapin,
RE},
Title = {Summarizing the motion of self-propelled cells: applications
to sperm motility.},
Journal = {Biometrics},
Volume = {55},
Number = {2},
Pages = {537-543},
Year = {1999},
Month = {June},
ISSN = {0006-341X},
url = {http://dx.doi.org/10.1111/j.0006-341x.1999.00537.x},
Abstract = {Proper characterization of the motion of spermatozoa is an
important prerequisite for interpreting differences in sperm
motility that might arise from exposure to toxicants.
Patterns of sperm movement can be extremely complex. On the
basis of an exponential model that relates the discretely
approximated curvilinear velocity to the tracking rate, we
develop a statistic that indexes the predictability of the
path for individual sperm. We summarize the path of each
sperm using this and two other statistics: (1) the path
displacement velocity and (2) linearity of movement. We
apply the method to a set of rat sperm tracks representative
of both normal and abnormal motion characteristics.},
Doi = {10.1111/j.0006-341x.1999.00537.x},
Key = {fds258008}
}
@article{fds258007,
Author = {Dunson, DB and Baird, DD and Wilcox, AJ and Weinberg,
CR},
Title = {Day-specific probabilities of clinical pregnancy based on
two studies with imperfect measures of ovulation.},
Journal = {Human reproduction (Oxford, England)},
Volume = {14},
Number = {7},
Pages = {1835-1839},
Year = {1999},
Month = {July},
ISSN = {0268-1161},
url = {http://dx.doi.org/10.1093/humrep/14.7.1835},
Abstract = {Two studies have related the timing of sexual intercourse
(relative to ovulation) to day-specific fecundability. The
first was a study of Catholic couples practising natural
family planning in London in the 1950s and 1960s and the
second was of North Carolina couples attempting to become
pregnant in the early 1980s. The former identified ovulation
based on the ovulatory shift in the basal body temperature,
while the latter used urinary assays of hormones. We use a
statistical model to correct for error in identifying
ovulation and to re-estimate the length of the fertile
window and day-specific fecundabilities. We estimate the
same 6-day fertile interval in both studies after
controlling for error. After adjusting for error both data
sets showed the highest estimate of the probability of
pregnancy on the day prior to ovulation and both fell close
to zero after ovulation. Given that the fertile interval is
before ovulation, methods that anticipate ovulation by
several days (such as the assessment of cervical mucus)
would be particularly useful for couples who want to time
their intercourse either to avoid or facilitate
conception.},
Doi = {10.1093/humrep/14.7.1835},
Key = {fds258007}
}
@article{fds258009,
Author = {Dunson, WA and Dunson, DB},
Title = {Factors influencing growth and survival of the killifish,
Rivulus marmoratus, held inside enclosures in mangrove
swamps},
Journal = {Copeia},
Volume = {1999},
Number = {3},
Pages = {661-668},
Publisher = {JSTOR},
Year = {1999},
Month = {August},
url = {http://dx.doi.org/10.2307/1447598},
Abstract = {We measured growth and survival in field enclosures of
juvenile Rivulus marmoratus under a variety of biotic
(effects of body mass and intraspecific density) and abiotic
conditions (seasonal climatic changes, site-specific
hypoxia). We also tested three different enclosure types:
surface-floating buckets (0.021 m3) and tubes (0.006 m3)
positioned at the surface or on the bottom. Growth rate was
inversely correlated with wet body mass (between 6 and 42
mg) and density (1-16 fish/0.021 m3 enclosure). However,
density did not affect survival. Growth was significantly
lower in tubes placed on the bottom than at the surface.
There were considerable differences in growth and survival
among sites. This likely is due to differences in occurrence
and persistence of hypoxic events. At the Catfish Creek
location (a pool surrounded by black mangroves), the bottom
was routinely hypoxic. At a shallow bay site, hypoxia was
episodic: O2 < 2 mg/l on the bottom at dawn occurred on
nine of 48 days, with values < 1 mg/l on two of 48 days.
Maximum growth rates (3.5-4%/day) were recorded in February
to May, in comparison with lower values in December to
January. However, low growth rates also occurred in the
spring, probably caused by episodic hypoxia.},
Doi = {10.2307/1447598},
Key = {fds258009}
}
@article{fds258010,
Author = {Dunson, DB and Haseman, JK},
Title = {Modeling tumor onset and multiplicity using transition
models with latent variables.},
Journal = {Biometrics},
Volume = {55},
Number = {3},
Pages = {965-970},
Year = {1999},
Month = {September},
ISSN = {0006-341X},
url = {http://dx.doi.org/10.1111/j.0006-341x.1999.00965.x},
Abstract = {We describe a method for modeling carcinogenicity from
animal studies where the data consist of counts of the
number of tumors present over time. The research is
motivated by applications to transgenic rodent studies,
which have emerged as an alternative to chronic bioassays
for screening possible carcinogens. In transgenic mouse
studies, the endpoint of interest is frequently skin
papilloma, with weekly examinations determining how many
papillomas each animal has at a particular point in time. It
is assumed that each animal has two unobservable latent
variables at each time point. The first indicates whether or
not the tumors are in a multiplying state and the second is
the potential number of additional tumors if the tumors are
in a multiplying state. The product of these variables
follows a zero-inflated Poisson distribution, and the EM
algorithm can be used to maximize the observed-data
pseudo-likelihood, based on the latent variables. A
generalized estimating equations robust variance estimator
adjusts for dependency among outcomes within individual
animals. The method is applied to testing for a dose-related
trend in both tumor incidence and multiplicity in
carcinogenicity studies.},
Doi = {10.1111/j.0006-341x.1999.00965.x},
Key = {fds258010}
}
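%% Illustrative code: the abstract above assumes the product of a Bernoulli
%% "multiplying state" indicator and a Poisson potential-count variable
%% follows a zero-inflated Poisson. A minimal simulation sketch with
%% hypothetical parameter values:
import numpy as np
from scipy.stats import poisson

rng = np.random.default_rng(1)

def sample_zip(pi, lam, size, rng):
    multiplying = rng.random(size) < pi   # latent Bernoulli state indicator
    counts = rng.poisson(lam, size)       # latent potential tumor count
    return multiplying * counts           # observed zero-inflated count

def zip_pmf(k, pi, lam):
    pmf = pi * poisson.pmf(k, lam)
    return np.where(k == 0, (1.0 - pi) + pmf, pmf)

y = sample_zip(pi=0.3, lam=2.5, size=10_000, rng=rng)
print(y.mean(), 0.3 * 2.5)               # sample mean vs. theoretical mean pi * lam
print(zip_pmf(np.arange(3), 0.3, 2.5))   # pmf at counts 0, 1, 2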
@article{fds257885,
Author = {Dunson, DB and Weinberg, CR},
Title = {Accounting for unreported and missing intercourse in human
fertility studies},
Journal = {Statistics in Medicine},
Volume = {19},
Number = {5},
Pages = {665-679},
Year = {2000},
ISSN = {0277-6715},
url = {http://dx.doi.org/10.1002/(SICI)1097-0258(20000315)19:5<665::AID-SIM391>3.0.CO},
Abstract = {In prospective studies of human fertility that attempt to
identify days of ovulation, couples record each day whether
they had intercourse. Depending on the design of the study,
couples either (I) mark the dates of intercourse on a chart
or (II) mark 'yes' or 'no' for each day of the menstrual
cycle. If protocol I is used, intercourse dates that couples
fail to record are indistinguishable from dates of no
intercourse. Consequently, estimates of day-specific
fecundability are biased upwards. If protocol II is used,
data from menstrual cycles with missing intercourse
information must be discarded in order to fit current
fertility models. We propose methods to account for
unreported and missing intercourse under the assumption that
the missingness mechanism is independent of time conditional
on the unobservable true intercourse status. We use probit
mixture models to allow for heterogeneity among couples,
both in fecundability and in the missingness and
non-reporting mechanisms. Markov chain Monte Carlo (MCMC)
techniques are used for Bayesian estimation. The methods are
generally applicable to the analysis of aggregated Bernoulli
outcomes when there is uncertainty in whether a given trial,
out of a series of trials, was completed. We illustrate the
methods by application to two prospective fertility
studies.},
Doi = {10.1002/(SICI)1097-0258(20000315)19:5<665::AID-SIM391>3.0.CO},
Key = {fds257885}
}
@article{fds258013,
Author = {Dunson, DB},
Title = {Models for papilloma multiplicity and regression:
Applications to transgenic mouse studies},
Journal = {Journal of the Royal Statistical Society. Series C: Applied
Statistics},
Volume = {49},
Number = {1},
Pages = {19-30},
Publisher = {WILEY},
Year = {2000},
Month = {January},
url = {http://dx.doi.org/10.1111/1467-9876.00176},
Abstract = {In cancer studies that use transgenic or knockout mice, skin
tumour counts are recorded over time to measure
tumorigenicity. In these studies cancer biologists are
interested in the effect of endogenous and/or exogenous
factors on papilloma onset, multiplicity and regression. In
this paper an analysis of data from a study conducted by the
National Institute of Environmental Health Sciences on the
effect of genetic factors on skin tumorigenesis is
presented. Papilloma multiplicity and regression are
modelled by using Bernoulli, Poisson and binomial latent
variables, each of which can depend on covariates and
previous outcomes. An EM algorithm is proposed for parameter
estimation, and generalized estimating equations adjust for
extra dependence between outcomes within individual animals.
A Cox proportional hazards model is used to describe
covariate effects on the onset of tumours.},
Doi = {10.1111/1467-9876.00176},
Key = {fds258013}
}
@article{fds258014,
Author = {Dunson, DB},
Title = {Bayesian latent variable models for clustered mixed
outcomes},
Journal = {Journal of the Royal Statistical Society. Series B:
Statistical Methodology},
Volume = {62},
Number = {2},
Pages = {355-366},
Publisher = {WILEY},
Year = {2000},
Month = {January},
url = {http://dx.doi.org/10.1111/1467-9868.00236},
Abstract = {A general framework is proposed for modelling clustered
mixed outcomes. A mixture of generalized linear models is
used to describe the joint distribution of a set of
underlying variables, and an arbitrary function relates the
underlying variables to the observed outcomes. The model
accommodates multilevel data structures, general covariate
effects and distinct link functions and error distributions
for each underlying variable. Within the framework proposed,
novel models are developed for clustered multiple binary,
unordered categorical and joint discrete and continuous
outcomes. A Markov chain Monte Carlo sampling algorithm is
described for estimating the posterior distributions of the
parameters and latent variables. Because of the flexibility
of the modelling framework and estimation procedure,
extensions to ordered categorical outcomes and more complex
data structures are straightforward. The methods are
illustrated by using data from a reproductive toxicity
study.},
Doi = {10.1111/1467-9868.00236},
Key = {fds258014}
}
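%% Illustrative code: the framework above relates observed outcomes to
%% underlying variables. For a single binary outcome this reduces to the
%% familiar probit data augmentation, in which the underlying normal variable
%% is sampled from a truncated normal given the outcome. A minimal sketch
%% illustrating the mechanism only, not the paper's full multilevel model:
import numpy as np
from scipy.stats import truncnorm

rng = np.random.default_rng(2)

def sample_underlying(y, mean):
    # z ~ N(mean, 1) truncated to z > 0 when y == 1 and z <= 0 when y == 0;
    # truncnorm takes bounds standardized by (loc, scale).
    a, b = (-mean, np.inf) if y == 1 else (-np.inf, -mean)
    return truncnorm.rvs(a, b, loc=mean, scale=1.0, random_state=rng)

print(sample_underlying(1, 0.3), sample_underlying(0, 0.3))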
@article{fds257883,
Author = {Dunson, DB and Weinberg, CR},
Title = {Modeling human fertility in the presence of measurement
error.},
Journal = {Biometrics},
Volume = {56},
Number = {1},
Pages = {288-292},
Year = {2000},
Month = {March},
ISSN = {0006-341X},
url = {http://dx.doi.org/10.1111/j.0006-341x.2000.00288.x},
Abstract = {The probability of conception in a given menstrual cycle is
closely related to the timing of intercourse relative to
ovulation. Although commonly used markers of time of
ovulation are known to be error prone, most fertility models
assume the day of ovulation is measured without error. We
develop a mixture model that allows the day to be
misspecified. We assume that the measurement errors are
i.i.d. across menstrual cycles. Heterogeneity among couples
in the per cycle likelihood of conception is accounted for
using a beta mixture model. Bayesian estimation is
straightforward using Markov chain Monte Carlo techniques.
The methods are applied to a prospective study of couples at
risk of pregnancy. In the absence of validation data or
multiple independent markers of ovulation, the
identifiability of the measurement error distribution
depends on the assumed model. Thus, the results of studies
relating the timing of intercourse to the probability of
conception should be interpreted cautiously.},
Doi = {10.1111/j.0006-341x.2000.00288.x},
Key = {fds257883}
}
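%% Illustrative code: the abstract above accounts for heterogeneity among
%% couples with a beta mixture on the per-cycle probability of conception.
%% A minimal simulation sketch with hypothetical hyperparameters:
import numpy as np

rng = np.random.default_rng(3)
a, b = 2.0, 5.0                   # hypothetical Beta(a, b) hyperparameters
p = rng.beta(a, b, size=5_000)    # couple-specific fecundability
cycles = rng.geometric(p)         # cycles until first conception, given p

# The first-cycle conception rate estimates E[p] = a / (a + b); rates in
# later cycles decline because high-p couples are progressively depleted.
print(np.mean(cycles == 1), a / (a + b))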
@article{fds257884,
Author = {Weinberg, CR and Dunson, DB},
Title = {Some Issues in Assessing Human Fertility},
Journal = {Journal of the American Statistical Association},
Volume = {95},
Number = {449},
Pages = {300-303},
Booktitle = {Statistics in the 21st Century},
Publisher = {Informa UK Limited},
Year = {2000},
Month = {March},
ISBN = {9781420035391},
ISSN = {0162-1459},
url = {http://dx.doi.org/10.1080/01621459.2000.10473928},
Abstract = {One of the
pleasures of working as an applied statistician is the
awareness it brings of the wide diversity of scientific
fields to which our profession contributes critical concepts
and methods. My own awareness was enhanced by accepting the
invitation from the editors of JASA to serve as guest editor
for this section of vignettes celebrating the significant
contributions made by statisticians to the life and medical
sciences in the 20th century. The goal of the project was
not an encyclopedic catalog of all the major developments,
but rather a sampling of some of the most interesting work.
Of the 12 vignettes, 10 focus on particular areas of
application: environmetrics, wildlife populations, animal
breeding, human fertility, toxicology, medical diagnosis,
clinical trials, environmental epidemiology, statistical
genetics, and molecular biology. The two vignettes that
begin the series focus more on methods that have had, or
promise to have, impact across a range of subject matter
areas: survival analysis and causal analysis.},
Doi = {10.1080/01621459.2000.10473928},
Key = {fds257884}
}
@article{fds257882,
Author = {Dunson, DB and Haseman, JK and van Birgelen, AP and Stasiewicz, S and Tennant, RW},
Title = {Statistical analysis of skin tumor data from Tg.AC mouse
bioassays.},
Journal = {Toxicological sciences : an official journal of the Society
of Toxicology},
Volume = {55},
Number = {2},
Pages = {293-302},
Year = {2000},
Month = {June},
url = {http://dx.doi.org/10.1093/toxsci/55.2.293},
Abstract = {New strategies for identifying chemical carcinogens and
assessing risk have been proposed based on the Tg.AC
(zeta-globin promoted v-Ha-ras) transgenic mouse. Preliminary
studies suggest that the Tg.AC mouse bioassay may be an
effective means of quickly evaluating the carcinogenic
potential of a test agent. The skin of the Tg.AC mouse is
genetically initiated, and the induction of epidermal
papillomas in response to dermal or oral exposure to a
chemical agent acts as a reporter phenotype of the activity
of the test chemical. In Tg.AC mouse bioassays, the test
agent is typically applied topically for up to 26 weeks, and
the number of papillomas in the treated area is counted
weekly. Statistical analyses are complicated by
within-animal and serial dependency in the papilloma counts,
survival differences between animals, and missing data. In
this paper, we describe a statistical model for the analysis
of skin tumor data from a Tg.AC mouse bioassay. The model
separates effects on papilloma latency and multiplicity and
accommodates important features of the data, including
variability in expression of the transgene and dependency in
the tumor counts. Methods are described for carcinogenicity
testing and risk assessment. We illustrate our approach
using data from a study of the effect of
2,3,7,8-tetrachlorodibenzo-p-dioxin (TCDD) exposure on
tumorigenesis.},
Doi = {10.1093/toxsci/55.2.293},
Key = {fds257882}
}
@article{fds258015,
Author = {Dunson, DB},
Title = {Assessing overall risk in reproductive experiments.},
Journal = {Risk analysis : an official publication of the Society for
Risk Analysis},
Volume = {20},
Number = {4},
Pages = {429-437},
Year = {2000},
Month = {August},
url = {http://dx.doi.org/10.1111/0272-4332.204042},
Abstract = {Toxicologists are often interested in assessing the joint
effect of an exposure on multiple reproductive endpoints,
including early loss, fetal death, and malformation.
Exposures that occur prior to mating or extremely early in
development can adversely affect the number of implantation
sites or fetuses that form within each dam and may even
prevent pregnancy. A simple approach for assessing overall
adverse effects in such studies is to consider fetuses or
implants that fail to develop due to exposure as missing
data. The missing data can be imputed, and standard methods
for the analysis of quantal response data can then be used
for quantitative risk assessment or testing. In this
article, a new bias-corrected imputation procedure is
proposed and evaluated. The procedure is straightforward to
implement in standard statistical packages and has excellent
operating characteristics when used in combination with a
marginal model fit with generalized estimating equations.
The methods are applied to data from a reproductive toxicity
study of Nitrofurazone conducted by the National Toxicology
Program.},
Doi = {10.1111/0272-4332.204042},
Key = {fds258015}
}
@article{fds257887,
Author = {Wilcox, AJ and Dunson, D and Baird, DD},
Title = {The timing of the "fertile window" in the menstrual cycle:
day specific estimates from a prospective
study.},
Journal = {BMJ (Clinical research ed.)},
Volume = {321},
Number = {7271},
Pages = {1259-1262},
Year = {2000},
Month = {November},
ISSN = {0959-8146},
url = {http://dx.doi.org/10.1136/bmj.321.7271.1259},
Abstract = {Objectives: To provide specific estimates of the
likely occurrence of the six fertile days (the "fertile
window") during the menstrual cycle. Design: Prospective
cohort study. Participants: 221 healthy women who were
planning a pregnancy. Main outcome measures: The timing of
ovulation in 696 menstrual cycles, estimated using urinary
metabolites of oestrogen and progesterone. Results: The
fertile window occurred during a broad range of days in the
menstrual cycle. On every day between days 6 and 21, women
had at minimum a 10% probability of being in their fertile
window. Women cannot predict a sporadic late ovulation; 4-6%
of women whose cycles had not yet resumed were potentially
fertile in the fifth week of their cycle. Conclusions: In
only about 30% of women is the fertile window entirely
within the days of the menstrual cycle identified by
clinical guidelines-that is, between days 10 and 17. Most
women reach their fertile window earlier and others much
later. Women should be advised that the timing of their
fertile window can be highly unpredictable, even if their
cycles are usually regular.},
Doi = {10.1136/bmj.321.7271.1259},
Key = {fds257887}
}
@article{fds258012,
Author = {Dunson, DB and Tindall, KR},
Title = {Bayesian analysis of mutational spectra.},
Journal = {Genetics},
Volume = {156},
Number = {3},
Pages = {1411-1418},
Year = {2000},
Month = {November},
url = {http://dx.doi.org/10.1093/genetics/156.3.1411},
Abstract = {Studies that examine both the frequency of gene mutation and
the pattern or spectrum of mutational changes can be used to
identify chemical mutagens and to explore the molecular
mechanisms of mutagenesis. In this article, we propose a
Bayesian hierarchical modeling approach for the analysis of
mutational spectra. We assume that the total number of
independent mutations and the numbers of mutations falling
into different response categories, defined by location
within a gene and/or type of alteration, follow binomial and
multinomial sampling distributions, respectively. We use
prior distributions to summarize past information about the
overall mutation frequency and the probabilities
corresponding to the different mutational categories. These
priors can be chosen on the basis of data from previous
studies using an approach that accounts for heterogeneity
among studies. Inferences about the overall mutation
frequency, the proportions of mutations in each response
category, and the category-specific mutation frequencies can
be based on posterior distributions, which incorporate past
and current data on the mutant frequency and on DNA sequence
alterations. Methods are described for comparing groups and
for assessing dose-related trends. We illustrate our
approach using data from the literature.},
Doi = {10.1093/genetics/156.3.1411},
Key = {fds258012}
}
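%% Illustrative code: the abstract above places prior distributions on the
%% probabilities of the mutational categories; with a Dirichlet prior on
%% multinomial category probabilities the posterior update is conjugate.
%% A minimal sketch with hypothetical counts and prior:
import numpy as np

rng = np.random.default_rng(4)
alpha = np.array([1.0, 1.0, 1.0, 1.0])   # hypothetical prior pseudo-counts
counts = np.array([12, 30, 5, 3])        # hypothetical mutations per category

posterior = alpha + counts               # posterior is Dirichlet(alpha + counts)
print(posterior / posterior.sum())       # posterior mean category probabilities

theta = rng.dirichlet(posterior, size=10_000)
print(np.mean(theta[:, 1] > theta[:, 0]))  # P(category 2 prob > category 1 prob)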
@article{fds257886,
Author = {Dunson, DB and Zhou, H},
Title = {A Bayesian Model for Fecundability and Sterility},
Journal = {Journal of the American Statistical Association},
Volume = {95},
Number = {452},
Pages = {1054-1062},
Publisher = {Informa UK Limited},
Year = {2000},
Month = {December},
ISSN = {0162-1459},
url = {http://dx.doi.org/10.1080/01621459.2000.10474302},
Abstract = {There is increasing evidence that exposure to environmental
toxins during key stages of development can disrupt the
human reproductive system. Such effects have proven
difficult to study due to the many behavioral and biological
factors involved in human reproduction. We analyze data from
a North Carolina fertility study to assess the effect of
prenatal, childhood, and current cigarette smoking exposure
on fecundability and sterility. We use a mixture model that
adjusts for timing and frequency of intercourse and allows
both fecundability and sterility to depend on multiple
covariates. We account for dependency among menstrual cycles
within individual couples using a mixture density for a
latent cycle viability variable. The mixture consists of a
normal distribution describing heterogeneity among fecund
couples with a point mass at 0 for sterile couples. The
resulting distribution is more biologically plausible than
the standard beta density. A Markov chain Monte Carlo scheme
is used for Bayesian estimation of the model. There is some
evidence that spontaneous intrauterine mortality results in
decreased fecundability in subsequent cycles. Both current
cigarette smoking and prenatal exposure of the woman to her
mother's cigarette smoking are shown to be associated with a
decrease in the probability of menstrual cycle viability.},
Doi = {10.1080/01621459.2000.10474302},
Key = {fds257886}
}
@article{fds258011,
Author = {Dunson, DB and Dinse, GE},
Title = {Distinguishing effects on tumor multiplicity and growth rate
in chemoprevention experiments.},
Journal = {Biometrics},
Volume = {56},
Number = {4},
Pages = {1068-1075},
Year = {2000},
Month = {December},
url = {http://dx.doi.org/10.1111/j.0006-341x.2000.01068.x},
Abstract = {In some types of cancer chemoprevention experiments and
short-term carcinogenicity bioassays, the data consist of
the number of observed tumors per animal and the times at
which these tumors were first detected. In such studies,
there is interest in distinguishing between treatment
effects on the number of tumors induced by a known
carcinogen and treatment effects on the tumor growth rate.
Since animals may die before all induced tumors reach a
detectable size, separation of these effects can be
difficult. This paper describes a flexible parametric model
for data of this type. Under our model, the tumor detection
times are realizations of a delayed Poisson process that is
characterized by the age-specific tumor induction rate and a
random latency interval between tumor induction and
detection. The model accommodates distinct treatment and
animal-specific effects on the number of induced tumors
(multiplicity) and the time to tumor detection (growth
rate). A Gibbs sampler is developed for estimation of the
posterior distributions of the parameters. The methods are
illustrated through application to data from a breast cancer
chemoprevention experiment.},
Doi = {10.1111/j.0006-341x.2000.01068.x},
Key = {fds258011}
}
@article{fds257892,
Author = {Dunson, DB and Dinse, GE},
Title = {Bayesian incidence analysis of animal tumorigenicity
data},
Journal = {Journal of the Royal Statistical Society. Series C: Applied
Statistics},
Volume = {50},
Number = {2},
Pages = {125-141},
Publisher = {WILEY},
Year = {2001},
Month = {January},
url = {http://dx.doi.org/10.1111/1467-9876.00224},
Abstract = {Statistical inference about tumorigenesis should focus on
the tumour incidence rate. Unfortunately, in most animal
carcinogenicity experiments, tumours are not observable in
live animals and censoring of the tumour onset times is
informative. In this paper, we propose a Bayesian method for
analysing data from such studies. Our approach focuses on
the incidence of tumours and accommodates occult tumours and
censored onset times without restricting tumour lethality,
relying on cause-of-death data, or requiring interim
sacrifices. We represent the underlying state of nature by a
multistate stochastic process and assume general probit
models for the time-specific transition rates. These models
allow the incorporation of covariates, historical control
data and subjective prior information. The inherent
flexibility of this approach facilitates the interpretation
of results, particularly when the sample size is small or
the data are sparse. We use a Gibbs sampler to estimate the
relevant posterior distributions. The methods proposed are
applied to data from a US National Toxicology Program
carcinogenicity study.},
Doi = {10.1111/1467-9876.00224},
Key = {fds257892}
}
@article{fds257890,
Author = {Dunson, DB and Perreault, SD},
Title = {Factor analytic models of clustered multivariate data with
informative censoring.},
Journal = {Biometrics},
Volume = {57},
Number = {1},
Pages = {302-308},
Year = {2001},
Month = {March},
ISSN = {0006-341X},
url = {http://dx.doi.org/10.1111/j.0006-341x.2001.00302.x},
Abstract = {This article describes a general class of factor analytic
models for the analysis of clustered multivariate data in
the presence of informative missingness. We assume that
there are distinct sets of cluster-level latent variables
related to the primary outcomes and to the censoring
process, and we account for dependency between these latent
variables through a hierarchical model. A linear model is
used to relate covariates and latent variables to the
primary outcomes for each subunit. A generalized linear
model accounts for covariate and latent variable effects on
the probability of censoring for subunits within each
cluster. The model accounts for correlation within clusters
and within subunits through a flexible factor analytic
framework that allows multiple latent variables and
covariate effects on the latent variables. The structure of
the model facilitates implementation of Markov chain Monte
Carlo methods for posterior estimation. Data from a
spermatotoxicity study are analyzed to illustrate the
proposed approach.},
Doi = {10.1111/j.0006-341x.2001.00302.x},
Key = {fds257890}
}
@article{fds257891,
Author = {Dunson, DB},
Title = {Modeling of changes in tumor burden},
Journal = {Journal of Agricultural, Biological, and Environmental
Statistics},
Volume = {6},
Number = {1},
Pages = {38-48},
Publisher = {Springer Nature},
Year = {2001},
Month = {March},
url = {http://dx.doi.org/10.1198/108571101300325238},
Abstract = {Skin painting studies on transgenic mice have recently been
approved by the Food and Drug Administration (FDA) for
carcinogenicity testing. Data consist of serial skin tumor
counts on the backs of shaved mice in each of several dose
groups. Current methods for assessing the tumorigenicity of
test compounds are based on generalized estimating equations
and require large samples. This paper proposes a new
framework for modeling of the change over time in the
papilloma burden in each mouse. A latent variable underlying
the observed papilloma response is assumed to follow a
generalized linear mixed-effects transition model. The model
accounts for heterogeneity among animals and serial
dependency in the skin tumor counts. Extensions of existing
Markov chain Monte Carlo procedures for Bayesian estimation
in generalized linear mixed models are proposed. The methods
are applied to data from a National Toxicology Program
short-term carcinogenicity study of lauric
acid.},
Doi = {10.1198/108571101300325238},
Key = {fds257891}
}
@article{fds257894,
Author = {Dunson, DB and Weinberg, CR and Baird, DD and Kesner, JS and Wilcox,
AJ},
Title = {Assessing human fertility using several markers of
ovulation.},
Journal = {Statistics in medicine},
Volume = {20},
Number = {6},
Pages = {965-978},
Year = {2001},
Month = {March},
ISSN = {0277-6715},
url = {http://dx.doi.org/10.1002/sim.716},
Abstract = {In modelling human fertility one ideally accounts for timing
of intercourse relative to ovulation. Measurement error in
identifying the day of ovulation can bias estimates of
fecundability parameters and attenuate estimates of
covariate effects. In the absence of a single perfect marker
of ovulation, several error prone markers are sometimes
obtained. In this paper we propose a semi-parametric mixture
model that uses multiple independent markers of ovulation to
account for measurement error. The model assigns each method
of assessing ovulation a distinct non-parametric error
distribution, and corrects bias in estimates of day-specific
fecundability. We use a Monte Carlo EM algorithm for joint
estimation of (i) the error distribution for the markers,
(ii) the error-corrected fertility parameters, and (iii) the
couple-specific random effects. We apply the methods to data
from a North Carolina fertility study to assess the
magnitude of error in measures of ovulation based on urinary
luteinizing hormone and metabolites of ovarian hormones, and
estimate the corrected day-specific probabilities of
clinical pregnancy.},
Doi = {10.1002/sim.716},
Key = {fds257894}
}
@article{fds257893,
Author = {Wilcox, AJ and Dunson, DB and Weinberg, CR and Trussell, J and Baird,
DD},
Title = {Likelihood of conception with a single act of intercourse:
providing benchmark rates for assessment of post-coital
contraceptives.},
Journal = {Contraception},
Volume = {63},
Number = {4},
Pages = {211-215},
Year = {2001},
Month = {April},
ISSN = {0010-7824},
url = {http://www.ncbi.nlm.nih.gov/pubmed/11376648},
Abstract = {Emergency post-coital contraceptives effectively reduce the
risk of pregnancy, but their degree of efficacy remains
uncertain. Measurement of efficacy depends on the pregnancy
rate without treatment, which cannot be measured directly.
We provide indirect estimates of such pregnancy rates, using
data from a prospective study of 221 women who were
attempting to conceive. We previously estimated the
probability of pregnancy with an act of intercourse relative
to ovulation. In this article, we extend these data to
estimate the probability of pregnancy relative to
intercourse on a given cycle day (counting from onset of
previous menses). In assessing the efficacy of post-coital
contraceptives, other approaches have not incorporated
accurate information on the variability of ovulation. We
find that the possibility of late ovulation produces a
persistent risk of pregnancy even into the sixth week of the
cycle. Post-coital contraceptives may be indicated even when
intercourse has occurred late in the cycle.},
Doi = {10.1016/s0010-7824(01)00191-3},
Key = {fds257893}
}
@article{fds257895,
Author = {Nyska, A and Lomnitski, L and Spalding, J and Dunson, DB and Goldsworthy, TL and Ben-Shaul, V and Grossman, S and Bergman, M and Boorman, G},
Title = {Topical and oral administration of the natural water-soluble
antioxidant from spinach reduces the multiplicity of
papillomas in the Tg.AC mouse model.},
Journal = {Toxicology letters},
Volume = {122},
Number = {1},
Pages = {33-44},
Year = {2001},
Month = {May},
ISSN = {0378-4274},
url = {http://dx.doi.org/10.1016/s0378-4274(01)00345-9},
Abstract = {The Tg.AC mouse carrying the v-Ha-ras structural gene is a
useful model for the study of chemical carcinogens,
especially those acting via non-genotoxic mechanisms. This
study evaluated the efficacy of the non-toxic, water-soluble
antioxidant from spinach, natural antioxidant (NAO), in
reducing skin papilloma induction in female hemizygous Tg.AC
mice treated dermally five times over 2.5 weeks with 2.5
microg 12-O-tetradecanoylphorbol-13-acetate (TPA). The
TPA-only group was considered as a control; the other two
groups received, additionally, NAO topically (2 mg) or
orally (100 mg/kg), 5 days/week for 5 weeks. Papilloma
counts made macroscopically during the clinical observations
showed a significant decrease in multiplicity (P<0.01) in
the NAO topically treated group. According to histological
criteria, papilloma multiplicity was lower in both
topical-NAO and oral-NAO groups, but significantly so only
in the oral-NAO mice (P<0.01). The beneficial effect of NAO
in the Tg.AC mouse is reported.},
Doi = {10.1016/s0378-4274(01)00345-9},
Key = {fds257895}
}
@article{fds257896,
Author = {Dunson, DB and Baird, DD},
Title = {A flexible parametric model for combining current status and
age at first diagnosis data.},
Journal = {Biometrics},
Volume = {57},
Number = {2},
Pages = {396-403},
Year = {2001},
Month = {June},
url = {http://dx.doi.org/10.1111/j.0006-341x.2001.00396.x},
Abstract = {In some cross-sectional studies of chronic disease, data
consist of the age at examination, whether the disease was
present at the exam, and recall of the age at first
diagnosis. This article describes a flexible parametric
approach for combining current status and age at first
diagnosis data. We assume that the log odds of onset by a
given age and of detection by a given age conditional on
onset by that age are nondecreasing functions of time plus
linear combinations of covariates. Piecewise linear models
are used to characterize changes across time in the baseline
odds. Methods are described for accommodating informatively
missing current status data and inferences based on the
age-specific incidence of disease prior to a landmark event
(e.g., puberty, menopause). Our formulation enables
straightforward maximum likelihood estimation without
requiring restrictive parametric or Markov assumptions. The
methods are applied to data from a study of uterine
fibroids.},
Doi = {10.1111/j.0006-341x.2001.00396.x},
Key = {fds257896}
}
@article{fds257897,
Author = {Dunson, DB},
Title = {Commentary: practical advantages of Bayesian analysis of
epidemiologic data.},
Journal = {American journal of epidemiology},
Volume = {153},
Number = {12},
Pages = {1222-1226},
Year = {2001},
Month = {June},
url = {http://dx.doi.org/10.1093/aje/153.12.1222},
Abstract = {In the past decade, there have been enormous advances in the
use of Bayesian methodology for analysis of epidemiologic
data, and there are now many practical advantages to the
Bayesian approach. Bayesian models can easily accommodate
unobserved variables such as an individual's true disease
status in the presence of diagnostic error. The use of prior
probability distributions represents a powerful mechanism
for incorporating information from previous studies and for
controlling confounding. Posterior probabilities can be used
as easily interpretable alternatives to p values. Recent
developments in Markov chain Monte Carlo methodology
facilitate the implementation of Bayesian analyses of
complex data sets containing missing observations and
multidimensional outcomes. Tools are now available that
allow epidemiologists to take advantage of this powerful
approach to assessment of exposure-disease
relations.},
Doi = {10.1093/aje/153.12.1222},
Key = {fds257897}
}
@article{fds257898,
Author = {Robbins, WA and Witt, KL and Haseman, JK and Dunson, DB and Troiani, L and Cohen, MS and Hamilton, CD and Perreault, SD and Libbus, B and Beyler,
SA and Raburn, DJ and Tedder, ST and Shelby, MD and Bishop,
JB},
Title = {Antiretroviral therapy effects on genetic and morphologic
end points in lymphocytes and sperm of men with human
immunodeficiency virus infection.},
Journal = {J Infect Dis},
Volume = {184},
Number = {2},
Pages = {127-135},
Year = {2001},
Month = {July},
ISSN = {0022-1899},
url = {http://www.ncbi.nlm.nih.gov/pubmed/11424008},
Abstract = {Many human immunodeficiency virus (HIV)-infected persons
receive prolonged treatment with DNA-reactive antiretroviral
drugs. A prospective study was conducted of 26 HIV-infected
men who provided samples before treatment and at multiple
times after beginning treatment, to investigate effects of
antiretrovirals on lymphocyte and sperm chromosomes and
semen quality. Several antiretroviral regimens, all
including a nucleoside component, were used. Lymphocyte
metaphase analysis and sperm fluorescence in situ
hybridization were used for cytogenetic studies. Semen
analyses included conventional parameters (volume,
concentration, viability, motility, and morphology). No
significant effects on cytogenetic parameters, semen volume,
or sperm concentration were detected. However, there were
significant improvements in sperm motility for men with
study entry CD4 cell counts >200 cells/mm(3), sperm
morphology for men with entry CD4 cell counts < or =200
cells/mm(3), and the percentage of viable sperm in both
groups. These findings suggest that nucleoside-containing
antiretrovirals administered via recommended protocols do
not induce chromosomal changes in lymphocytes or sperm but
may produce improvements in semen quality.},
Doi = {10.1086/322002},
Key = {fds257898}
}
@article{fds257899,
Author = {Nyska, A and Lomnitski, L and Spalding, J and Dunson, DB and Goldsworthy, TL and Ben-Shaul, V and Grossman, S and Bergman, M and Boorman, G},
Title = {Erratum: Topical and oral administration of the natural
water-soluble antioxidant from spinach reduces the
multiplicity of papillomas in the Tg.AC mouse model
(Toxicology Letters (2001) 122 (33-44) PII:
S0378427401003459)},
Journal = {Toxicology Letters},
Volume = {123},
Number = {2-3},
Pages = {237},
Publisher = {Elsevier BV},
Year = {2001},
Month = {September},
ISSN = {0378-4274},
url = {http://dx.doi.org/10.1016/S0378-4274(01)00417-9},
Doi = {10.1016/S0378-4274(01)00417-9},
Key = {fds257899}
}
@article{fds257901,
Author = {Wilcox, AJ and Baird, DD and Dunson, D and McChesney, R and Weinberg,
CR},
Title = {Natural limits of pregnancy testing in relation to the
expected menstrual period.},
Journal = {JAMA},
Volume = {286},
Number = {14},
Pages = {1759-1761},
Year = {2001},
Month = {October},
ISSN = {0098-7484},
url = {http://dx.doi.org/10.1001/jama.286.14.1759},
Abstract = {Context: Pregnancy test kits routinely recommend
testing "as early as the first day of the missed period."
However, a pregnancy cannot be detected before the
blastocyst implants. Due to natural variability in the
timing of ovulation, implantation does not necessarily occur
before the expected onset of next menses. Objective: To
estimate the maximum screening sensitivity of pregnancy
tests when used on the first day of the expected period,
taking into account the natural variability of ovulation and
implantation. Design and setting: Community-based
prospective cohort study conducted in North Carolina between
1982 and 1986. Participants: Two hundred twenty-one
healthy women 21 to 42 years of age who were planning to
conceive. Main outcome measures: Day of implantation,
defined by the serial assay of first morning urine samples
using an extremely sensitive immunoradiometric assay for
human chorionic gonadotropin (hCG), relative to the first
day of the missed period, defined as the day on which women
expected their next menses to begin, based on self-reported
usual cycle length. Results: Data were available for
136 clinical pregnancies conceived during the study, 14
(10%) of which had not yet implanted by the first day of the
missed period. The highest possible screening sensitivity
for an hCG-based pregnancy test therefore is estimated to be
90% (95% confidence interval [CI], 84%-94%) on the first day
of the missed period. By 1 week after the first day of the
missed period, the highest possible screening sensitivity is
estimated to be 97% (95% CI, 94%-99%). Conclusions: In
this study, using an extremely sensitive assay for hCG, 10%
of clinical pregnancies were undetectable on the first day
of missed menses. In practice, an even larger percentage of
clinical pregnancies may be undetected by current test kits
on this day, given their reported assay properties and other
practical limitations.},
Doi = {10.1001/jama.286.14.1759},
Key = {fds257901}
}
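%% Illustrative code: the abstract above reports that 14 of 136 clinical
%% pregnancies had not implanted by the first day of the missed period, so
%% at most 122/136 could test positive. An exact (Clopper-Pearson) binomial
%% interval approximately reproduces the reported 90% (84%-94%):
from scipy.stats import beta

k, n = 122, 136                              # detectable pregnancies / total
lo = beta.ppf(0.025, k, n - k + 1)           # exact lower bound
hi = beta.ppf(0.975, k + 1, n - k)           # exact upper bound
print(f"{k / n:.2f} ({lo:.2f}-{hi:.2f})")    # ~0.90 (0.84-0.94)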
@article{fds257888,
Author = {Dunson, DB and Sinai, I and Colombo, B},
Title = {The relationship between cervical secretions and the daily
probabilities of pregnancy: effectiveness of the TwoDay
Algorithm.},
Journal = {Human reproduction (Oxford, England)},
Volume = {16},
Number = {11},
Pages = {2278-2282},
Year = {2001},
Month = {November},
ISSN = {0268-1161},
url = {http://dx.doi.org/10.1093/humrep/16.11.2278},
Abstract = {Background: The TwoDay Algorithm is a simple method
for identifying the fertile window. It classifies a day as
fertile if cervical secretions are present on that day or
were present on the day before. This approach may be an
effective alternative to the ovulation and symptothermal
methods for populations and programmes that find current
natural family planning methods difficult to implement.
Methods: We used data on secretions from a large
multinational European fecundability study to assess the
relationship between the days predicted to be potentially
fertile by the TwoDay Algorithm and the day-specific
probabilities of pregnancy based on intercourse patterns in
434 conception cycles from the study. Results: The days
around ovulation that had the highest fecundability were the
days most likely to be classified as fertile by the TwoDay
Algorithm. In addition, intercourse on a particular day in
the fertile interval was twice as likely to result in a
pregnancy if cervical secretions were present on that day or
the day before. Conclusions: The TwoDay Algorithm is
effective, both in identifying the fertile days of the cycle
and in predicting days within the fertile interval that have
a high pregnancy rate. Our data provide the first direct
evidence that cervical secretions are associated with higher
fecundability within the fertile window.},
Doi = {10.1093/humrep/16.11.2278},
Key = {fds257888}
}
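%% Illustrative code: the TwoDay Algorithm as described in the abstract
%% above classifies a day as fertile if cervical secretions are present on
%% that day or were present on the day before. A direct sketch with a
%% hypothetical secretion record:
def two_day_fertile(secretions):
    # secretions: list of booleans, one per cycle day (True = present)
    return [today or (i > 0 and secretions[i - 1])
            for i, today in enumerate(secretions)]

obs = [False, False, True, True, False, False, True, False]
print(two_day_fertile(obs))
# -> [False, False, True, True, True, False, True, True]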
@article{fds257889,
Author = {Dunson, DB},
Title = {Bayesian modeling of the level and duration of fertility in
the menstrual cycle.},
Journal = {Biometrics},
Volume = {57},
Number = {4},
Pages = {1067-1073},
Year = {2001},
Month = {December},
url = {http://dx.doi.org/10.1111/j.0006-341x.2001.01067.x},
Abstract = {Time to pregnancy studies that identify ovulation days and
collect daily intercourse data can be used to estimate the
day-specific probabilities of conception given intercourse
on a single day relative to ovulation. In this article, a
Bayesian semiparametric model is described for flexibly
characterizing covariate effects and heterogeneity among
couples in daily fecundability. The proposed model is
characterized by the timing of the most fertile day of the
cycle relative to ovulation, by the probability of
conception due to intercourse on the most fertile day, and
by the ratios of the daily conception probabilities for
other days of the cycle relative to this peak probability.
The ratios are assumed to be increasing in time to the peak
and decreasing thereafter. Generalized linear mixed models
are used to incorporate covariate and couple-specific
effects on the peak probability and on the day-specific
ratios. A Markov chain Monte Carlo algorithm is described
for posterior estimation, and the methods are illustrated
through application to caffeine data from a North Carolina
pregnancy study.},
Doi = {10.1111/j.0006-341x.2001.01067.x},
Key = {fds257889}
}
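%% Illustrative code: the model above characterizes daily fecundability by a
%% peak-day conception probability and day-specific ratios that increase up
%% to the peak and decrease thereafter. A minimal sketch with hypothetical
%% values (the peak day, window, and probabilities are not from the paper):
import numpy as np

days = np.arange(-5, 2)      # days relative to ovulation
peak_day, p_peak = -1, 0.35  # hypothetical most fertile day and peak probability

# Ratios constrained to rise toward the peak and fall after it.
ratios = np.where(days <= peak_day,
                  0.5 ** (peak_day - days),
                  0.5 ** (days - peak_day))
print(dict(zip(days.tolist(), (p_peak * ratios).round(3))))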
@article{fds257905,
Author = {Dollé, MET and Snyder, WK and Dunson, DB and Vijg,
J},
Title = {Mutational fingerprints of aging.},
Journal = {Nucleic acids research},
Volume = {30},
Number = {2},
Pages = {545-549},
Year = {2002},
Month = {January},
ISSN = {0305-1048},
url = {http://dx.doi.org/10.1093/nar/30.2.545},
Abstract = {Using a lacZ plasmid transgenic mouse model, spectra of
spontaneous point mutations were determined in brain, heart,
liver, spleen and small intestine in young and old mice.
While similar at a young age, the mutation spectra among
these organs were significantly different in old age. In
brain and heart G:C-->A:T transitions at CpG sites were the
predominant mutation, suggesting that oxidative damage is
not a major mutagenic event in these tissues. Other base
changes, especially those affecting A:T base pairs,
positively correlated with increasing proliferative activity
of the different tissues. A relatively high percentage of
base changes at A:T base pairs and compound mutants were
found in both spleen and spontaneous lymphoma, suggesting a
possible role of the hypermutation process in splenocytes in
carcinogenesis. The similar mutant spectra observed at a
young age may reflect a common mutation mechanism for all
tissues that could be driven by the rapid cell division that
takes place during development. However, the spectra of the
young tissues did not resemble that of the most
proliferative aged tissue, implying that replicative history
per se is not the underlying causal factor of age-related
organ-specific differences in mutation spectra. Rather,
differences in organ function, possibly in association with
replicative history, may explain the divergence in mutation
spectra during aging.},
Doi = {10.1093/nar/30.2.545},
Key = {fds257905}
}
@article{fds257902,
Author = {Dunson, DB and Dinse, GE},
Title = {Bayesian models for multivariate current status data with
informative censoring.},
Journal = {Biometrics},
Volume = {58},
Number = {1},
Pages = {79-88},
Year = {2002},
Month = {March},
ISSN = {0006-341X},
url = {http://dx.doi.org/10.1111/j.0006-341x.2002.00079.x},
Abstract = {Multivariate current status data consist of indicators
of whether each of several events occurs by the time of a single
examination. Our interest focuses on inferences about the
joint distribution of the event times. Conventional methods
for analysis of multiple event-time data cannot be used
because all of the event times are censored and censoring
may be informative. Within a given subject, we account for
correlated event times through a subject-specific latent
variable, conditional upon which the various events are
assumed to occur independently. We also assume that each
event contributes independently to the hazard of censoring.
Nonparametric step functions are used to characterize the
baseline distributions of the different event times and of
the examination times. Covariate and subject-specific
effects are incorporated through generalized linear models.
A Markov chain Monte Carlo algorithm is described for
estimation of the posterior distributions of the unknowns.
The methods are illustrated through application to multiple
tumor site data from an animal carcinogenicity
study.},
Doi = {10.1111/j.0006-341x.2002.00079.x},
Key = {fds257902}
}
@article{fds257904,
Author = {Dunson, DB and Baird, DD},
Title = {A proportional hazards model for incidence and induced
remission of disease.},
Journal = {Biometrics},
Volume = {58},
Number = {1},
Pages = {71-78},
Year = {2002},
Month = {March},
ISSN = {0006-341X},
url = {http://dx.doi.org/10.1111/j.0006-341x.2002.00071.x},
Abstract = {To assess the protective effects of a time-varying
covariate, we develop a stochastic model based on tumor
biology. The model assumes that individuals have a
Poisson-distributed pool of initiated clones, which progress
through predetectable, detectable mortal and detectable
immortal stages. Time-independent covariates are
incorporated through a log-linear model for the expected
number of clones, resulting in a proportional hazards model
for disease onset. By allowing time-dependent covariates to
induce clone death, with rate dependent on a clone's state,
the model is flexible enough to accommodate delayed disease
onset and remission or cure of preexisting disease.
Inference uses Bayesian methods via Markov chain Monte
Carlo. Theoretical properties are derived, and the approach
is illustrated through analysis of the effects of childbirth
on uterine leiomyoma (fibroids).},
Doi = {10.1111/j.0006-341x.2002.00071.x},
Key = {fds257904}
}
@article{fds257903,
Author = {Dunson, DB and Colombo, B and Baird, DD},
Title = {Changes with age in the level and duration of fertility in
the menstrual cycle.},
Journal = {Human reproduction (Oxford, England)},
Volume = {17},
Number = {5},
Pages = {1399-1403},
Year = {2002},
Month = {May},
ISSN = {0268-1161},
url = {http://dx.doi.org/10.1093/humrep/17.5.1399},
Abstract = {Background: Most analyses of age-related changes in
fertility cannot separate effects due to reduced frequency
of sexual intercourse from effects directly related to
ageing. Information on intercourse collected daily through
each menstrual cycle provides the data for estimating
day-specific probabilities of pregnancy for specific days
relative to ovulation, and these estimates allow
unconfounded analysis of ageing effects. Methods: A
total of 782 healthy couples using natural family planning
methods contributed prospective data on 5860 menstrual
cycles. Day of ovulation was based on basal body temperature
measurements. Estimates of day-specific probabilities of
pregnancy and the length of the fertile window were compared
across age groups. Results: Nearly all pregnancies
occurred within a 6 day fertile window. There was no
evidence for a shorter fertile window in older men or women.
On average, the day-specific probabilities of pregnancy
declined with age for women from the late 20s onward, with
probabilities of pregnancy twice as high for women aged
19-26 years compared with women aged 35-39 years.
Controlling for age of the woman, fertility was
significantly reduced for men aged >35 years. Conclusions:
Women's fertility begins to decline in the late 20s with
substantial decreases by the late 30s. Fertility for men is
less affected by age, but shows significant decline by the
late 30s.},
Doi = {10.1093/humrep/17.5.1399},
Key = {fds257903}
}
@article{fds257906,
Author = {Tiano, HF and Loftin, CD and Akunda, J and Lee, CA and Spalding, J and Sessoms, A and Dunson, DB and Rogan, EG and Morham, SG and Smart, RC and Langenbach, R},
Title = {Deficiency of either cyclooxygenase (COX)-1 or COX-2 alters
epidermal differentiation and reduces mouse skin
tumorigenesis.},
Journal = {Cancer research},
Volume = {62},
Number = {12},
Pages = {3395-3401},
Year = {2002},
Month = {June},
Abstract = {Nonsteroidal anti-inflammatory drugs are widely reported to
inhibit carcinogenesis in humans and in rodents. These drugs
are believed to act by inhibiting one or both of the known
isoforms of cyclooxygenase (COX). However, COX-2, and not
COX-1, is the isoform most frequently reported to have a key
role in tumor development. Here we report that homozygous
deficiency of either COX-1 or COX-2 reduces skin
tumorigenesis by 75% in a multistage mouse skin model.
Reduced tumorigenesis was observed even though the levels of
stable 7,12-dimethylbenz(a)anthracene-DNA adducts were
increased about 2-fold in the COX-deficient mice compared
with wild-type mice. The premature onset of keratinocyte
terminal differentiation appeared to be the cellular event
leading to the reduced tumorigenesis because keratin 1 and
keratin 10, two keratins that indicate the commitment of
keratinocytes to differentiate, were expressed 8-13-fold and
10-20-fold more frequently in epidermal basal cells of the
COX-1-deficient and COX-2-deficient mice, respectively, than
in wild-type mice. Papillomas on the COX-deficient mice also
displayed the premature onset of keratinocyte terminal
differentiation. However, loricrin, a late marker of
epidermal differentiation, was not significantly altered,
suggesting that it was the early stages of keratinocyte
differentiation that were primarily affected by COX
deficiency. Because keratin 5, a keratin associated with
basal cells, was detected differently in papillomas of
COX-1-deficient as compared with COX-2-deficient mice, it
appears that the isoforms do not have identical roles in
papilloma development. Interestingly, apoptosis, a cellular
process associated with nonsteroidal anti-inflammatory
drug-induced inhibition of tumorigenesis, was not
significantly altered in the epidermis or in papillomas of
the COX-deficient mice. Thus, both COX-1 and COX-2 have
roles in keratinocyte differentiation, and we propose that
the absence of either isoform causes premature terminal
differentiation of initiated keratinocytes and reduced tumor
formation.},
Key = {fds257906}
}
@article{fds257900,
Author = {Mikolajczyk, R},
Title = {TwoDay Algorithm in predicting fertile time.},
Journal = {Human reproduction (Oxford, England)},
Volume = {17},
Number = {7},
Pages = {1925},
Year = {2002},
Month = {July},
ISSN = {0268-1161},
url = {http://dx.doi.org/10.1093/humrep/17.7.1925},
Doi = {10.1093/humrep/17.7.1925},
Key = {fds257900}
}
@article{fds257907,
Author = {Zeise, L and Hattis, D and Andersen, M and Bailer, AJ and Bayard, S and Chen, C and Clewell, H and Conolly, R and Crump, K and Dunson, D and Finkel, A and Haber, L and Jarabek, AM and Kodell, R and Krewski, D and Thomas, D and Thorslund, T and Wassell, JT},
Title = {Improving Risk Assessment: Research opportunities in dose
response modeling to improve risk assessment},
Journal = {Human and Ecological Risk Assessment},
Volume = {8},
Number = {6},
Pages = {1421-1444},
Publisher = {Informa UK Limited},
Year = {2002},
Month = {October},
ISSN = {1080-7039},
url = {http://dx.doi.org/10.1080/20028091057448},
Abstract = {Substantial improvements in dose response modeling for risk
assessment may result from recent and continuing advances in
biological research, biochemical techniques,
biostatistical/mathematical methods and computational power.
This report provides a ranked set of recommendations for
proposed research to advance the state of the art in dose
response modeling. The report is the result of a meeting of
invited workgroup participants charged with identifying five
areas of research in dose response modeling that could be
incorporated in a national agenda to improve risk assessment
methods. Leading topics of emphasis are interindividual
variability, injury risk assessment modeling, and procedures
to incorporate distributional methods and mechanistic
considerations into now-standard methods of deriving a
reference dose (RfD), reference concentration (RfC), minimum
risk level (MRL) or similar dose-response parameter
estimates. © 2002 by ASP.},
Doi = {10.1080/20028091057448},
Key = {fds257907}
}
@article{fds257908,
Author = {Dunson, DB and Baird, DD},
Title = {Bayesian modeling of incidence and progression of disease
from cross-sectional data.},
Journal = {Biometrics},
Volume = {58},
Number = {4},
Pages = {813-822},
Year = {2002},
Month = {December},
url = {http://dx.doi.org/10.1111/j.0006-341x.2002.00813.x},
Abstract = {In the absence of longitudinal data, the current presence
and severity of disease can be measured for a sample of
individuals to investigate factors related to disease
incidence and progression. In this article, Bayesian
discrete-time stochastic models are developed for inference
from cross-sectional data consisting of the age at first
diagnosis, the current presence of disease, and one or more
surrogates of disease severity. Semiparametric models are
used for the age-specific hazards of onset and diagnosis,
and a normal underlying variable approach is proposed for
modeling of changes with latency time in disease severity.
The model accommodates multiple surrogates of disease
severity having different measurement scales and
heterogeneity among individuals in disease progression. A
Markov chain Monte Carlo algorithm is described for
posterior computation, and the methods are applied to data
from a study of uterine leiomyoma.},
Doi = {10.1111/j.0006-341x.2002.00813.x},
Key = {fds257908}
}
@article{fds257909,
Author = {Baird, DD and Dunson, DB and Hill, MC and Cousins, D and Schectman,
JM},
Title = {High cumulative incidence of uterine leiomyoma in black and
white women: ultrasound evidence.},
Journal = {American journal of obstetrics and gynecology},
Volume = {188},
Number = {1},
Pages = {100-107},
Year = {2003},
Month = {January},
url = {http://dx.doi.org/10.1067/mob.2003.99},
Abstract = {Objective: Uterine leiomyoma, or fibroid tumors, are
the leading indication for hysterectomy in the United
States, but the proportion of women in whom fibroid tumors
develop is not known. This study screened for fibroid
tumors, independently of clinical symptoms, to estimate the
age-specific proportion of black and white women in whom
fibroid tumors develop. Study design: Randomly
selected members of an urban health plan who were 35 to 49
years old participated (n = 1364 women). Medical records and
self-report were used to assess fibroid status for those
women who were no longer menstruating (most of whom had had
hysterectomies). Premenopausal women were screened by
ultrasonography. We estimated the age-specific cumulative
incidence of fibroid tumors for black and white
women. Results: Thirty-five percent of premenopausal
women had a previous diagnosis of fibroid tumors. Fifty-one
percent of the premenopausal women who had no previous
diagnosis had ultrasound evidence of fibroid tumors. The
estimated cumulative incidence of tumors by age 50 was >80%
for black women and nearly 70% for white women. The
difference between the age-specific cumulative incidence
curves for black and white women was highly significant
(odds ratio, 2.9; 95% CI, 2.5-3.4; P < .001). Conclusion: The
results of this study suggest that most black and white
women in the United States develop uterine fibroid tumors
before menopause and that uterine fibroid tumors develop in
black women at earlier ages than in white
women.},
Doi = {10.1067/mob.2003.99},
Key = {fds257909}
}
@article{fds257911,
Author = {Chulada, PC and Arbes, SJ and Dunson, D and Zeldin,
DC},
Title = {Breast-feeding and the prevalence of asthma and wheeze in
children: analyses from the Third National Health and
Nutrition Examination Survey, 1988-1994.},
Journal = {The Journal of allergy and clinical immunology},
Volume = {111},
Number = {2},
Pages = {328-336},
Year = {2003},
Month = {February},
url = {http://dx.doi.org/10.1067/mai.2003.127},
Abstract = {Background: Asthma prevalence has increased
dramatically in recent years, especially among children.
Breast-feeding might protect children against asthma and
related conditions (recurrent wheeze), and this protective
effect might depend on the duration and exclusivity of the
breast-feeding regimen. Objective: We sought to
determine whether there is an association between
breast-feeding and asthma, recurrent wheeze, or both in
children up to 72 months of age and whether the duration and
exclusivity of breast-feeding affect this
association. Methods: Data were from the Third
National Health and Nutrition Examination Survey, a
nationally representative cross-sectional survey conducted
from 1988 to 1994. We tested for significant associations
between breast-feeding and physician-diagnosed asthma and
recurrent wheeze (≥3 episodes in the past 12 months)
before and after adjusting for potential
confounders. Results: Crude analyses showed that
breast-feeding was associated with significantly reduced
risks for asthma and recurrent wheeze in children 2 to 71
months of age, but after adjusting for potential
confounders, these overall protective associations
attenuated and were no longer statistically significant.
However, 2 new and important associations were revealed
after adjusting for confounders: (1) compared with never
breast-fed children, ever breast-fed children had
significantly reduced odds of being diagnosed with asthma
and of having recurrent wheeze before 24 months of age, and
(2) among children 2 to 71 months of age who had been
exposed to environmental tobacco smoke, those who had ever
been breast-fed had significantly reduced risks of asthma
and wheeze compared with those who had never been
breast-fed. Conclusions: Breast-feeding might delay
the onset of or actively protect children less than 24
months of age against asthma and recurrent wheeze.
Breast-feeding might reduce the prevalence of asthma and
recurrent wheeze in children exposed to environmental
tobacco smoke.},
Doi = {10.1067/mai.2003.127},
Key = {fds257911}
}
@article{fds257910,
Author = {Dunson, DB and Colombo, B},
Title = {Bayesian modeling of markers of day-specific
fertility},
Journal = {Journal of the American Statistical Association},
Volume = {98},
Number = {461},
Pages = {28-37},
Publisher = {Informa UK Limited},
Year = {2003},
Month = {March},
url = {http://dx.doi.org/10.1198/016214503388619067},
Abstract = {Cervical mucus hydration increases during the fertile
interval before ovulation. Because sperm can only penetrate
mucus having a high water content, cervical secretions
provide a reliable marker of the fertile days of the
menstrual cycle. This article develops a Bayesian approach
for modeling of daily observations of cervical mucus and
applies the approach to assess heterogeneity among women and
cycles from a given woman with respect to the increase in
mucus hydration during the fertile interval. The proposed
model relates the mucus observations to an underlying normal
mucus hydration score, which varies relative to a peak
hydration day. Uncertainty in the timing of the peak is
accounted for, and a novel weighted mixture model is used to
characterize heterogeneity in distinct features of the
underlying mean function. Prior information on the mucus
hydration trajectory is incorporated, and a Markov chain
Monte Carlo approach is developed. Based on data from a
study of daily fecundability, there appears to be
substantial heterogeneity among women in detected
preovulatory increases in mucus hydration, but only minimal
differences among cycles from a given woman.},
Doi = {10.1198/016214503388619067},
Key = {fds257910}
}
@article{fds257912,
Author = {Dunson, DB and Chulada, P and Arbes, SJ},
Title = {Bayesian modeling of time-varying and waning exposure
effects.},
Journal = {Biometrics},
Volume = {59},
Number = {1},
Pages = {83-91},
Year = {2003},
Month = {March},
url = {http://dx.doi.org/10.1111/1541-0420.00010},
Abstract = {In epidemiologic studies, there is often interest in
assessing the association between exposure history and
disease incidence. For many diseases, incidence may depend
not only on cumulative exposure, but also on the ages at
which exposure occurred. This article proposes a flexible
Bayesian approach for modeling age-varying and waning
exposure effects. The Cox model is generalized to allow the
hazard of disease to depend on an integral, across the
exposed ages, of a piecewise polynomial function of age,
multiplied by an exponential decay term. Linearity
properties of the model facilitate posterior computation via
a Gibbs sampler, which generalizes previous algorithms for
Cox regression with time-dependent covariates. The approach
is illustrated by an application to the study of protective
effects of breastfeeding on incidence of childhood
asthma.},
Doi = {10.1111/1541-0420.00010},
Key = {fds257912}
}
@article{fds257913,
Author = {Baird, DD and Dunson, DB},
Title = {Why is parity protective for uterine fibroids?},
Journal = {Epidemiology (Cambridge, Mass.)},
Volume = {14},
Number = {2},
Pages = {247-250},
Year = {2003},
Month = {March},
url = {http://dx.doi.org/10.1097/01.ede.0000054360.61254.27},
Abstract = {Uterine fibroids are benign tumors, the etiology of which is
not understood. Symptoms can be debilitating, and the
primary treatment is surgery, usually hysterectomy.
Epidemiologic data show that pregnancy is associated with
reduced risk of fibroids. We hypothesize that this
association is attributable to a protective effect of
postpartum involution of the uterus. After each pregnancy
the uterus rapidly returns to prepregnancy size by dramatic
remodeling of the tissue. We hypothesize that small fibroids
are eliminated during this process. We present preliminary
epidemiologic evidence that is consistent with this
hypothesis. If the hypothesis is supported by more direct
evidence, it may have broader implications, supporting the
idea that tissue remodeling may be a general mechanism for
limiting tumor development.},
Doi = {10.1097/01.ede.0000054360.61254.27},
Key = {fds257913}
}
@article{fds257914,
Author = {Dunson, DB},
Title = {Incorporating heterogeneous intercourse records into time to
pregnancy models},
Journal = {Mathematical Population Studies},
Volume = {10},
Number = {2},
Pages = {127-143},
Publisher = {Informa UK Limited},
Year = {2003},
Month = {April},
ISSN = {0889-8480},
url = {http://dx.doi.org/10.1080/08898480306714},
Abstract = {Information on the timing of intercourse relative to
ovulation can be incorporated into time to pregnancy models
to improve the power to detect covariate effects, to
estimate the day-specific conception probabilities, and to
distinguish between biological and behavioral effects on
fecundability, and therefore the probability of conception
in a menstrual cycle. In this paper, Bayesian methods are
proposed for joint modeling of intercourse behavior and
biologic fecundability. The model accommodates a sterile
subpopulation of couples, general covariate effects, and
heterogeneity among fecund couples in menstrual cycle
viability and in frequency of unprotected intercourse.
Methods are described for incorporating cycles with varying
amounts of intercourse information into a single analysis. A
Markov chain Monte Carlo algorithm is outlined for
estimation of the posterior distributions of the unknowns.
The methods are applied to data from a North Carolina study
of couples attempting pregnancy. Copyright © 2003 Taylor &
Francis.},
Doi = {10.1080/08898480306714},
Key = {fds257914}
}
@article{fds257915,
Author = {Stanford, JB and Smith, KR and Dunson, DB},
Title = {Vulvar mucus observations and the probability of
pregnancy.},
Journal = {Obstetrics and gynecology},
Volume = {101},
Number = {6},
Pages = {1285-1293},
Year = {2003},
Month = {June},
url = {http://dx.doi.org/10.1016/s0029-7844(03)00358-2},
Abstract = {Objective: To assess the day-specific and
cycle-specific probabilities of conception leading to
clinical pregnancy, in relation to the timing of intercourse
and vulvar mucus observations. Methods: This was a
retrospective cohort study of women beginning use of the
Creighton Model Fertility Care System in Missouri, Nebraska,
Kansas, and California. Data were abstracted from Creighton
Model Fertility Care System records, including women's daily
standardized vulvar observations of cervical mucus
discharge, days of intercourse, and clinically evident
pregnancy (conception). Established statistical models were
used to estimate day-specific probabilities of
conception. Results: Data were analyzed from 1681
cycles with 81 conceptions from 309 normally fertile couples
(initially seeking to avoid pregnancy) and from 373 cycles
with 30 conceptions from 117 subfertile couples (who were
initially trying to achieve pregnancy). The highest
probability of pregnancy occurred on the peak day of vulvar
mucus observation (.38 for normally fertile couples and .14
for subfertile couples). The probability of pregnancy was
greater than .05 for normally fertile couples from 3 days
before to 2 days after the peak, and for subfertile couples
from 1 day before to 1 day after the peak. The
cycle-specific probability of conception correlated with the
quality of mucus discharge in normally fertile couples but
not in subfertile couples. Conclusion: Standardized
vulvar observations of vaginal mucus discharge identify the
days with the greatest likelihood of conception from
intercourse in normal fertility and subfertility and provide
an indicator of the overall potential for conception in a
given menstrual cycle in normal fertility.},
Doi = {10.1016/s0029-7844(03)00358-2},
Key = {fds257915}
}
@article{fds257917,
Author = {Dunson, DB and Watson, M and Taylor, JA},
Title = {Bayesian latent variable models for median regression on
multiple outcomes.},
Journal = {Biometrics},
Volume = {59},
Number = {2},
Pages = {296-304},
Year = {2003},
Month = {June},
url = {http://dx.doi.org/10.1111/1541-0420.00036},
Abstract = {Often a response of interest cannot be measured directly and
it is necessary to rely on multiple surrogates, which can be
assumed to be conditionally independent given the latent
response and observed covariates. Latent response models
typically assume that residual densities are Gaussian. This
article proposes a Bayesian median regression modeling
approach, which avoids parametric assumptions about residual
densities by relying on an approximation based on quantiles.
To accommodate within-subject dependency, the quantile
response categories of the surrogate outcomes are related to
underlying normal variables, which depend on a latent normal
response. This underlying Gaussian covariance structure
simplifies interpretation and model fitting, without
restricting the marginal densities of the surrogate
outcomes. A Markov chain Monte Carlo algorithm is proposed
for posterior computation, and the methods are applied to
single-cell electrophoresis (comet assay) data from a
genetic toxicology study.},
Doi = {10.1111/1541-0420.00036},
Key = {fds257917}
}
@article{fds257918,
Author = {Dunson, DB and Neelon, B},
Title = {Bayesian inference on order-constrained parameters in
generalized linear models.},
Journal = {Biometrics},
Volume = {59},
Number = {2},
Pages = {286-295},
Year = {2003},
Month = {June},
url = {http://dx.doi.org/10.1111/1541-0420.00035},
Abstract = {In biomedical studies, there is often interest in assessing
the association between one or more ordered categorical
predictors and an outcome variable, adjusting for
covariates. For a k-level predictor, one typically uses
either a k-1 degree of freedom (df) test or a single df
trend test, which requires scores for the different levels
of the predictor. In the absence of knowledge of a
parametric form for the response function, one can
incorporate monotonicity constraints to improve the
efficiency of tests of association. This article proposes a
general Bayesian approach for inference on order-constrained
parameters in generalized linear models. Instead of choosing
a prior distribution with support on the constrained space,
which can result in major computational difficulties, we
propose to map draws from an unconstrained posterior density
using an isotonic regression transformation. This approach
allows flat regions over which increases in the level of a
predictor have no effect. Bayes factors for assessing
ordered trends can be computed based on the output from a
Gibbs sampling algorithm. Results from a simulation study
are presented and the approach is applied to data from a
time-to-pregnancy study.},
Doi = {10.1111/1541-0420.00035},
Key = {fds257918}
}
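%% Illustrative sketch (not the authors' code): the isotonic-transformation
%% idea above can be shown in a few lines of Python. Unconstrained posterior
%% draws for a 4-level predictor (fabricated below) are mapped to the
%% nondecreasing cone with the pool-adjacent-violators algorithm (PAVA), and
%% constrained summaries are read off the mapped draws.
import numpy as np

def pava(y):
    # Pool-adjacent-violators: least-squares nondecreasing fit, equal weights.
    blocks = [[v, 1] for v in np.asarray(y, dtype=float)]  # [block mean, block size]
    out = []
    for b in blocks:
        out.append(b)
        while len(out) > 1 and out[-2][0] > out[-1][0]:
            m2, n2 = out.pop()
            m1, n1 = out.pop()
            out.append([(m1 * n1 + m2 * n2) / (n1 + n2), n1 + n2])
    return np.concatenate([[m] * int(n) for m, n in out])

rng = np.random.default_rng(0)
draws = rng.normal([0.1, 0.5, 0.3, 0.9], 0.2, (1000, 4))  # fake unconstrained draws
constrained = np.apply_along_axis(pava, 1, draws)         # order-constrained draws
print(constrained.mean(axis=0))                           # constrained posterior mean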
@article{fds257916,
Author = {Dunson, DB and Chen, Z and Harry, J},
Title = {A Bayesian approach for joint modeling of cluster size and
subunit-specific outcomes.},
Journal = {Biometrics},
Volume = {59},
Number = {3},
Pages = {521-530},
Year = {2003},
Month = {September},
url = {http://dx.doi.org/10.1111/1541-0420.00062},
Abstract = {In applications that involve clustered data, such as
longitudinal studies and developmental toxicity experiments,
the number of subunits within a cluster is often correlated
with outcomes measured on the individual subunits. Analyses
that ignore this dependency can produce biased inferences.
This article proposes a Bayesian framework for jointly
modeling cluster size and multiple categorical and
continuous outcomes measured on each subunit. We use a
continuation ratio probit model for the cluster size and
underlying normal regression models for each of the
subunit-specific outcomes. Dependency between cluster size
and the different outcomes is accommodated through a latent
variable structure. The form of the model facilitates
posterior computation via a simple and computationally
efficient Gibbs sampler. The approach is illustrated with an
application to developmental toxicity data, and other
applications, to joint modeling of longitudinal and event
time data, are discussed.},
Doi = {10.1111/1541-0420.00062},
Key = {fds257916}
}
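%% A minimal numeric sketch of the continuation ratio probit piece described
%% above, with hypothetical intercepts alpha_k (nothing here is an estimate
%% from the paper). P(size = k | size >= k) = Phi(alpha_k), and the remaining
%% mass goes to the largest size class.
import numpy as np
from scipy.stats import norm

alpha = np.array([-2.0, -1.5, -1.0, -0.5])           # hypothetical intercepts, sizes 1..4
h = norm.cdf(alpha)                                  # P(size = k | size >= k)
surv = np.cumprod(1 - h)                             # P(size > k)
p = np.append(h * np.concatenate([[1.0], surv[:-1]]), surv[-1])  # sizes 1..5
print(p.round(3), p.sum())                           # a valid distribution: sums to 1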
@article{fds257919,
Author = {Dunson, DB},
Title = {Dynamic Latent Trait Models for Multidimensional
Longitudinal Data},
Journal = {Journal of the American Statistical Association},
Volume = {98},
Number = {463},
Pages = {555-563},
Publisher = {Informa UK Limited},
Year = {2003},
Month = {September},
url = {http://dx.doi.org/10.1198/016214503000000387},
Abstract = {This article presents a new approach for analysis of
multidimensional longitudinal data, motivated by studies
using an item response battery to measure traits of an
individual repeatedly over time. A general modeling
framework is proposed that allows mixtures of count,
categorical, and continuous response variables. Each
response is related to age-specific latent traits through a
generalized linear model that accommodates item-specific
measurement errors. A transition model allows the latent
traits at a given age to depend on observed predictors and
on previous latent traits for that individual. Following a
Bayesian approach to inference, a Markov chain Monte Carlo
algorithm is proposed for posterior computation. The methods
are applied to data from a neurotoxicity study of the
pesticide methoxychlor, and evidence of a dose-dependent
increase in motor activity is presented.},
Doi = {10.1198/016214503000000387},
Key = {fds257919}
}
@article{fds257920,
Author = {Dunson, DB and Herring, AH},
Title = {Bayesian inferences in the Cox model for order-restricted
hypotheses.},
Journal = {Biometrics},
Volume = {59},
Number = {4},
Pages = {916-923},
Year = {2003},
Month = {December},
url = {http://dx.doi.org/10.1111/j.0006-341x.2003.00106.x},
Abstract = {In studying the relationship between an ordered categorical
predictor and an event time, it is standard practice to
include dichotomous indicators of the different levels of
the predictor in a Cox model. One can then use a multiple
degree-of-freedom score or partial likelihood ratio test for
hypothesis testing. Often, interest focuses on comparing the
null hypothesis of no difference to an order-restricted
alternative, such as a monotone increase across levels of a
predictor. This article proposes a Bayesian approach for
addressing hypotheses of this type. We reparameterize the
Cox model in terms of a cumulative product of parameters
having conjugate prior densities, consisting of mixtures of
point masses at one, and truncated gamma densities. Due to
the structure of the model, posterior computation can
proceed via a simple and efficient Gibbs sampling algorithm.
Posterior probabilities for the global null hypothesis and
subhypotheses, comparing the hazards for specific groups,
can be calculated directly from the output of a single Gibbs
chain. The approach allows for level sets across which a
predictor has no effect. Generalizations to multiple
predictors are described, and the method is applied to a
study of emergency medical treatment for
stroke.},
Doi = {10.1111/j.0006-341x.2003.00106.x},
Key = {fds257920}
}
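%% An illustrative prior draw under the cumulative-product reparameterization
%% sketched above, with made-up hyperparameters. Each increment is an
%% equal-odds mixture of a point mass at one (a level set) and a value above
%% one (a shifted gamma here, a crude stand-in for the truncated gamma
%% component), so hazard ratios are nondecreasing across the ordered levels.
import numpy as np

rng = np.random.default_rng(1)
flat = rng.random(4) < 0.5                                # point mass at one
inc = np.where(flat, 1.0, 1.0 + rng.gamma(2.0, 0.25, 4))  # increments > 1 when not flat
hr = np.concatenate([[1.0], np.cumprod(inc)])             # hazard ratios for 5 ordered levels
print(hr.round(2))                                        # nondecreasing, possibly flat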
@article{fds257921,
Author = {Chen, Z and Dunson, DB},
Title = {Random effects selection in linear mixed
models.},
Journal = {Biometrics},
Volume = {59},
Number = {4},
Pages = {762-769},
Year = {2003},
Month = {December},
url = {http://dx.doi.org/10.1111/j.0006-341x.2003.00089.x},
Abstract = {We address the important practical problem of how to select
the random effects component in a linear mixed model. A
hierarchical Bayesian model is used to identify any random
effect with zero variance. The proposed approach
reparameterizes the mixed model so that functions of the
covariance parameters of the random effects distribution are
incorporated as regression coefficients on standard normal
latent variables. We allow random effects to effectively
drop out of the model by choosing mixture priors with point
mass at zero for the random effects variances. Due to the
reparameterization, the model enjoys a conditionally linear
structure that facilitates the use of normal conjugate
priors. We demonstrate that posterior computation can
proceed via a simple and efficient Markov chain Monte Carlo
algorithm. The methods are illustrated using simulated data
and real data from a study relating prenatal exposure to
polychlorinated biphenyls and psychomotor development of
children.},
Doi = {10.1111/j.0006-341x.2003.00089.x},
Key = {fds257921}
}
@article{fds257822,
Author = {Trouba, K and Nyska, A and Styblo, M and Dunson, D and Lomnitski, L and Grossman, S and Moser, G and Suttie, A and Patterson, R and Walton, F and Germolec, D},
Title = {Effect of antioxidants on the papilloma response and liver
glutathione modulation mediated by arsenic in tg.ac
transgenic mice},
Journal = {Arsenic Exposure and Health Effects V},
Pages = {283-293},
Publisher = {Elsevier},
Year = {2003},
Month = {December},
url = {http://dx.doi.org/10.1016/B978-044451441-7/50022-1},
Abstract = {Epidemiological studies indicate that inorganic arsenicals
produce various skin lesions as well as skin, lung, bladder,
liver, prostate, and renal cancer. Our laboratory previously
demonstrated that low-dose 12-O-tetradecanoylphorbol-13-acetate
(TPA) increased the number of skin papillomas in Tg.AC
transgenic mice that received sodium arsenite in drinking
water, an effect dependent on proinflammatory cytokines.
Because proinflammatory cytokine expression can be modulated
by free radicals and oxidative stress, we hypothesized that
oxidative stress contributes to TPA-promoted papilloma
development in Tg.AC mice exposed to sodium arsenite. To
evaluate the contribution of oxidative stress to arsenic
skin carcinogenesis, two free-radical scavengers were tested
for their ability to suppress papilloma responses (e.g.
induction, latency, and multiplicity) modulated by arsenite
in Tg.AC mice. Data indicate that arsenite increased
papilloma responses in TPA-promoted Tg.AC mice as compared
to control animals (no arsenite). The antioxidant vitamin E
or a water-soluble natural antioxidant fraction from spinach
had no inhibitory effect on TPA-promoted papilloma responses
following arsenite exposure. Although not conclusively
defined by our studies, oxidative stress generated by
arsenic may contribute to skin carcinogenesis; however, it
is not likely to be the sole or primary mechanism that
enhances papilloma responses following arsenite exposure and
TPA promotion. © 2003 Elsevier B.V.},
Doi = {10.1016/B978-044451441-7/50022-1},
Key = {fds257822}
}
@article{fds257922,
Author = {Tingen, C and Stanford, JB and Dunson, DB},
Title = {Methodologic and statistical approaches to studying human
fertility and environmental exposure.},
Journal = {Environmental health perspectives},
Volume = {112},
Number = {1},
Pages = {87-93},
Year = {2004},
Month = {January},
url = {http://dx.doi.org/10.1289/ehp.6263},
Abstract = {Although there has been growing concern about the effects of
environmental exposures on human fertility, standard
epidemiologic study designs may not collect sufficient data
to identify subtle effects while properly adjusting for
confounding. In particular, results from conventional time
to pregnancy studies can be driven by the many sources of
bias inherent in these studies. By prospectively collecting
detailed records of menstrual bleeding, occurrences of
intercourse, and a marker of ovulation day in each menstrual
cycle, precise information on exposure effects can be
obtained, adjusting for many of the primary sources of bias.
This article provides an overview of the different types of
study designs, focusing on the data required, the practical
advantages and disadvantages of each design, and the
statistical methods required to take full advantage of the
available data. We conclude that detailed prospective
studies allowing inferences on day-specific probabilities of
conception should be considered as the gold standard for
studying the effects of environmental exposures on
fertility.},
Doi = {10.1289/ehp.6263},
Key = {fds257922}
}
@article{fds257923,
Author = {Dunson, DB and Baird, DD and Colombo, B},
Title = {Increased infertility with age in men and
women.},
Journal = {Obstetrics and gynecology},
Volume = {103},
Number = {1},
Pages = {51-56},
Year = {2004},
Month = {January},
url = {http://dx.doi.org/10.1097/01.aog.0000100153.24061.45},
Abstract = {Objective: To estimate the effects of aging on the
percentage of outwardly healthy couples who are sterile
(completely unable to conceive without assisted
reproduction) or infertile (unable to conceive within a year
of unprotected intercourse). Methods: A prospective
fecundability study was conducted in a sample of 782 couples
recruited from 7 European centers for natural family
planning. Women aged 18-40 years were eligible. Daily
intercourse records were used to adjust for timing and
frequency of intercourse when estimating the
per-menstrual-cycle probability of conception. The number of
menstrual cycles required to conceive a clinical pregnancy
and the probability of sterility and infertility were
derived from the estimated fecundability distributions for
men and women of different ages. Results: Sterility
was estimated at about 1%; this percent did not change with
age. The percentage infertility was estimated at 8% for
women aged 19-26 years, 13-14% for women aged 27-34 years
and 18% for women aged 35-39 years. Starting in the late
30s, male age was an important factor, with the percentage
failing to conceive within 12 cycles increasing from an
estimated 18-28% between ages 35 and 40 years. The estimated
percentage of infertile couples that would be able to
conceive after an additional 12 cycles of trying varied from
43-63% depending on age. Conclusion: Increased
infertility in older couples is attributable primarily to
declines in fertility rates rather than to absolute
sterility. Many infertile couples will conceive if they try
for an additional year.},
Doi = {10.1097/01.aog.0000100153.24061.45},
Key = {fds257923}
}
@article{fds257924,
Author = {Bigelow, JL and Dunson, DB and Stanford, JB and Ecochard, R and Gnoth,
C and Colombo, B},
Title = {Mucus observations in the fertile window: a better predictor
of conception than timing of intercourse.},
Journal = {Human reproduction (Oxford, England)},
Volume = {19},
Number = {4},
Pages = {889-892},
Year = {2004},
Month = {April},
url = {http://dx.doi.org/10.1093/humrep/deh173},
Abstract = {Background: Intercourse results in a pregnancy
essentially only if it occurs during the 6-day fertile
interval ending on the day of ovulation. The strong
association between timing of intercourse within this
interval and the probability of conception typically is
attributed to limited sperm and egg life
times. Methods: A total of 782 women recruited from
natural family planning centres in Europe contributed
prospective data on 7288 menstrual cycles. Daily records of
intercourse, basal body temperature and vaginal discharge of
cervical mucus were collected. Probabilities of conception
were estimated according to the timing of intercourse
relative to ovulation and a 1-4 score of mucus
quality. Results: There was a strong increasing trend
in the day-specific probabilities of pregnancy with
increases in the mucus score. Adjusting for the mucus score,
the day-specific probabilities had limited variability
across the fertile interval. Conclusions: Changes in
mucus quality across the fertile interval predict the
observed pattern in the day-specific probabilities of
conception. To maximize the likelihood of conception,
intercourse should occur on days with optimal mucus quality,
as observed in vaginal discharge, regardless of the exact
timing relative to ovulation.},
Doi = {10.1093/humrep/deh173},
Key = {fds257924}
}
@article{fds257925,
Author = {Neelon, B and Dunson, DB},
Title = {Bayesian isotonic regression and trend analysis.},
Journal = {Biometrics},
Volume = {60},
Number = {2},
Pages = {398-406},
Year = {2004},
Month = {June},
url = {http://dx.doi.org/10.1111/j.0006-341x.2004.00184.x},
Abstract = {In many applications, the mean of a response variable can be
assumed to be a nondecreasing function of a continuous
predictor, controlling for covariates. In such cases,
interest often focuses on estimating the regression
function, while also assessing evidence of an association.
This article proposes a new framework for Bayesian isotonic
regression and order-restricted inference. Approximating the
regression function with a high-dimensional piecewise linear
model, the nondecreasing constraint is incorporated through
a prior distribution for the slopes consisting of a product
mixture of point masses (accounting for flat regions) and
truncated normal densities. To borrow information across the
intervals and smooth the curve, the prior is formulated as a
latent autoregressive normal process. This structure
facilitates efficient posterior computation, since the full
conditional distributions of the parameters have simple
conjugate forms. Point and interval estimates of the
regression function and posterior probabilities of an
association for different regions of the predictor can be
estimated from a single MCMC run. Generalizations to
categorical outcomes and multiple predictors are described,
and the approach is applied to an epidemiology
application.},
Doi = {10.1111/j.0006-341x.2004.00184.x},
Key = {fds257925}
}
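%% A prior draw illustrating the slope mixture described above
%% (hyperparameters fabricated; the paper's latent autoregressive smoothing
%% of the slopes is omitted). Flat regions arise from the point mass at
%% zero; elsewhere the slope is a normal truncated to be positive, so the
%% piecewise linear curve is nondecreasing.
import numpy as np

rng = np.random.default_rng(2)
knots = np.linspace(0.0, 1.0, 21)
pi0 = 0.4                                           # hypothetical P(slope = 0)
flat = rng.random(20) < pi0
slopes = np.where(flat, 0.0, np.abs(rng.normal(0.0, 1.0, 20)))   # half-normal draws
f = np.concatenate([[0.0], np.cumsum(slopes * np.diff(knots))])  # nondecreasing at knots
print(f.round(3))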
@article{fds257926,
Author = {Chen, Z and Dunson, DB},
Title = {Bayesian estimation of survival functions under stochastic
precedence.},
Journal = {Lifetime data analysis},
Volume = {10},
Number = {2},
Pages = {159-173},
Year = {2004},
Month = {June},
url = {http://dx.doi.org/10.1023/b:lida.0000030201.12943.13},
Abstract = {When estimating the distributions of two random variables, X
and Y, investigators often have prior information that Y
tends to be bigger than X. To formalize this prior belief,
one could potentially assume stochastic ordering between X
and Y, which implies Pr(X ≤ z) ≥ Pr(Y ≤ z)
for all z in the domain of X and Y. Stochastic ordering is
quite restrictive, though, and this article focuses instead
on Bayesian estimation of the distribution functions of X
and Y under the weaker stochastic precedence constraint,
Pr(X ≤ Y) ≥ 0.5. We consider the case where both X
and Y are categorical variables with common support and
develop a Gibbs sampling algorithm for posterior
computation. The method is then generalized to the case
where X and Y are survival times. The proposed approach is
illustrated using data on survival after tumor removal for
patients with malignant melanoma.},
Doi = {10.1023/b:lida.0000030201.12943.13},
Key = {fds257926}
}
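%% Stochastic precedence, Pr(X <= Y) >= 0.5, is easy to check numerically.
%% A Monte Carlo sketch with illustrative exponential distributions (for
%% independent exponentials with means 1 and 1.5 the true value is 0.6):
import numpy as np

rng = np.random.default_rng(3)
x = rng.exponential(1.0, 100_000)    # hypothetical survival times X
y = rng.exponential(1.5, 100_000)    # Y tends to be bigger than X
print((x <= y).mean())               # approximately 0.6 >= 0.5, so precedence holds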
@article{fds257927,
Author = {Dunson, DB and Chen, Z},
Title = {Selecting factors predictive of heterogeneity in
multivariate event time data.},
Journal = {Biometrics},
Volume = {60},
Number = {2},
Pages = {352-358},
Year = {2004},
Month = {June},
url = {http://dx.doi.org/10.1111/j.0006-341x.2004.00179.x},
Abstract = {In multivariate survival analysis, investigators are often
interested in testing for heterogeneity among clusters, both
overall and within specific classes. We represent different
hypotheses about the heterogeneity structure using a
sequence of gamma frailty models, ranging from a null model
with no random effects to a full model having random effects
for each class. Following a Bayesian approach, we define
prior distributions for the frailty variances consisting of
mixtures of point masses at zero and inverse-gamma
densities. Since frailties with zero variance effectively
drop out of the model, this prior allocates probability to
each model in the sequence, including the overall null
hypothesis of homogeneity. Using a counting process
formulation, the conditional posterior distributions of the
frailties and proportional hazards regression coefficients
have simple forms. Posterior computation proceeds via a data
augmentation Gibbs sampling algorithm, a single run of which
can be used to obtain model-averaged estimates of the
population parameters and posterior model probabilities for
testing hypotheses about the heterogeneity structure. The
methods are illustrated using data from a lung cancer
trial.},
Doi = {10.1111/j.0006-341x.2004.00179.x},
Key = {fds257927}
}
@article{fds257929,
Author = {Wilcox, AJ and Baird, DD and Dunson, DB and McConnaughey, DR and Kesner,
JS and Weinberg, CR},
Title = {On the frequency of intercourse around ovulation: evidence
for biological influences.},
Journal = {Human reproduction (Oxford, England)},
Volume = {19},
Number = {7},
Pages = {1539-1543},
Year = {2004},
Month = {July},
ISSN = {0268-1161},
url = {http://www.ncbi.nlm.nih.gov/pubmed/15190016},
Abstract = {Background: Intercourse in mammals is often
coordinated with ovulation, for example through fluctuations
in libido or by the acceleration of ovulation with
intercourse. Such coordination has not been established in
humans. We explored this possibility by examining patterns
of sexual intercourse in relation to ovulation. Methods: Sixty-eight
sexually active North Carolina women with either an
intrauterine device or tubal ligation provided data for up
to three menstrual cycles. These women collected daily urine
specimens and kept daily diaries of intercourse and
menstrual bleeding. Major estrogen and progesterone
metabolites excreted in urine were used to identify the day
of ovulation. The fertile days of the cycle were defined as
the 6 consecutive days ending with ovulation. Women
contributed a total of 171 ovulatory cycles. Menstrual
bleeding days were excluded from analysis. Results: The
frequency of intercourse rose during the follicular phase,
peaking at ovulation and declining abruptly thereafter. The
6 consecutive days with most frequent intercourse
corresponded with the 6 fertile days of the menstrual cycle.
Intercourse was 24% more frequent during the 6 fertile days
than during the remaining non-bleeding days (P <
0.001). Conclusions: There apparently are biological
factors that promote intercourse during a woman's 6 fertile
days.},
Doi = {10.1093/humrep/deh305},
Key = {fds257929}
}
@article{fds257928,
Author = {Slama, R and Ducot, B and Keiding, N and Bouyer, J},
Title = {Studying human fertility and environmental
exposures.},
Journal = {Environmental health perspectives},
Volume = {112},
Number = {11},
Pages = {A604},
Year = {2004},
Month = {August},
ISSN = {0091-6765},
url = {http://dx.doi.org/10.1289/ehp.112-1247502},
Doi = {10.1289/ehp.112-1247502},
Key = {fds257928}
}
@article{fds257930,
Author = {Dunson, DB and Holloman, C and Calder, C and Gunn,
LH},
Title = {Bayesian modeling of multiple lesion onset and growth from
interval-censored data.},
Journal = {Biometrics},
Volume = {60},
Number = {3},
Pages = {676-683},
Year = {2004},
Month = {September},
ISSN = {0006-341X},
url = {http://dx.doi.org/10.1111/j.0006-341x.2004.00217.x},
Abstract = {In studying rates of occurrence and progression of lesions
(or tumors), it is typically not possible to obtain exact
onset times for each lesion. Instead, data consist of the
number of lesions that reach a detectable size between
screening examinations, along with measures of the
size/severity of individual lesions at each exam time. This
interval-censored data structure makes it difficult to
properly adjust for the onset time distribution in assessing
covariate effects on rates of lesion progression. This
article proposes a joint model for the multiple lesion onset
and progression process, motivated by cross-sectional data
from a study of uterine leiomyoma tumors. By using a joint
model, one can potentially obtain more precise inferences on
rates of onset, while also performing onset time-adjusted
inferences on lesion severity. Following a Bayesian
approach, we propose a data augmentation Markov chain Monte
Carlo algorithm for posterior computation.},
Doi = {10.1111/j.0006-341x.2004.00217.x},
Key = {fds257930}
}
@article{fds257931,
Author = {O'Brien, SM and Dunson, DB},
Title = {Bayesian multivariate logistic regression.},
Journal = {Biometrics},
Volume = {60},
Number = {3},
Pages = {739-746},
Year = {2004},
Month = {September},
ISSN = {0006-341X},
url = {http://www.ncbi.nlm.nih.gov/pubmed/15339297},
Abstract = {Bayesian analyses of multivariate binary or categorical
outcomes typically rely on probit or mixed effects logistic
regression models that do not have a marginal logistic
structure for the individual outcomes. In addition,
difficulties arise when simple noninformative priors are
chosen for the covariance parameters. Motivated by these
problems, we propose a new type of multivariate logistic
distribution that can be used to construct a likelihood for
multivariate logistic regression analysis of binary and
categorical data. The model for individual outcomes has a
marginal logistic structure, simplifying interpretation. We
follow a Bayesian approach to estimation and inference,
developing an efficient data augmentation algorithm for
posterior computation. The method is illustrated with
application to a neurotoxicology study.},
Doi = {10.1111/j.0006-341X.2004.00224.x},
Key = {fds257931}
}
@article{fds257932,
Author = {Herring, AH and Dunson, DB and Dole, N},
Title = {Modeling the effects of a bidirectional latent predictor
from multivariate questionnaire data.},
Journal = {Biometrics},
Volume = {60},
Number = {4},
Pages = {926-935},
Year = {2004},
Month = {December},
ISSN = {0006-341X},
url = {http://dx.doi.org/10.1111/j.0006-341x.2004.00248.x},
Abstract = {Researchers often measure stress using questionnaire data on
the occurrence of potentially stress-inducing life events
and the strength of reaction to these events, characterized
as negative or positive and assigned an ordinal ranking. In
studying the health effects of stress, one needs to obtain
measures of an individual's negative and positive stress
levels to be used as predictors. Motivated by data of this
type, we propose a latent variable model, which is
characterized by event-specific negative and positive
reaction scores. If the positive reaction score dominates
the negative reaction score for an event, then the
individual's reported response to that event will be
positive, with an ordinal ranking determined by the value of
the score. Measures of overall positive and negative stress
can be obtained by summing the reactivity scores across the
events that occur for an individual. By incorporating these
measures as predictors in a regression model and fitting the
stress and outcome models jointly using Bayesian methods,
inferences can be conducted without the need to assume known
weights for the different events. We propose an MCMC
algorithm for posterior computation and apply the approach
to study the effects of stress on preterm
delivery.},
Doi = {10.1111/j.0006-341x.2004.00248.x},
Key = {fds257932}
}
@article{fds257934,
Author = {Dunson, DB and Herring, AH},
Title = {Bayesian latent variable models for mixed discrete
outcomes.},
Journal = {Biostatistics (Oxford, England)},
Volume = {6},
Number = {1},
Pages = {11-25},
Year = {2005},
Month = {January},
url = {http://dx.doi.org/10.1093/biostatistics/kxh025},
Abstract = {In studies of complex health conditions, mixtures of
discrete outcomes (event time, count, binary, ordered
categorical) are commonly collected. For example, studies of
skin tumorigenesis record latency time prior to the first
tumor, increases in the number of tumors at each week, and
the occurrence of internal tumors at the time of death.
Motivated by this application, we propose a general
underlying Poisson variable framework for mixed discrete
outcomes, accommodating dependency through an additive gamma
frailty model for the Poisson means. The model has
log-linear, complementary log-log, and proportional hazards
forms for count, binary and discrete event time outcomes,
respectively. Simple closed form expressions can be derived
for the marginal expectations, variances, and correlations.
Following a Bayesian approach to inference,
conditionally-conjugate prior distributions are chosen that
facilitate posterior computation via an MCMC algorithm. The
methods are illustrated using data from a Tg.AC mouse
bioassay study.},
Doi = {10.1093/biostatistics/kxh025},
Key = {fds257934}
}
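%% A simulation sketch of the dependence mechanism described above (means
%% and frailty parameters fabricated). A shared gamma frailty in the Poisson
%% means induces positive association between a count outcome and a binary
%% outcome; coding the binary outcome as an underlying Poisson exceeding
%% zero corresponds to the complementary log-log form.
import numpy as np

rng = np.random.default_rng(4)
n = 100_000
frailty = rng.gamma(2.0, 0.5, n)          # shared subject-level frailty, mean 1
count = rng.poisson(1.0 * frailty)        # count outcome
binary = rng.poisson(0.5 * frailty) > 0   # binary outcome via underlying Poisson
print(np.corrcoef(count, binary)[0, 1])   # positive correlation from the shared frailty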
@article{fds257935,
Author = {Longnecker, MP and Klebanoff, MA and Dunson, DB and Guo, X and Chen, Z and Zhou, H and Brock, JW},
Title = {Maternal serum level of the DDT metabolite DDE in relation
to fetal loss in previous pregnancies.},
Journal = {Environmental research},
Volume = {97},
Number = {2},
Pages = {127-133},
Year = {2005},
Month = {February},
ISSN = {0013-9351},
url = {http://dx.doi.org/10.1016/s0013-9351(03)00108-7},
Abstract = {Use of 1,1,1-trichloro-2,2-bis(p-chlorophenyl)ethane (DDT)
continues in about 25 countries. This use has been justified
partly by the belief that it has no adverse consequences on
human health. Evidence has been increasing, however, for
adverse reproductive effects of DDT, but additional data are
needed. Pregnant women who enrolled in the Collaborative
Perinatal Project (United States, 1959-1965) were asked
about their previous pregnancy history; blood samples were
drawn and the serum frozen. In 1997-1999, the sera of 1717
of these women who had previous pregnancies were analyzed
for 1,1-dichloro-2,2-bis(p-chlorophenyl)ethylene (DDE), the
major breakdown product of DDT. The odds of previous fetal
loss was examined in relation to DDE level in logistic
regression models. Compared with women whose DDE level was
<15 µg/L, the adjusted odds ratios of fetal loss
according to category of DDE were as follows: 15-29
µg/L, 1.1; 30-44 µg/L, 1.4; 45-59 µg/L, 1.6; and
60+ µg/L, 1.2. The adjusted odds ratio per 60 µg/L
increase was 1.4 (95% confidence interval 1.1-1.6). The
results were consistent with an adverse effect of DDE on
fetal loss, but were inconclusive owing to the possibility
that previous pregnancies ending in fetal loss decreased
serum DDE levels less than did those carried to
term.},
Doi = {10.1016/s0013-9351(03)00108-7},
Key = {fds257935}
}
@article{fds257936,
Author = {Dunson, DB and Stanford, JB},
Title = {Bayesian inferences on predictors of conception
probabilities.},
Journal = {Biometrics},
Volume = {61},
Number = {1},
Pages = {126-133},
Year = {2005},
Month = {March},
url = {http://dx.doi.org/10.1111/j.0006-341x.2005.031231.x},
Abstract = {Reproductive scientists and couples attempting pregnancy are
interested in identifying predictors of the day-specific
probabilities of conception in relation to the timing of a
single intercourse act. Because most menstrual cycles have
multiple days of intercourse, the occurrence of conception
represents the aggregation across Bernoulli trials for each
intercourse day. Because of this data structure and
dependency among the multiple cycles from a woman,
implementing analyses has proven challenging. This article
proposes a Bayesian approach based on a generalization of
the Barrett and Marshall model to incorporate a
woman-specific frailty and day-specific covariates. The
model results in a simple closed form expression for the
marginal probability of conception, and has an auxiliary
variables formulation that facilitates efficient posterior
computation. Although motivated by fecundability studies,
the approach can be used for efficient variable selection
and model averaging in general applications with categorical
or discrete event time data.},
Doi = {10.1111/j.0006-341x.2005.031231.x},
Key = {fds257936}
}
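%% The Barrett and Marshall form that the paper generalizes has a simple
%% closed form worth spelling out: with day-specific conception
%% probabilities p_k and intercourse indicators x_k, the probability of
%% conception in the cycle is 1 - prod_k (1 - p_k)^{x_k}. The p_k values
%% below are hypothetical, not estimates from any study.
import numpy as np

p = np.array([0.05, 0.10, 0.15, 0.25, 0.30, 0.10])  # days -5..0 relative to ovulation
x = np.array([0, 1, 0, 1, 1, 0])                    # intercourse pattern for one cycle
prob = 1.0 - np.prod((1.0 - p) ** x)
print(round(prob, 3))                               # cycle-specific conception probability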
@article{fds257938,
Author = {Dunson, DB and Taylor, JA},
Title = {Approximate Bayesian inference for quantiles},
Journal = {Journal of Nonparametric Statistics},
Volume = {17},
Number = {3},
Pages = {385-400},
Publisher = {Informa UK Limited},
Year = {2005},
Month = {April},
url = {http://dx.doi.org/10.1080/10485250500039049},
Abstract = {Suppose data consist of a random sample from a distribution
function F_Y, which is unknown, and that interest focuses on
inferences on θ, a vector of quantiles of F_Y. When the
likelihood function is not fully specified, a posterior
density cannot be calculated and Bayesian inference is
difficult. This article considers an approach which relies
on a substitution likelihood characterized by a vector of
quantiles. Properties of the substitution likelihood are
investigated, strategies for prior elicitation are
presented, and a general framework is proposed for quantile
regression modeling. Posterior computation proceeds via a
Metropolis algorithm that utilizes a normal approximation to
the posterior. Results from a simulation study are
presented, and the methods are illustrated through
application to data from a genotoxicity experiment. © 2005
Taylor & Francis Ltd.},
Doi = {10.1080/10485250500039049},
Key = {fds257938}
}
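%% An illustrative sketch of a multinomial-type substitution likelihood for
%% a vector of quantiles (the exact form and priors used in the paper may
%% differ; everything below is fabricated for illustration). Candidate
%% quantile values theta at levels tau partition the data, and the counts in
%% the resulting bins carry the likelihood.
import numpy as np
from scipy.special import gammaln

def log_substitution_likelihood(theta, tau, y):
    theta = np.sort(np.asarray(theta, dtype=float))  # candidate quantile values
    probs = np.diff(np.concatenate([[0.0], np.asarray(tau), [1.0]]))  # bin probabilities
    counts = np.bincount(np.searchsorted(theta, y), minlength=len(theta) + 1)
    return (gammaln(len(y) + 1) - gammaln(counts + 1).sum()
            + (counts * np.log(probs)).sum())

rng = np.random.default_rng(5)
y = rng.normal(size=200)
print(log_substitution_likelihood([-0.7, 0.0, 0.7], [0.25, 0.5, 0.75], y))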
@article{fds258074,
Author = {Dunson, DB and Bigelow, JL and Colombo, B},
Title = {Reduced fertilization rates in older men when cervical mucus
is suboptimal.},
Journal = {Obstetrics and gynecology},
Volume = {105},
Number = {4},
Pages = {788-793},
Year = {2005},
Month = {April},
ISSN = {0029-7844},
url = {http://dx.doi.org/10.1097/01.aog.0000154155.20366.ee},
Abstract = {Objective: Cervical mucus is vital in the regulation
of sperm survival and transport through the reproductive
tract. The goal of this study is to assess whether the
lowered fertility for men in their late 30s and early 40s is
related to the nature of cervical mucus on the day of
intercourse. Methods: In a prospective study of 7
European family planning centers, 782 couples not using
birth control recorded daily observations of intercourse and
the nature of cervical mucus. Using data from 1,459
menstrual cycles, 342 ending in pregnancy, we estimate
day-specific conception probabilities in relation to mucus
and male and female age. Results: On days where
cervical mucus was not evident, intercourse for men in their
late 30s and early 40s was 50% less likely to result in a
clinical pregnancy, adjusting for intercourse timing and
female age. As secretions become more conducive to sperm
transport, the effect of male age diminishes steadily from
21% on days with damp secretions, to 11% on days with thick
mucus, to only 4% on days with most fertile-type
mucus. Conclusion: The effect of male age on
fecundability can be minimized by timing intercourse on days
with optimal secretions. Level of evidence: II-2.},
Doi = {10.1097/01.aog.0000154155.20366.ee},
Key = {fds258074}
}
@article{fds257937,
Author = {Dunson, DB and Herring, AH},
Title = {Bayesian model selection and averaging in additive and
proportional hazards models.},
Journal = {Lifetime data analysis},
Volume = {11},
Number = {2},
Pages = {213-232},
Year = {2005},
Month = {June},
url = {http://dx.doi.org/10.1007/s10985-004-0384-x},
Abstract = {Although Cox proportional hazards regression is the default
analysis for time to event data, there is typically
uncertainty about whether the effects of a predictor are
more appropriately characterized by a multiplicative or
additive model. To accommodate this uncertainty, we place a
model selection prior on the coefficients in an
additive-multiplicative hazards model. This prior assigns
positive probability, not only to the model that has both
additive and multiplicative effects for each predictor, but
also to sub-models corresponding to no association, to only
additive effects, and to only proportional effects. The
additive component of the model is constrained to ensure
non-negative hazards, a condition often violated by current
methods. After augmenting the data with Poisson latent
variables, the prior is conditionally conjugate, and
posterior computation can proceed via an efficient Gibbs
sampling algorithm. Simulation study results are presented,
and the methodology is illustrated using data from the
Framingham heart study.},
Doi = {10.1007/s10985-004-0384-x},
Key = {fds257937}
}
@article{fds258075,
Author = {Dunson, DB},
Title = {Bayesian semiparametric isotonic regression for count
data},
Journal = {Journal of the American Statistical Association},
Volume = {100},
Number = {470},
Pages = {618-627},
Publisher = {Informa UK Limited},
Year = {2005},
Month = {June},
ISSN = {0162-1459},
url = {http://dx.doi.org/10.1198/016214504000001457},
Abstract = {This article proposes a semiparametric Bayesian approach for
inference on an unknown isotonic regression function, f(x),
characterizing the relationship between a continuous
predictor, X, and a count response variable, Y, adjusting
for covariates, Z. A Dirichlet process mixture of Poisson
distributions is used to avoid parametric assumptions on the
conditional distribution of Y given X and Z. Then, to also
avoid parametric assumptions on f(x), a novel prior
formulation is proposed that enforces the nondecreasing
constraint and assigns positive prior probability to the
null hypothesis of no association. Through the use of
carefully tailored hyperprior distributions, we allow for
borrowing of information across different regions of X in
estimating f(x) and in assessing hypotheses about local
increases in the function. Due to conjugacy properties,
posterior computation is straightforward using a Markov
chain Monte Carlo algorithm. The methods are illustrated
using data from an epidemiologic study of sleep problems and
obesity.},
Doi = {10.1198/016214504000001457},
Key = {fds258075}
}
@article{fds257939,
Author = {Gunn, LH and Dunson, DB},
Title = {A transformation approach for incorporating monotone or
unimodal constraints.},
Journal = {Biostatistics (Oxford, England)},
Volume = {6},
Number = {3},
Pages = {434-449},
Year = {2005},
Month = {July},
ISSN = {1465-4644},
url = {http://dx.doi.org/10.1093/biostatistics/kxi020},
Abstract = {Samples of curves are collected in many applications,
including studies of reproductive hormone levels in the
menstrual cycle. Many approaches have been proposed for
correlated functional data of this type, including smoothing
spline methods and other flexible parametric modeling
strategies. In many cases, the underlying biological
processes involved restrict the curve to follow a particular
shape. For example, progesterone levels in healthy women
increase during the menstrual cycle to a peak achieved at
random location with decreases thereafter. Reproductive
epidemiologists are interested in studying the distribution
of the peak and the trajectory for women in different
groups. Motivated by this application, we propose a simple
approach for restricting each woman's mean trajectory to
follow an umbrella shape. An unconstrained hierarchical
Bayesian model is used to characterize the data, and draws
from the posterior distribution obtained using a Gibbs
sampler are then mapped to the constrained space. Inferences
are based on the resulting quasi-posterior distribution for
the peak and individual woman trajectories. The methods are
applied to a study comparing progesterone trajectories for
conception and nonconception cycles.},
Doi = {10.1093/biostatistics/kxi020},
Key = {fds257939}
}
@article{fds257940,
Author = {Law, DCG and Klebanoff, MA and Brock, JW and Dunson, DB and Longnecker,
MP},
Title = {Maternal serum levels of polychlorinated biphenyls and
1,1-dichloro-2,2-bis(p-chlorophenyl)ethylene (DDE) and time
to pregnancy.},
Journal = {American journal of epidemiology},
Volume = {162},
Number = {6},
Pages = {523-532},
Year = {2005},
Month = {September},
url = {http://dx.doi.org/10.1093/aje/kwi240},
Abstract = {Polychlorinated biphenyls (PCBs), once used widely in
transformers and other applications, and
1,1-dichloro-2,2-bis(p-chlorophenyl)ethylene (DDE), the main
metabolite of the pesticide 1,1,1-trichloro-2,2-bis(p-chlorophenyl)ethane
(DDT), are hormonally active agents. Changes in menstrual
cycle functioning associated with PCBs and DDE, and
increased odds of spontaneous abortion associated with DDE,
suggest that these compounds could affect fertility. The
authors investigated the association between PCB and DDE
exposure and time to pregnancy by using serum levels
measured in 390 pregnant women in the Collaborative
Perinatal Project enrolled at 12 study centers in the United
States from 1959 to 1965. They estimated adjusted
fecundability odds ratios by using Cox proportional hazards
modeling for discrete time data. Compared with time to
pregnancy for women in the lowest exposure category (PCBs <
1.24 µg/liter, DDE < 14 µg/liter), time to pregnancy
increased for women in the highest exposure category in
terms of both PCBs (fecundability odds ratio for PCBs ≥
5.00 µg/liter = 0.65, 95% confidence interval: 0.36,
1.18) and DDE (fecundability odds ratio for DDE ≥ 60
µg/liter = 0.65, 95% confidence interval: 0.32, 1.31).
Overall, time to pregnancy increased with increasing serum
PCB levels but was less suggestive of an association with
DDE. Both trends were imprecise and attenuated when
expressed on a lipid basis. Overall, evidence of an
association between PCB or DDE exposure and time to
pregnancy was weak and inconclusive.},
Doi = {10.1093/aje/kwi240},
Key = {fds257940}
}
@article{fds257941,
Author = {Peddada, SD and Dunson, DB and Tan, X},
Title = {Estimation of order-restricted means from correlated
data},
Journal = {Biometrika},
Volume = {92},
Number = {3},
Pages = {703-715},
Publisher = {Oxford University Press (OUP)},
Year = {2005},
Month = {September},
url = {http://dx.doi.org/10.1093/biomet/92.3.703},
Abstract = {In many applications, researchers are interested in
estimating the mean of a multivariate normal random vector
whose components are subject to order restrictions. Various
authors have demonstrated that the likelihood-based
methodology may perform poorly under certain conditions for
such problems. The problem is much harder when the
underlying covariance matrix is nondiagonal. In this paper a
simple iterative algorithm is introduced that can be used
for estimating the mean of a multivariate normal population
when the components are subject to any order restriction.
The proposed methodology is illustrated through an
application to human reproductive hormone data. © 2005
Biometrika Trust.},
Doi = {10.1093/biomet/92.3.703},
Key = {fds257941}
}
@article{fds258016,
Author = {Gueorguieva, RV},
Title = {Comments about Joint Modeling of Cluster Size and Binary and
Continuous Subunit-Specific Outcomes.},
Journal = {Biometrics},
Volume = {61},
Number = {3},
Pages = {862-866},
Year = {2005},
Month = {September},
ISSN = {0006-341X},
url = {http://dx.doi.org/10.1111/j.1541-0420.2005.00409_1.x},
Abstract = {In longitudinal studies and in clustered situations, binary
and continuous response variables are often observed and
need to be modeled together. In a recent publication, Dunson,
Chen, and Harry (2003, Biometrics 59, 521-530) (DCH) propose
a Bayesian approach for joint modeling of cluster size and
binary and continuous subunit-specific outcomes and
illustrate this approach with a developmental toxicity data
example. In this note we demonstrate how standard software
(PROC NLMIXED in SAS) can be used to obtain maximum
likelihood estimates in an alternative parameterization of
the model with a single cluster-level factor considered by
DCH for that example. We also suggest that a more general
model with additional cluster-level random effects provides
a better fit to the data set. An apparent discrepancy
between the estimates obtained by DCH and the estimates
obtained earlier by Catalano and Ryan (1992, Journal of the
American Statistical Association 87, 651-658) is also
resolved. The issue of bias in inferences concerning the
dose effect when cluster size is ignored is discussed. The
maximum-likelihood approach considered herein is applicable
to general situations with multiple clustered or
longitudinally measured outcomes of different type and does
not require prior specification and extensive
programming.},
Doi = {10.1111/j.1541-0420.2005.00409_1.x},
Key = {fds258016}
}
@article{fds257933,
Author = {Dunson, DB},
Title = {Bayesian Biostatistics},
Journal = {Handbook of Statistics},
Volume = {25},
Pages = {743-761},
Publisher = {Elsevier},
Year = {2005},
Month = {December},
ISSN = {0169-7161},
url = {http://dx.doi.org/10.1016/S0169-7161(05)25025-3},
Abstract = {With the rapid increase in biomedical technology and the
accompanying generation of complex and high-dimensional data
sets, Bayesian statistical methods have become much more
widely used. One reason is that the Bayesian probability
modeling machinery provides a natural framework for
integration of data and information from multiple sources,
while accounting for uncertainty in model specifications.
This chapter briefly reviews some of the recent areas in
which Bayesian biostatistical research has had the greatest
impact. Particular areas of focus include correlated and
longitudinal data analysis, event time data, nonlinear
modeling, model averaging, and bioinformatics. The reader is
referred elsewhere for recent Bayesian developments in other
important areas, such as clinical trials and analysis of
spatially correlated data. Certainly the many practical and
conceptual advantages of the Bayesian paradigm will lead to
an increasing impact in future biomedical research,
particularly in areas such as genomics. © 2005 Elsevier
B.V. All rights reserved.},
Doi = {10.1016/S0169-7161(05)25025-3},
Key = {fds257933}
}
@article{fds257942,
Author = {Hans, C and Dunson, DB},
Title = {Bayesian inferences on umbrella orderings.},
Journal = {Biometrics},
Volume = {61},
Number = {4},
Pages = {1018-1026},
Year = {2005},
Month = {December},
ISSN = {0006-341X},
url = {http://www.ncbi.nlm.nih.gov/pubmed/16401275},
Abstract = {In regression applications with categorical predictors,
interest often focuses on comparing the null hypothesis of
homogeneity to an ordered alternative. This article proposes
a Bayesian approach for addressing this problem in the
setting of normal linear and probit regression models. The
regression coefficients are assigned a conditionally
conjugate prior density consisting of mixtures of point
masses at 0 and truncated normal densities, with a (possibly
unknown) changepoint parameter included to accommodate
umbrella ordering. Two strategies of prior elicitation are
considered: (1) a Bayesian Bonferroni approach in which the
probability of the global null hypothesis is specified and
local hypotheses are considered independent; and (2) an
approach which treats these probabilities as random. A
single Gibbs sampling chain can be used to obtain posterior
probabilities for the different hypotheses and to estimate
regression coefficients and predictive quantities either by
model averaging or under the preferred hypothesis. The
methods are applied to data from a carcinogenesis
study.},
Doi = {10.1111/j.1541-0420.2005.00373.x},
Key = {fds257942}
}
@article{fds258019,
Author = {Chen, Z and Dunson, DB},
Title = {The authors replied as follows [2]},
Journal = {Biometrics},
Volume = {62},
Number = {2},
Pages = {623-624},
Publisher = {WILEY},
Year = {2006},
Month = {January},
ISSN = {0006-341X},
url = {http://dx.doi.org/10.1111/j.1541-0420.2006.00586_2.x},
Doi = {10.1111/j.1541-0420.2006.00586_2.x},
Key = {fds258019}
}
@article{fds257943,
Author = {Baird, DD and Kesner, JS and Dunson, DB},
Title = {Luteinizing hormone in premenopausal women may stimulate
uterine leiomyomata development.},
Journal = {Journal of the Society for Gynecologic Investigation},
Volume = {13},
Number = {2},
Pages = {130-135},
Year = {2006},
Month = {February},
ISSN = {1071-5576},
url = {http://dx.doi.org/10.1016/j.jsgi.2005.12.001},
Abstract = {Objective: Human chorionic gonadotropin (hCG) has
proliferative effects on uterine smooth muscle and leiomyoma
tissue in vitro. We hypothesized that luteinizing hormone
(LH) would have the same effect by activating the LH/hCG
receptor, and it would follow that premenopausal women with
higher basal LH levels would be more likely to have
leiomyomata. Methods: Randomly selected women, aged 35 to 49
years, from a prepaid health plan were screened for
leiomyomata with pelvic ultrasound. Urine samples collected
during the first or last 5 days of the menstrual cycle were
analyzed for LH by immunofluorometric assay, and
concentrations were corrected for creatinine (n = 523).
Logistic regression and Bayes analyses were used to evaluate
the association of LH with presence and size of leiomyomata,
adjusting for age and other risk factors. Results: Women
with higher LH were more likely to have leiomyomata
(adjusted odds ratios for the second and third tertiles were
1.7 and 2.0 compared with the lowest tertile; 95% confidence
intervals, 1.0 to 2.7 and 1.2 to 3.4, respectively). The
association was stronger for large leiomyomata. Bayes
analyses designed to estimate LH effects on tumor onset
separately from tumor growth showed significantly
accelerated tumor onset but little evidence of effects on
tumor growth. Age, an independent risk factor for
leiomyomata, was not affected by inclusion of LH in the
logistic models. Conclusions: As hypothesized, women with
higher LH were more likely to have leiomyomata, but this did
not explain the age-related increase in leiomyomata during
perimenopausal ages. Determining whether LH is causal or a
marker for susceptibility will require further
research.},
Doi = {10.1016/j.jsgi.2005.12.001},
Key = {fds257943}
}
@article{fds258017,
Author = {Scarpa, B and Dunson, DB and Colombo, B},
Title = {Cervical mucus secretions on the day of intercourse: an
accurate marker of highly fertile days.},
Journal = {European journal of obstetrics, gynecology, and reproductive
biology},
Volume = {125},
Number = {1},
Pages = {72-78},
Year = {2006},
Month = {March},
url = {http://dx.doi.org/10.1016/j.ejogrb.2005.07.024},
Abstract = {Objective: To provide estimates of the probabilities
of conception according to vulvar mucus observations
classified by the woman on the day of intercourse. Study
design: Prospective cohort study of 193 outwardly healthy
Italian women using the Billings Ovulation Method. Outcome
measures include 161 conception cycles and 2594
non-conception cycles with daily records of the type of
mucus and the occurrences of sexual intercourse. Results:
The probability of conception ranged from 0.003 for days
with no noticeable secretions to 0.29 for days with most
fertile-type mucus detected by the woman. The probability of
most fertile-type mucus by day of the menstrual cycle
increased from values <20% outside of days 10-17 to a peak
of 59% on day 13. Conclusion: Regardless of the timing of
intercourse in the menstrual cycle, the probability of
conception is essentially 0 on days with no secretions. This
probability increases dramatically to near 30% on days with
most fertile-type mucus, an association that accurately
predicts both the timing of the fertile interval and the
day-specific conception probabilities across the menstrual
cycle.},
Doi = {10.1016/j.ejogrb.2005.07.024},
Key = {fds258017}
}
@article{fds258018,
Author = {Dunson, DB},
Title = {Special issue of statistical methods in medical research on
reproductive studies},
Journal = {Statistical Methods in Medical Research},
Volume = {15},
Number = {2},
Pages = {91-92},
Publisher = {SAGE Publications},
Year = {2006},
Month = {April},
ISSN = {0962-2802},
url = {http://dx.doi.org/10.1191/0962280206sm432ed},
Doi = {10.1191/0962280206sm432ed},
Key = {fds258018}
}
@article{fds258070,
Author = {Cai, B and Dunson, DB},
Title = {Bayesian covariance selection in generalized linear mixed
models.},
Journal = {Biometrics},
Volume = {62},
Number = {2},
Pages = {446-457},
Year = {2006},
Month = {June},
ISSN = {0006-341X},
url = {http://ftp.stat.duke.edu/WorkingPapers/05-01.html},
Abstract = {The generalized linear mixed model (GLMM), which extends the
generalized linear model (GLM) to incorporate random effects
characterizing heterogeneity among subjects, is widely used
in analyzing correlated and longitudinal data. Although
there is often interest in identifying the subset of
predictors that have random effects, random effects
selection can be challenging, particularly when outcome
distributions are nonnormal. This article proposes a fully
Bayesian approach to the problem of simultaneous selection
of fixed and random effects in GLMMs. Integrating out the
random effects induces a covariance structure on the
multivariate outcome data, and an important problem that we
also consider is that of covariance selection. Our approach
relies on variable selection-type mixture priors for the
components in a special Cholesky decomposition of the random
effects covariance. A stochastic search MCMC algorithm is
developed, which relies on Gibbs sampling, with Taylor
series expansions used to approximate intractable integrals.
Simulated data examples are presented for different
exponential family distributions, and the approach is
applied to discrete survival data from a time-to-pregnancy
study.},
Doi = {10.1111/j.1541-0420.2005.00499.x},
Key = {fds258070}
}
@article{fds258020,
Author = {O'Brien, SM and Kupper, LL and Dunson, DB},
Title = {Performance of tests of association in misspecified
generalized linear models},
Journal = {Journal of Statistical Planning and Inference},
Volume = {136},
Number = {9},
Pages = {3090-3100},
Publisher = {Elsevier BV},
Year = {2006},
Month = {September},
ISSN = {0378-3758},
url = {http://dx.doi.org/10.1016/j.jspi.2004.12.004},
Abstract = {We examine the effects of modelling errors, such as
underfitting and overfitting, on the asymptotic power of
tests of association between an explanatory variable x and
an outcome in the setting of generalized linear models. The
regression function for x is approximated by a polynomial or
another simple function, and a chi-square statistic is used
to test whether the coefficients of the approximation are
simultaneously equal to zero. Adding terms to the
approximation increases asymptotic power if and only if the
fit of the model increases by a certain quantifiable amount.
Although a high degree of freedom approximation offers
robustness to the shape of the unknown regression function,
a low degree of freedom approximation can yield much higher
asymptotic power even when the approximation is very poor.
In practice, it is useful to compute the power of competing
test statistics across the range of alternatives that are
plausible a priori. This approach is illustrated through an
application in epidemiology. © 2006 Elsevier B.V. All
rights reserved.},
Doi = {10.1016/j.jspi.2004.12.004},
Key = {fds258020}
}
@article{fds258073,
Author = {Dunson, DB},
Title = {Bayesian dynamic modeling of latent trait
distributions.},
Journal = {Biostatistics (Oxford, England)},
Volume = {7},
Number = {4},
Pages = {551-568},
Year = {2006},
Month = {October},
ISSN = {1465-4644},
url = {http://dx.doi.org/10.1093/biostatistics/kxj025},
Abstract = {Studies of latent traits often collect data for multiple
items measuring different aspects of the trait. For such
data, it is common to consider models in which the different
items are manifestations of a normal latent variable, which
depends on covariates through a linear regression model.
This article proposes a flexible Bayesian alternative in
which the unknown latent variable density can change
dynamically in location and shape across levels of a
predictor. Scale mixtures of underlying normals are used in
order to model flexibly the measurement errors and allow
mixed categorical and continuous scales. A dynamic mixture
of Dirichlet processes is used to characterize the latent
response distributions. Posterior computation proceeds via a
Markov chain Monte Carlo algorithm, with predictive
densities used as a basis for inferences and evaluation of
model fit. The methods are illustrated using data from a
study of DNA damage in response to oxidative
stress.},
Doi = {10.1093/biostatistics/kxj025},
Key = {fds258073}
}
@article{fds258021,
Author = {Stanford, JB and Dunson, DB},
Title = {Foreword. Expanding Methodologies for Capturing Day-Specific
Probabilities of Conception.},
Journal = {Paediatric and perinatal epidemiology},
Volume = {20 Suppl 1},
Pages = {1-2},
Year = {2006},
Month = {November},
ISSN = {0269-5022},
url = {http://dx.doi.org/10.1111/j.1365-3016.2006.00764.x},
Doi = {10.1111/j.1365-3016.2006.00764.x},
Key = {fds258021}
}
@article{fds258023,
Author = {Scarpa, B and Dunson, DB},
Title = {Bayesian selection of predictors of conception probabilities
across the menstrual cycle.},
Journal = {Paediatric and perinatal epidemiology},
Volume = {20 Suppl 1},
Number = {SUPPL. 1},
Pages = {30-37},
Year = {2006},
Month = {November},
ISSN = {0269-5022},
url = {http://dx.doi.org/10.1111/j.1365-3016.2006.00768.x},
Abstract = {There is increasing interest in identifying predictors of
human fertility, including environmental exposures,
behavioural factors, and biomarkers, such as mucus or
reproductive hormones. Epidemiological studies typically
measure fecundability, the per menstrual cycle probability
of conception, using time to pregnancy data. A critical
predictor, which is often ignored in the design or analysis,
is the timing of non-contracepting intercourse in the
menstrual cycle. In order to limit confounding by
behavioural differences between exposure groups, it may be
preferable to base inferences on day-specific conception
probabilities in relation to intercourse timing. This
article proposes Bayesian methods for selection of
predictors of day-specific conception probabilities. A
particular focus is the case in which data on ovulation
timing are not available. We focus on the selection of
fertile days in the cycle during which conception
probabilities are non-negligible and predictors may play a
role. Data from recent European and Italian prospective
studies of daily fecundability are presented, and the
proposed approach is used to estimate cervical mucus effects
within a mid-cycle potentially fertile window using data
from the Italian study.},
Doi = {10.1111/j.1365-3016.2006.00768.x},
Key = {fds258023}
}
@article{fds258069,
Author = {Pennell, ML and Dunson, DB},
Title = {Bayesian semiparametric dynamic frailty models for multiple
event time data.},
Journal = {Biometrics},
Volume = {62},
Number = {4},
Pages = {1044-1052},
Year = {2006},
Month = {December},
ISSN = {0006-341X},
url = {http://ftp.stat.duke.edu/WorkingPapers/04-27.html},
Abstract = {Many biomedical studies collect data on times of occurrence
for a health event that can occur repeatedly, such as
infection, hospitalization, recurrence of disease, or tumor
onset. To analyze such data, it is necessary to account for
within-subject dependency in the multiple event times.
Motivated by data from studies of palpable tumors, this
article proposes a dynamic frailty model and Bayesian
semiparametric approach to inference. The widely used shared
frailty proportional hazards model is generalized to allow
subject-specific frailties to change dynamically with age
while also accommodating nonproportional hazards. Parametric
assumptions on the frailty distribution are avoided by using
Dirichlet process priors for a shared frailty and for
multiplicative innovations on this frailty. By centering the
semiparametric model on a conditionally conjugate dynamic
gamma model, we facilitate posterior computation and
lack-of-fit assessments of the parametric model. Our
proposed method is demonstrated using data from a cancer
chemoprevention study.},
Doi = {10.1111/j.1541-0420.2006.00571.x},
Key = {fds258069}
}
@article{fds258067,
Author = {Kinney, S and Dunson, DB},
Title = {Fixed and random effects selection in linear and logistic
models},
Journal = {Biometrics},
Volume = {63},
Number = {3},
Pages = {690-698},
Year = {2007},
ISSN = {0006-341X},
url = {http://www.ncbi.nlm.nih.gov/pubmed/17403104},
Abstract = {We address the problem of selecting which variables should
be included in the fixed and random components of logistic
mixed effects models for correlated data. A fully Bayesian
variable selection is implemented using a stochastic search
Gibbs sampler to estimate the exact model-averaged posterior
distribution. This approach automatically identifies subsets
of predictors having nonzero fixed effect coefficients or
nonzero random effects variance, while allowing uncertainty
in the model selection process. Default priors are proposed
for the variance components and an efficient parameter
expansion Gibbs sampler is developed for posterior
computation. The approach is illustrated using simulated
data and an epidemiologic example.},
Doi = {10.1111/j.1541-0420.2007.00771.x},
Key = {fds258067}
}
@article{fds257944,
Author = {Baird, DD and Dunson, DB and Hill, MC and Cousins, D and Schectman,
JM},
Title = {Association of physical activity with development of uterine
leiomyoma.},
Journal = {American journal of epidemiology},
Volume = {165},
Number = {2},
Pages = {157-163},
Year = {2007},
Month = {January},
ISSN = {0002-9262},
url = {http://dx.doi.org/10.1093/aje/kwj363},
Abstract = {The relation between physical activity and uterine
leiomyomata (fibroids) has received little study, but
exercise is protective for breast cancer, another hormonally
mediated tumor. Participants in this study were randomly
selected members of a health plan based in Washington, DC,
aged 35-49 years (734 African Americans, 455 Whites)
enrolled between 1996 and 1999. Fibroid status was based on
ultrasound screening. Physical activity was based on
detailed interview questions. Logistic regression with
adjustment for body mass index and other risk factors showed
that women in the highest category of physical activity were
significantly less likely to have fibroids (odds ratio =
0.6, 95% confidence interval = 0.4, 0.9 for the highest vs.
the lowest category (equivalent to approximately ≥7
hours/week vs. <2 hours/week)). There was a dose-response
pattern; a significant trend was seen for both
African-American and White women. A multistate Bayesian
analysis indicated that exercise was associated with tumor
onset more strongly than with tumor growth. When data for
women who reported major fibroid-related symptoms were
excluded, results remained essentially unchanged, suggesting
that the observed association could not be attributed to
reverse causation (fibroids preventing exercise). The
authors concluded that regular exercise might help women
prevent fibroids.},
Doi = {10.1093/aje/kwj363},
Key = {fds257944}
}
@article{fds257945,
Author = {MacLehose, RF and Dunson, DB and Herring, AH and Hoppin,
JA},
Title = {Bayesian methods for highly correlated exposure
data.},
Journal = {Epidemiology (Cambridge, Mass.)},
Volume = {18},
Number = {2},
Pages = {199-207},
Year = {2007},
Month = {March},
ISSN = {1044-3983},
url = {http://dx.doi.org/10.1097/01.ede.0000256320.30737.c0},
Abstract = {Studies that include individuals with multiple highly
correlated exposures are common in epidemiology. Because
standard maximum likelihood techniques often fail to
converge in such instances, hierarchical regression methods
have seen increasing use. Bayesian hierarchical regression
places prior distributions on exposure-specific regression
coefficients to stabilize estimation and incorporate prior
knowledge, if available. A common parametric approach in
epidemiology is to treat the prior mean and variance as
fixed constants. An alternative parametric approach is to
place distributions on the prior mean and variance to allow
the data to help inform their values. As a more flexible
semiparametric option, one can place an unknown distribution
on the coefficients that simultaneously clusters exposures
into groups using a Dirichlet process prior. We also present
a semiparametric model with a variable-selection prior to
allow clustering of coefficients at 0. We compare these 4
hierarchical regression methods and demonstrate their
application in an example estimating the association of
herbicides with retinal degeneration among wives of
pesticide applicators.},
Doi = {10.1097/01.ede.0000256320.30737.c0},
Key = {fds257945}
}
@article{fds257946,
Author = {Scarpa, B and Dunson, DB},
Title = {Bayesian methods for searching for optimal rules for timing
intercourse to achieve pregnancy.},
Journal = {Statistics in medicine},
Volume = {26},
Number = {9},
Pages = {1920-1936},
Year = {2007},
Month = {April},
ISSN = {0277-6715},
url = {http://dx.doi.org/10.1002/sim.2846},
Abstract = {With societal trends towards increasing age at starting a
pregnancy attempt, many women are concerned about achieving
conception before the onset of infertility, which precedes
menopause. Couples failing to conceive a pregnancy within 12
months are classified as clinically infertile, and may be
recommended for assisted reproductive therapy (ART). Because
many ART procedures are expensive and may convey an
increased risk of adverse outcomes for the offspring, it is
advantageous to decrease time to pregnancy by natural
methods. One possibility is to intentionally time
intercourse during the days of the menstrual cycle having
the highest conception probabilities. This article proposes
a Bayesian decision theoretic approach for searching for
optimal rules for timing intercourse based on cycle day,
secretions and other information. Good rules result in high
conception probabilities while requiring minimal targeted
intercourse. A biologically based statistical model is used
to relate cycle day and biomarkers to the conception
probability. A stochastic search procedure is then developed
to search for rules with high expected utility, and the
methods are applied to data from a recent Italian
study.},
Doi = {10.1002/sim.2846},
Key = {fds257946}
}
@article{fds258068,
Author = {Dunson, DB and Pillai, N and Park, JH},
Title = {Bayesian density regression},
Journal = {Journal of the Royal Statistical Society. Series B:
Statistical Methodology},
Volume = {69},
Number = {2},
Pages = {163-183},
Publisher = {WILEY},
Year = {2007},
Month = {April},
ISSN = {1369-7412},
url = {http://dx.doi.org/10.1111/j.1467-9868.2007.00582.x},
Abstract = {The paper considers Bayesian methods for density regression,
allowing a random probability distribution to change
flexibly with multiple predictors. The conditional response
distribution is expressed as a non-parametric mixture of
regression models, with the mixture distribution changing
with predictors. A class of weighted mixture of Dirichlet
process priors is proposed for the uncountable collection of
mixture distributions. It is shown that this specification
results in a generalized Pólya urn scheme, which
incorporates weights that are dependent on the distance
between subjects' predictor values. To allow local
dependence in the mixture distributions, we propose a
kernel-based weighting scheme. A Gibbs sampling algorithm is
developed for posterior computation. The methods are
illustrated by using simulated data examples and an
epidemiologic application. © Royal Statistical
Society.},
Doi = {10.1111/j.1467-9868.2007.00582.x},
Key = {fds258068}
}
@article{fds258071,
Author = {Dunson, DB},
Title = {Empirical bayes density regression},
Journal = {Statistica Sinica},
Volume = {17},
Number = {2},
Pages = {481-504},
Year = {2007},
Month = {April},
ISSN = {1017-0405},
Abstract = {In Bayesian hierarchical modeling, it is often appealing to
allow the conditional density of an (observable or
unobservable) random variable Y to change flexibly with
categorical and continuous predictors X. A mixture of
regression models is proposed, with the mixture distribution
varying with X. When the smoothing parameters and the number
of mixture components are treated as unknown, the MLE does
not exist, motivating an empirical Bayes approach. The proposed method
shrinks the spatially-adaptive mixture distributions to a
common baseline, while penalizing rapid changes and large
numbers of components. The discrete form of the mixture
distribution facilitates flexible classification of
subjects. A Gibbs sampling algorithm is developed, which
embeds a Monte Carlo EM-type stage to estimate smoothing and
hyper-parameters. The method is applied to simulated
examples and data from an epidemiologic study.},
Key = {fds258071}
}
@article{fds257947,
Author = {Stanford, JB and Dunson, DB},
Title = {Effects of sexual intercourse patterns in time to pregnancy
studies.},
Journal = {American journal of epidemiology},
Volume = {165},
Number = {9},
Pages = {1088-1095},
Year = {2007},
Month = {May},
ISSN = {0002-9262},
url = {http://dx.doi.org/10.1093/aje/kwk111},
Abstract = {Time to pregnancy, typically defined as the number of
menstrual cycles required to achieve a clinical pregnancy,
is widely used as a measure of couple fecundity in
epidemiologic studies. Time to pregnancy studies seldom
utilize detailed data on the timing and frequency of sexual
intercourse and the timing of ovulation. However, the
simulated models in this paper illustrate that intercourse
behavior can have a large impact on time to pregnancy and,
likewise, on fecundability ratios, especially under
conditions of low intercourse frequency or low fecundity.
Because intercourse patterns in the menstrual cycles may
vary substantially among groups, it is important to consider
the effects of sexual behavior. Where relevant and feasible,
an assessment should be made of the timing and frequency of
intercourse relative to ovulation. Day-specific
probabilities of pregnancy can be used to account for the
effects of intercourse patterns. Depending on the research
hypothesis, intercourse patterns may be considered as a
potential confounder, mediator, or outcome.},
Doi = {10.1093/aje/kwk111},
Key = {fds257947}
}
@article{fds257979,
Author = {Xue, Y and Dunson, D and Carin, L},
Title = {The matrix stick-breaking process for flexible multi-task
learning},
Journal = {ACM International Conference Proceeding Series},
Volume = {227},
Pages = {1063-1070},
Publisher = {ACM Press},
Year = {2007},
Month = {August},
url = {http://dx.doi.org/10.1145/1273496.1273630},
Abstract = {In multi-task learning our goal is to design regression or
classification models for each of the tasks and
appropriately share information between tasks. A Dirichlet
process (DP) prior can be used to encourage task clustering.
However, the DP prior does not allow local clustering of
tasks with respect to a subset of the feature vector without
making independence assumptions. Motivated by this problem,
we develop a new multitask-learning prior, termed the matrix
stick-breaking process (MSBP), which encourages cross-task
sharing of data. However, the MSBP allows separate
clustering and borrowing of information for the different
feature components. This is important when tasks are more
closely related for certain features than for others.
Bayesian inference proceeds by a Gibbs sampling algorithm
and the approach is illustrated using a simulated example
and a multi-national application.},
Doi = {10.1145/1273496.1273630},
Key = {fds257979}
}
@article{fds257980,
Author = {Ni, K and Carin, L and Dunson, D},
Title = {Multi-task learning for sequential data via iHMMs and the
nested Dirichlet process},
Journal = {ACM International Conference Proceeding Series},
Volume = {227},
Pages = {689-696},
Publisher = {ACM Press},
Year = {2007},
Month = {August},
url = {http://dx.doi.org/10.1145/1273496.1273583},
Abstract = {A new hierarchical nonparametric Bayesian model is proposed
for the problem of multitask learning (MTL) with sequential
data. Sequential data are typically modeled with a hidden
Markov model (HMM), for which one often must choose an
appropriate model structure (number of states) before
learning. Here we model sequential data from each task with
an infinite hidden Markov model (iHMM), avoiding the problem
of model selection. The MTL for iHMMs is implemented by
imposing a nested Dirichlet process (nDP) prior on the base
distributions of the iHMMs. The nDP-iHMM MTL method allows
us to perform task-level clustering and data-level
clustering simultaneously, with which the learning for
individual iHMMs is enhanced and between-task similarities
are learned. Learning and inference for the nDP-iHMM MTL are
based on a Gibbs sampler. The effectiveness of the framework
is demonstrated using synthetic data as well as real music
data.},
Doi = {10.1145/1273496.1273583},
Key = {fds257980}
}
@article{fds258072,
Author = {Bigelow, JL and Dunson, DB},
Title = {Bayesian adaptive regression splines for hierarchical
data.},
Journal = {Biometrics},
Volume = {63},
Number = {3},
Pages = {724-732},
Year = {2007},
Month = {September},
ISSN = {0006-341X},
url = {http://www.ncbi.nlm.nih.gov/pubmed/17403106},
Abstract = {This article considers methodology for hierarchical
functional data analysis, motivated by studies of
reproductive hormone profiles in the menstrual cycle.
Current methods standardize the cycle lengths and ignore the
timing of ovulation within the cycle, both of which are
biologically informative. Methods are needed that avoid
standardization, while flexibly incorporating information on
covariates and the timing of reference events, such as
ovulation and onset of menses. In addition, it is necessary
to account for within-woman dependency when data are
collected for multiple cycles. We propose an approach based
on a hierarchical generalization of Bayesian multivariate
adaptive regression splines. Our formulation allows for an
unknown set of basis functions characterizing the
population-averaged and woman-specific trajectories in
relation to covariates. A reversible jump Markov chain Monte
Carlo algorithm is developed for posterior computation.
Applying the methods to data from the North Carolina Early
Pregnancy Study, we investigate differences in urinary
progesterone profiles between conception and nonconception
cycles.},
Doi = {10.1111/j.1541-0420.2007.00761.x},
Key = {fds258072}
}
@article{fds257948,
Author = {Dunson, DB},
Title = {Bayesian methods for latent trait modelling of longitudinal
data.},
Journal = {Statistical methods in medical research},
Volume = {16},
Number = {5},
Pages = {399-415},
Year = {2007},
Month = {October},
ISSN = {0962-2802},
url = {http://dx.doi.org/10.1177/0962280206075309},
Abstract = {Latent trait models have long been used in the social
science literature for studying variables that can only be
measured indirectly through multiple items. However, such
models are also very useful in accounting for correlation in
multivariate and longitudinal data, particularly when
outcomes have mixed measurement scales. Bayesian methods
implemented with Markov chain Monte Carlo provide a flexible
framework for routine fitting of a broad class of latent
variable (LV) models, including very general structural
equation models. However, in considering LV models, a number
of challenging issues arise, including identifiability,
confounding between the mean and variance, uncertainty in
different aspects of the model, and difficulty in
computation. Motivated by the problem of modelling
multidimensional longitudinal data, this article reviews the
recent literature, provides some recommendations and
highlights areas in need of additional research, focusing on
methods for model uncertainty.},
Doi = {10.1177/0962280206075309},
Key = {fds257948}
}
@article{fds258061,
Author = {Scarpa, B and Dunson, DB and Giacchi, E},
Title = {Bayesian selection of optimal rules for timing intercourse
to conceive by using calendar and mucus.},
Journal = {Fertility and sterility},
Volume = {88},
Number = {4},
Pages = {915-924},
Year = {2007},
Month = {October},
ISSN = {0015-0282},
url = {http://dx.doi.org/10.1016/j.fertnstert.2006.12.017},
Abstract = {Objective: To find optimal clinical rules that
maximize the probability of conception while limiting the
number of intercourse days required. Design: Multicenter
prospective study. Women were followed prospectively while
they kept daily records of menstrual bleeding, intercourse,
and mucus symptom characteristics. In some cycles, women
sought to conceive, whereas in other cycles, they sought to
avoid pregnancy. Setting: Four centers providing services on
fertility awareness. Patient(s): One hundred ninety-one
healthy women using the Billings Ovulation Method. Women
were invited to enroll by their instructors if they
satisfied the entry criteria. We excluded cycles in which
mucus was not recorded on a day with intercourse.
Intervention(s): None. Main outcome measure(s): Clinically
identified pregnancies. There were 161 clinically identified
pregnancies in 2,536 menstrual cycles from 191 women.
Result(s): Our approach relies on a statistical model that
relates daily predictors, such as type of mucus symptom, to
the day-specific probabilities of conception. By using
Bayesian methods to search over a large set of possible
clinical rules, focusing on rules based on calendar and
mucus, we found that simple rules based on days within the
midcycle calendar interval that also have the most
fertile-type mucus symptom present have high utility.
Conclusion(s): Couples can shorten their time to pregnancy
efficiently by timing intercourse on days when the most
fertile-type mucus symptom is observed at the
vulva.},
Doi = {10.1016/j.fertnstert.2006.12.017},
Key = {fds258061}
}
@article{fds258063,
Author = {Pennell, ML and Dunson, DB},
Title = {Fitting semiparametric random effects models to large data
sets.},
Journal = {Biostatistics (Oxford, England)},
Volume = {8},
Number = {4},
Pages = {821-834},
Year = {2007},
Month = {October},
ISSN = {1465-4644},
url = {http://dx.doi.org/10.1093/biostatistics/kxm008},
Abstract = {For large data sets, it can be difficult or impossible to
fit models with random effects using standard algorithms due
to memory limitations or high computational burdens. In
addition, it would be advantageous to use the abundant
information to relax assumptions, such as normality of
random effects. Motivated by data from an epidemiologic
study of childhood growth, we propose a 2-stage method for
fitting semiparametric random effects models to longitudinal
data with many subjects. In the first stage, we use a
multivariate clustering method to identify G << N groups of
subjects whose data have no scientifically important
differences, as defined by subject matter experts. Then, in
stage 2, group-specific random effects are assumed to come
from an unknown distribution, which is assigned a Dirichlet
process prior, further clustering the groups from stage 1.
We use our approach to model the effects of maternal smoking
during pregnancy on growth in 17,518 girls.},
Doi = {10.1093/biostatistics/kxm008},
Key = {fds258063}
}
@article{fds257866,
Author = {Palomo, J and Dunson, DB and Bollen, K},
Title = {Bayesian Structural Equation Modeling},
Pages = {163-188},
Publisher = {Elsevier},
Year = {2007},
Month = {December},
url = {http://dx.doi.org/10.1016/B978-044452044-9/50011-2},
Abstract = {This chapter focuses on Bayesian structural equation
modeling. Structural equation models (SEMs) with latent
variables are routinely used in social science research, and
are of increasing importance in biomedical applications.
Standard practice in implementing SEMs relies on frequentist
methods. The chapter provides a simple and concise
description of an alternative Bayesian approach. A
description of the Bayesian specification of SEMs, and an
outline of a Gibbs sampling strategy for model fitting is
also presented. Bayesian inferences are illustrated through
an industrialization and democratization case study. The
Bayesian approach has some distinct advantages, due to the
availability of samples from the joint posterior
distribution of the model parameters and latent variables,
which are highlighted in the chapter. These posterior
samples provide important information not contained in the
measurement and structural parameters. © 2007 Elsevier B.V.
All rights reserved.},
Doi = {10.1016/B978-044452044-9/50011-2},
Key = {fds257866}
}
@article{fds258062,
Author = {Cai, B and Dunson, DB},
Title = {Bayesian multivariate isotonic regression splines:
Applications to carcinogenicity studies},
Journal = {Journal of the American Statistical Association},
Volume = {102},
Number = {480},
Pages = {1158-1171},
Publisher = {Informa UK Limited},
Year = {2007},
Month = {December},
ISSN = {0162-1459},
url = {http://dx.doi.org/10.1198/016214506000000942},
Abstract = {In many applications, interest focuses on assessing the
relationship between a predictor and a multivariate outcome
variable, and there may be prior knowledge about the shape
of the regression curves. For example, regression functions
that relate dose of a possible risk factor to different
adverse outcomes can often be assumed to be nondecreasing.
In such cases, interest focuses on (1) assessing evidence of
an overall adverse effect, (2) determining which outcomes
are most affected, and (3) estimating outcome-specific
regression curves. This article proposes a Bayesian approach
for addressing this problem, motivated by multisite tumor
data from carcinogenicity experiments. A multivariate
smoothing spline model is specified, that accommodates
dependency in the multiple curves through a hierarchical
Markov random field prior for the basis coefficients, while
also allowing for residual correlation. A Gibbs sampler is
proposed for posterior computation, and the approach is
applied to data on body weight and tumor
occurrence.},
Doi = {10.1198/016214506000000942},
Key = {fds258062}
}
@article{fds258050,
Author = {Rodriguez, A and Dunson, DB and Gelfand, AE},
Title = {Nonparametric functional data analysis through Bayesian
density estimation},
Journal = {Biometrika},
Volume = {96},
Pages = {149-162},
Year = {2008},
Key = {fds258050}
}
@article{fds258058,
Author = {Rodriguez, A and Dunson, DB and Gelfand, AE},
Title = {The nested Dirichlet process (with discussion)},
Journal = {Journal of the American Statistical Association},
Year = {2008},
Key = {fds258058}
}
@article{fds258060,
Author = {Yang, M and Dunson, DB},
Title = {Bayesian semiparametric structural equation models with
latent variables},
Journal = {Psychometrika},
Volume = {75},
Number = {4},
Pages = {675-693},
Publisher = {Springer Nature},
Year = {2008},
ISSN = {0033-3123},
url = {http://dx.doi.org/10.1007/s11336-010-9174-4},
Abstract = {Structural equation models (SEMs) with latent variables are
widely useful for sparse covariance structure modeling and
for inferring relationships among latent variables. Bayesian
SEMs are appealing in allowing for the incorporation of
prior information and in providing exact posterior
distributions of unknowns, including the latent variables.
In this article, we propose a broad class of semiparametric
Bayesian SEMs, which allow mixed categorical and continuous
manifest variables while also allowing the latent variables
to have unknown distributions. In order to include typical
identifiability restrictions on the latent variable
distributions, we rely on centered Dirichlet process (CDP)
and CDP mixture (CDPM) models. The CDP will induce a latent
class model with an unknown number of classes, while the
CDPM will induce a latent trait model with unknown densities
for the latent traits. A simple and efficient Markov chain
Monte Carlo algorithm is developed for posterior
computation, and the methods are illustrated using simulated
examples, and several applications. © 2010 The Psychometric
Society.},
Doi = {10.1007/s11336-010-9174-4},
Key = {fds258060}
}
@article{fds257951,
Author = {Rodríguez, A and Dunson, DB and Gelfand, AE},
Title = {The nested dirichlet process},
Journal = {Journal of the American Statistical Association},
Volume = {103},
Number = {483},
Pages = {1131-1154},
Publisher = {Informa UK Limited},
Year = {2008},
Month = {January},
ISSN = {0162-1459},
url = {http://dx.doi.org/10.1198/016214508000000553},
Abstract = {In multicenter studies, subjects in different centers may
have different outcome distributions. This article is
motivated by the problem of nonparametric modeling of these
distributions, borrowing information across centers while
also allowing centers to be clustered. Starting with a
stick-breaking representation of the Dirichlet process (DP),
we replace the random atoms with random probability measures
drawn from a DP. This results in a nested DP prior, which
can be placed on the collection of distributions for the
different centers, with centers drawn from the same DP
component automatically clustered together. Theoretical
properties are discussed, and an efficient Markov chain
Monte Carlo algorithm is developed for computation. The
methods are illustrated using a simulation study and an
application to quality of care in U.S. hospitals.},
Doi = {10.1198/016214508000000553},
Key = {fds257951}
}
@article{fds257981,
Author = {Qi, Y and Liu, D and Dunson, D and Carin, L},
Title = {Multi-task compressive sensing with dirichlet process
priors},
Journal = {Proceedings of the 25th International Conference on Machine
Learning},
Pages = {768-775},
Year = {2008},
Month = {January},
url = {http://dx.doi.org/10.1145/1390156.1390253},
Abstract = {Compressive sensing (CS) is an emerging field that, under
appropriate conditions, can significantly reduce the number
of measurements required for a given signal. In many
applications, one is interested in multiple signals that may
be measured in multiple CS-type measurements, where
we propose a novel multitask compressive sensing framework
based on a Bayesian formalism, where a Dirichlet process
(DP) prior is employed, yielding a principled means of
simultaneously inferring the appropriate sharing mechanisms
as well as CS inversion for each task. A variational
Bayesian (VB) inference algorithm is employed to estimate
the full posterior on the model parameters. Copyright 2008
by the author(s)/owner(s).},
Doi = {10.1145/1390156.1390253},
Key = {fds257981}
}
@article{fds257982,
Author = {An, Q and Wang, C and Shterev, I and Wang, E and Carin, L and Dunson,
DB},
Title = {Hierarchical kernel stick-breaking process for multi-task
image analysis},
Journal = {Proceedings of the 25th International Conference on Machine
Learning},
Pages = {17-24},
Year = {2008},
Month = {January},
url = {http://dx.doi.org/10.1145/1390156.1390159},
Abstract = {The kernel stick-breaking process (KSBP) is employed to
segment general imagery, imposing the condition that patches
(small blocks of pixels) that are spatially proximate are
more likely to be associated with the same cluster
(segment). The number of clusters is not set a priori and is
inferred from the hierarchical Bayesian model. Further, KSBP
is integrated with a shared Dirichlet process prior to
simultaneously model multiple images, inferring their
inter-relationships. This latter application may be useful
for sorting and learning relationships between multiple
images. The Bayesian inference algorithm is based on a
hybrid of variational Bayesian analysis and local sampling.
In addition to providing details on the model and associated
inference framework, example results are presented for
several image-analysis problems. Copyright 2008 by the
author(s)/owner(s).},
Doi = {10.1145/1390156.1390159},
Key = {fds257982}
}
@article{fds257983,
Author = {Ren, L and Dunson, DB and Carin, L},
Title = {The dynamic hierarchical Dirichlet process},
Journal = {Proceedings of the 25th International Conference on Machine
Learning},
Pages = {824-831},
Year = {2008},
Month = {January},
url = {http://dx.doi.org/10.1145/1390156.1390260},
Abstract = {The dynamic hierarchical Dirichlet process (dHDP) is
developed to model the time-evolving statistical properties
of sequential data sets. The data collected at any time
point are represented via a mixture associated with an
appropriate underlying model, in the framework of HDP. The
statistical properties of data collected at consecutive time
points are linked via a random parameter that controls their
probabilistic similarity. The sharing mechanisms of the
time-evolving data are derived, and a relatively simple
Markov Chain Monte Carlo sampler is developed. Experimental
results are presented to demonstrate the model. Copyright
2008 by the author(s)/owner(s).},
Doi = {10.1145/1390156.1390260},
Key = {fds257983}
}
@article{fds258053,
Author = {Dunson, DB and Herring, A and Siega-Riz, AM},
Title = {Bayesian Inference on Changes in Response Densities over
Predictor Clusters.},
Journal = {Journal of the American Statistical Association},
Volume = {103},
Number = {484},
Pages = {1508-1517},
Publisher = {Informa UK Limited},
Year = {2008},
Month = {January},
ISSN = {0162-1459},
url = {http://dx.doi.org/10.1198/016214508000001039},
Abstract = {In epidemiology, it is often of interest to assess how
individuals with different trajectories over time in an
environmental exposure or biomarker differ with respect to a
continuous response. For ease in interpretation and
presentation of results, epidemiologists typically
categorize predictors prior to analysis. To extend this
approach to time-varying predictors, one can cluster
individuals by their predictor trajectory, with the cluster
index included as a predictor in a regression model for the
response. This article develops a semiparametric Bayes
approach, which avoids assuming a pre-specified number of
clusters and allows the response to vary nonparametrically
over predictor clusters. This methodology is motivated by
interest in relating trajectories in weight gain during
pregnancy to the distribution of birth weight adjusted for
gestational age at delivery. In this setting, the proposed
approach allows the tails of the birth weight density to
vary flexibly over weight gain clusters.},
Doi = {10.1198/016214508000001039},
Key = {fds258053}
}
@article{fds257949,
Author = {Dunson, DB},
Title = {Comment},
Journal = {Journal of the American Statistical Association},
Volume = {103},
Number = {481},
Pages = {40-41},
Publisher = {Informa UK Limited},
Year = {2008},
Month = {March},
ISSN = {0162-1459},
url = {http://dx.doi.org/10.1198/016214507000001436},
Doi = {10.1198/016214507000001436},
Key = {fds257949}
}
@article{fds258066,
Author = {Dunson, DB and Xue, Y and Carin, L},
Title = {The matrix stick-breaking process: Flexible Bayes
meta-analysis},
Journal = {Journal of the American Statistical Association},
Volume = {103},
Number = {481},
Pages = {317-327},
Publisher = {Informa UK Limited},
Year = {2008},
Month = {March},
ISSN = {0162-1459},
url = {http://dx.doi.org/10.1198/016214507000001364},
Abstract = {In analyzing data from multiple related studies, it often is
of interest to borrow information across studies and to
cluster similar studies. Although parametric hierarchical
models are commonly used, of concern is sensitivity to the
form chosen for the random-effects distribution. A Dirichlet
process (DP) prior can allow the distribution to be unknown,
while clustering studies; however, the DP does not allow
local clustering of studies with respect to a subset of the
coefficients without making independence assumptions.
Motivated by this problem, we propose a matrix
stick-breaking process (MSBP) as a prior for a matrix of
random probability measures. Properties of the MSBP are
considered, and methods are developed for posterior
computation using Markov chain Monte Carlo. Using the MSBP
as a prior for a matrix of study-specific regression
coefficients, we demonstrate advantages over parametric
modeling in simulated examples. The methods are further
illustrated using a multinational uterotrophic bioassay
study.},
Doi = {10.1198/016214507000001364},
Key = {fds258066}
}
@article{fds258059,
Author = {Dunson, DB and Herring, AH and Engel, SM},
Title = {Bayesian selection and clustering of polymorphisms in
functionally related genes},
Journal = {Journal of the American Statistical Association},
Volume = {103},
Number = {482},
Pages = {534-546},
Publisher = {Informa UK Limited},
Year = {2008},
Month = {June},
ISSN = {0162-1459},
url = {http://dx.doi.org/10.1198/016214507000000554},
Abstract = {In epidemiologic studies, there is often interest in
assessing the relationship between polymorphisms in
functionally related genes and a health outcome. For each
candidate gene, single nucleotide polymorphism (SNP) data
are collected at a number of locations, resulting in a large
number of possible genotypes. Because instabilities can
result in analyses that include all the SNPs, dimensionality
is typically reduced by conducting single SNP analyses or
attempting to identify haplotypes. This article proposes an
alternative Bayesian approach for reducing dimensionality. A
multilevel Dirichlet process prior is used for the
distribution of the SNP-specific regression coefficients
within genes, incorporating a variable selection-type
mixture structure to allow SNPs with no effect. This
structure allows simultaneous selection of important SNPs
and soft clustering of SNPs having similar impact on the
health outcome. The methods are illustrated using data from
a study of pro- and anti-inflammatory cytokine polymorphisms
and spontaneous preterm birth.},
Doi = {10.1198/016214507000000554},
Key = {fds258059}
}
@article{fds258064,
Author = {Pennell, ML and Dunson, DB},
Title = {Nonparametric bayes testing of changes in a response
distribution with an ordinal predictor.},
Journal = {Biometrics},
Volume = {64},
Number = {2},
Pages = {413-423},
Year = {2008},
Month = {June},
ISSN = {0006-341X},
url = {http://dx.doi.org/10.1111/j.1541-0420.2007.00885.x},
Abstract = {In certain biomedical studies, one may anticipate changes in
the shape of a response distribution across the levels of an
ordinal predictor. For instance, in toxicology studies,
skewness and modality might change as dose increases. To
address this issue, we propose a Bayesian nonparametric
method for testing for distribution changes across an
ordinal predictor. Using a dynamic mixture of Dirichlet
processes, we allow the response distribution to change
flexibly at each level of the predictor. In addition, by
assigning mixture priors to the hyperparameters, we can
obtain posterior probabilities of no effect of the predictor
and identify the lowest dose level for which there is an
appreciable change in distribution. The method also provides
a natural framework for performing tests across multiple
outcomes. We apply our method to data from a genotoxicity
experiment.},
Doi = {10.1111/j.1541-0420.2007.00885.x},
Key = {fds258064}
}
@article{fds258065,
Author = {Dunson, DB and Park, JH},
Title = {Kernel stick-breaking processes},
Journal = {Biometrika},
Volume = {95},
Number = {2},
Pages = {307-323},
Publisher = {Oxford University Press (OUP)},
Year = {2008},
Month = {June},
ISSN = {0006-3444},
url = {http://dx.doi.org/10.1093/biomet/asn012},
Abstract = {We propose a class of kernel stick-breaking processes for
uncountable collections of dependent random probability
measures. The process is constructed by first introducing an
infinite sequence of random locations. Independent random
probability measures and beta-distributed random weights are
assigned to each location. Predictor-dependent random
probability measures are then constructed by mixing over the
locations, with stick-breaking probabilities expressed as a
kernel multiplied by the beta weights. Some theoretical
properties of the process are described, including a
covariate-dependent prediction rule. A retrospective Markov
chain Monte Carlo algorithm is developed for posterior
computation, and the methods are illustrated using a
simulated example and an epidemiological application. © US
Government/Department of Health and Human Services
2008.},
Doi = {10.1093/biomet/asn012},
Key = {fds258065}
}
@article{fds257950,
Author = {Elliott, L and Henderson, J and Northstone, K and Chiu, GY and Dunson,
D and London, SJ},
Title = {Prospective study of breast-feeding in relation to wheeze,
atopy, and bronchial hyperresponsiveness in the Avon
Longitudinal Study of Parents and Children
(ALSPAC).},
Journal = {The Journal of allergy and clinical immunology},
Volume = {122},
Number = {1},
Pages = {49-54.e3},
Year = {2008},
Month = {July},
ISSN = {0091-6749},
url = {http://dx.doi.org/10.1016/j.jaci.2008.04.001},
Abstract = {Background: Breast-feeding clearly protects against
early wheezing, but recent data suggest that it might
increase later risk of atopic disease and asthma. Objective:
We sought to examine the relationship between breast-feeding
and later asthma and allergy outcomes by using data from the
Avon Longitudinal Study of Parents and Children, a large
birth cohort in the United Kingdom. Methods: We used
adjusted logistic regression models to evaluate the
association between breast-feeding and atopy at age 7 years,
bronchial responsiveness to methacholine at age 8 years, and
wheeze at ages 3 and 7 1/2 years. Bayesian methods were used
to assess the possibility of bias caused by an influence of
early wheezing on the duration of breast-feeding, as well as
selection bias. Results: Breast-feeding was protective for
wheeze in the first 3 years of life (odds ratio [OR] of 0.80
[95% CI, 0.70-0.90] for ≥6 months relative to never) but not
for wheeze (OR, 0.98; 95% CI, 0.79-1.22), atopy (OR, 1.12;
95% CI, 0.92-1.35), or bronchial hyperresponsiveness (OR,
1.07; 95% CI, 0.82-1.40) at ages 7 to 8 years. Bayesian
models adjusting for the longer duration of breast-feeding
among children with wheezing in early infancy produced
virtually identical results. Conclusions: We did not find
consistent evidence for either a deleterious effect or a
protective effect of breast-feeding on later risk of
allergic disease in a large prospective birth cohort of
children with objective outcome measures and extensive data
on potential confounders and effect modifiers. Neither
reverse causation nor loss to follow-up appears to have
materially biased our results.},
Doi = {10.1016/j.jaci.2008.04.001},
Key = {fds257950}
}
@article{fds258057,
Author = {Ni, K and Paisley, J and Carin, L and Dunson, D},
Title = {Multi-task learning for analyzing and sorting large
databases of sequential data},
Journal = {IEEE Transactions on Signal Processing},
Volume = {56},
Number = {8 II},
Pages = {3918-3931},
Publisher = {Institute of Electrical and Electronics Engineers
(IEEE)},
Year = {2008},
Month = {August},
ISSN = {1053-587X},
url = {http://dx.doi.org/10.1109/TSP.2008.924798},
Abstract = {A new hierarchical nonparametric Bayesian framework is
proposed for the problem of multi-task learning (MTL) with
sequential data. The models for multiple tasks, each
characterized by sequential data, are learned jointly, and
the intertask relationships are obtained simultaneously.
This MTL setting is used to analyze and sort large databases
composed of sequential data, such as music clips. Within
each data set, we represent the sequential data with an
infinite hidden Markov model (iHMM), avoiding the problem of
model selection (selecting a number of states). Across the
data sets, the multiple iHMMs are learned jointly in a MTL
setting, employing a nested Dirichlet process (nDP). The
nDP-iHMM MTL method allows simultaneous task-level and
data-level clustering, with which the individual iHMMs are
enhanced and the between-task similarities are learned.
Therefore, in addition to improved learning of each of the
models via appropriate data sharing, the learned sharing
mechanisms are used to infer interdata relationships of
interest for data search. Specifically, the MTL-learned
task-level sharing mechanisms are used to define the
affinity matrix in a graph-diffusion sorting framework. To
speed up the MCMC inference for large databases, the
nDP-iHMM is truncated to yield a nested Dirichlet-distribution
based HMM representation, which accommodates fast
variational Bayesian (VB) analysis for large-scale
inference, and the effectiveness of the framework is
demonstrated using a database composed of 2500 digital music
pieces. © 2008 IEEE.},
Doi = {10.1109/TSP.2008.924798},
Key = {fds258057}
}
@article{fds257952,
Author = {Rodríguez, A and Dunson, DB and Gelfand, AE},
Title = {The nested Dirichlet process: Rejoinder},
Journal = {Journal of the American Statistical Association},
Volume = {103},
Number = {483},
Pages = {1153-1154},
Publisher = {Informa UK Limited},
Year = {2008},
Month = {September},
ISSN = {0162-1459},
url = {http://dx.doi.org/10.1198/016214508000000616},
Doi = {10.1198/016214508000000616},
Key = {fds257952}
}
@article{fds258052,
Author = {Dunson, DB and Peddada, SD},
Title = {Bayesian nonparametric inference on stochastic
ordering.},
Journal = {Biometrika},
Volume = {95},
Number = {4},
Pages = {859-874},
Publisher = {Oxford University Press (OUP)},
Year = {2008},
Month = {December},
ISSN = {0006-3444},
url = {http://dx.doi.org/10.1093/biomet/asn043},
Abstract = {This article considers Bayesian inference about collections
of unknown distributions subject to a partial stochastic
ordering. To address problems in testing of equalities
between groups and estimation of group-specific
distributions, we propose classes of restricted dependent
Dirichlet process priors. These priors have full support in
the space of stochastically ordered distributions, and can
be used for collections of unknown mixture distributions to
obtain a flexible class of mixture models. Theoretical
properties are discussed, efficient methods are developed
for posterior computation using Markov chain Monte Carlo,
and the methods are illustrated using data from a study of
DNA damage and repair.},
Doi = {10.1093/biomet/asn043},
Key = {fds258052}
}
@article{fds258031,
Author = {Armagan, A and Dunson, DB},
Title = {Sparse variational analysis of large longitudinal data
sets},
Journal = {Statistics & Probability Letters},
Year = {2009},
Key = {fds258031}
}
@article{fds258039,
Author = {Wang, L and Dunson, DB},
Title = {Semiparametric Bayes multiple testing: Applications to tumor
data.},
Journal = {Biometrics},
Volume = {66},
Number = {2},
Pages = {493-501},
Year = {2009},
ISSN = {0006-341X},
url = {http://dx.doi.org/10.1111/j.1541-0420.2009.01301.x},
Abstract = {In National Toxicology Program (NTP) studies, investigators
want to assess whether a test agent is carcinogenic overall
and specific to certain tumor types, while estimating the
dose-response profiles. Because there are potentially
correlations among the tumors, a joint inference is
preferred to separate univariate analyses for each tumor
type. In this regard, we propose a random effect logistic
model with a matrix of coefficients representing log-odds
ratios for the adjacent dose groups for tumors at different
sites. We propose appropriate nonparametric priors for these
coefficients to characterize the correlations and to allow
borrowing of information across different dose groups and
tumor types. Global and local hypotheses can be easily
evaluated by summarizing the output of a single Markov chain
Monte Carlo (MCMC) run. Two multiple testing procedures are
applied for testing local hypotheses based on the posterior
probabilities of local alternatives. Simulation studies are
conducted and an NTP tumor data set is analyzed illustrating
the proposed approach.},
Doi = {10.1111/j.1541-0420.2009.01301.x},
Key = {fds258039}
}
@article{fds258040,
Author = {Wang, L and Dunson, DB},
Title = {Fast Bayesian inference in Dirichlet process mixture
models},
Journal = {Journal of Computational & Graphical Statistics},
Volume = {20},
Number = {1},
Pages = {196-216},
Publisher = {Informa UK Limited},
Year = {2009},
ISSN = {1061-8600},
url = {http://dx.doi.org/10.1198/jcgs.2010.07081},
Abstract = {There has been increasing interest in applying Bayesian
nonparametric methods in large samples and high dimensions.
As Markov chain Monte Carlo (MCMC) algorithms are often
infeasible, there is a pressing need for much faster
algorithms. This article proposes a fast approach for
inference in Dirichlet process mixture (DPM) models. Viewing
the partitioning of subjects into clusters as a model
selection problem, we propose a sequential greedy search
algorithm for selecting the partition. Then, when conjugate
priors are chosen, the resulting posterior conditionally on
the selected partition is available in closed form. This
approach allows testing of parametric models versus
nonparametric alternatives based on Bayes factors. We
evaluate the approach using simulation studies and compare
it with four other fast nonparametric methods in the
literature. We apply the proposed approach to three datasets
including one from a large epidemiologic study. Matlab codes
for the simulation and data analyses using the proposed
approach are available online in the supplemental
materials.},
Doi = {10.1198/jcgs.2010.07081},
Key = {fds258040}
}
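%% The sequential greedy search described above can be pictured
%% with a toy conjugate model: each observation, in turn, either
%% joins the existing cluster that maximizes a CRP-weighted
%% predictive density or opens a new cluster. The single-pass
%% sketch below assumes a normal kernel with known variance
%% sigma2 and a N(0, tau2) base measure; it conveys the idea
%% only and is not the authors' exact algorithm.
%%
%% import numpy as np
%%
%% def greedy_dp_partition(y, alpha=1.0, sigma2=1.0, tau2=4.0):
%%     clusters = []                  # per cluster: size n, sum s
%%     z = []
%%     for yi in y:
%%         best, best_score = -1, -np.inf
%%         for k, c in enumerate(clusters):
%%             prec = c["n"] / sigma2 + 1.0 / tau2
%%             mean = (c["s"] / sigma2) / prec  # posterior mean
%%             var = sigma2 + 1.0 / prec        # predictive variance
%%             score = (np.log(c["n"]) - 0.5 * np.log(var)
%%                      - 0.5 * (yi - mean) ** 2 / var)
%%             if score > best_score:
%%                 best, best_score = k, score
%%         var0 = sigma2 + tau2       # predictive for a new cluster
%%         new = (np.log(alpha) - 0.5 * np.log(var0)
%%                - 0.5 * yi ** 2 / var0)
%%         if new > best_score:
%%             clusters.append({"n": 0, "s": 0.0})
%%             best = len(clusters) - 1
%%         clusters[best]["n"] += 1
%%         clusters[best]["s"] += yi
%%         z.append(best)
%%     return z
%%
%% rng = np.random.default_rng(1)
%% y = np.concatenate([rng.normal(-3, 1, 50), rng.normal(3, 1, 50)])
%% print(greedy_dp_partition(y))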
@article{fds257953,
Author = {Rodríguez, A and Dunson, DB and Gelfand, AE},
Title = {Bayesian Nonparametric Functional Data Analysis Through
Density Estimation.},
Journal = {Biometrika},
Volume = {96},
Number = {1},
Pages = {149-162},
Year = {2009},
Month = {January},
ISSN = {0006-3444},
url = {http://dx.doi.org/10.1093/biomet/asn054},
Abstract = {In many modern experimental settings, observations are
obtained in the form of functions, and interest focuses on
inferences on a collection of such functions. We propose a
hierarchical model that allows us to simultaneously estimate
multiple curves nonparametrically by using dependent
Dirichlet Process mixtures of Gaussians to characterize the
joint distribution of predictors and outcomes. Function
estimates are then induced through the conditional
distribution of the outcome given the predictors. The
resulting approach allows for flexible estimation and
clustering, while borrowing information across curves. We
also show that the function estimates we obtain are
consistent on the space of integrable functions. As an
illustration, we consider an application to the analysis of
Conductivity and Temperature at Depth data in the north
Atlantic.},
Doi = {10.1093/biomet/asn054},
Key = {fds257953}
}
@article{fds258029,
Author = {Du, L and Ren, L and Dunson, DB and Carin, L},
Title = {A Bayesian Model for Simultaneous Image Clustering,
Annotation and Object Segmentation.},
Journal = {Advances in neural information processing
systems},
Volume = {2009},
Pages = {486-494},
Year = {2009},
Month = {January},
Abstract = {A non-parametric Bayesian model is proposed for processing
multiple images. The analysis employs image features and,
when present, the words associated with accompanying
annotations. The model clusters the images into classes, and
each image is segmented into a set of objects, also allowing
the opportunity to assign a word to each object (localized
labeling). Each object is assumed to be represented as a
heterogeneous mix of components, with this realized via
mixture models linking image features to object types. The
number of image classes, number of object types, and the
characteristics of the object-feature mixture models are
inferred nonparametrically. To constitute spatially
contiguous objects, a new logistic stick-breaking process is
developed. Inference is performed efficiently via
variational Bayesian analysis, with example results
presented on two image databases.},
Key = {fds258029}
}
@article{fds258047,
Author = {Rodriguez, A and Dunson, DB and Taylor, J},
Title = {Bayesian hierarchically weighted finite mixture models for
samples of distributions.},
Journal = {Biostatistics (Oxford, England)},
Volume = {10},
Number = {1},
Pages = {155-171},
Year = {2009},
Month = {January},
ISSN = {1465-4644},
url = {http://dx.doi.org/10.1093/biostatistics/kxn024},
Abstract = {Finite mixtures of Gaussian distributions are known to
provide an accurate approximation to any unknown density.
Motivated by DNA repair studies in which data are collected
for samples of cells from different individuals, we propose
a class of hierarchically weighted finite mixture models.
The modeling framework incorporates a collection of k
Gaussian basis distributions, with the individual-specific
response densities expressed as mixtures of these bases. To
allow heterogeneity among individuals and predictor effects,
we model the mixture weights, while treating the basis
distributions as unknown but common to all distributions.
This results in a flexible hierarchical model for samples of
distributions. We consider analysis of variance-type
structures and a parsimonious latent factor representation,
which leads to simplified inferences on non-Gaussian
covariance structures. Methods for posterior computation are
developed, and the model is used to select genetic
predictors of baseline DNA damage, susceptibility to induced
damage, and rate of repair.},
Doi = {10.1093/biostatistics/kxn024},
Key = {fds258047}
}
@article{fds258051,
Author = {Bigelow, JL and Dunson, DB},
Title = {Bayesian semiparametric joint models for functional
predictors.},
Journal = {Journal of the American Statistical Association},
Volume = {104},
Number = {485},
Pages = {26-36},
Publisher = {Informa UK Limited},
Year = {2009},
Month = {January},
ISSN = {0162-1459},
url = {http://dx.doi.org/10.1198/jasa.2009.0001},
Abstract = {Motivated by the need to understand and predict early
pregnancy loss using hormonal indicators of pregnancy
health, this paper proposes a semiparametric Bayes approach
for assessing the relationship between functional predictors
and a response. A multivariate adaptive spline model is used
to describe the functional predictors, and a generalized
linear model with a random intercept describes the response.
Through specifying the random intercept to follow a
Dirichlet process jointly with the random spline
coefficients, we obtain a procedure that clusters
trajectories according to shape and according to the
parameters of the response model for each cluster. This very
flexible method allows for the incorporation of covariates
in the models for both the response and the trajectory. We
apply the method to post-ovulatory progesterone data from
the Early Pregnancy Study and find that the model
successfully predicts early pregnancy loss.},
Doi = {10.1198/jasa.2009.0001},
Key = {fds258051}
}
@article{fds258054,
Author = {Dunson, DB},
Title = {Nonparametric Bayes local partition models for random
effects.},
Journal = {Biometrika},
Volume = {96},
Number = {2},
Pages = {249-262},
Year = {2009},
Month = {January},
ISSN = {0006-3444},
url = {http://dx.doi.org/10.1093/biomet/asp021},
Abstract = {This paper focuses on the problem of choosing a prior for an
unknown random effects distribution within a Bayesian
hierarchical model. The goal is to obtain a sparse
representation by allowing a combination of global and local
borrowing of information. A local partition process prior is
proposed, which induces dependent local clustering. Subjects
can be clustered together for a subset of their parameters,
and one learns about similarities between subjects
increasingly as parameters are added. Some basic properties
are described, including simple two-parameter expressions
for marginal and conditional clustering probabilities. A
slice sampler is developed which bypasses the need to
approximate the countably infinite random measure in
performing posterior computation. The methods are
illustrated using simulation examples, and an application to
hormone trajectory data.},
Doi = {10.1093/biomet/asp021},
Key = {fds258054}
}
@article{fds258028,
Author = {Ji, S and Dunson, D and Carin, L},
Title = {Multitask compressive sensing},
Journal = {IEEE Transactions on Signal Processing},
Volume = {57},
Number = {1},
Pages = {92-106},
Publisher = {Institute of Electrical and Electronics Engineers
(IEEE)},
Year = {2009},
Month = {January},
ISSN = {1053-587X},
url = {http://dx.doi.org/10.1109/TSP.2008.2005866},
Abstract = {Compressive sensing (CS) is a framework whereby one performs
N nonadaptive measurements to constitute a vector v ∈ ℝ^N,
with v used to recover an approximation û ∈ ℝ^M to a
desired signal u ∈ ℝ^M with N ≪ M; this is performed under
the assumption that u is sparse in the basis represented by
the matrix Ψ ∈ ℝ^(M×M). It has been demonstrated that with
appropriate design of the compressive measurements used to
define v, the decompressive mapping v → û may be performed
with error ‖u − û‖₂² having asymptotic properties analogous
to those of the best transform-coding algorithm applied in
the basis Ψ. The mapping v → û constitutes an inverse
problem, often solved using ℓ1 regularization or related
techniques. In most previous research, if L > 1 sets of
compressive measurements {v_i}, i = 1, ..., L, are
performed, each of the associated signals u_i is recovered
one at a time, independently. In many applications the
"tasks" defined by the mappings v_i → u_i are not
statistically independent, and it may be possible to
improve the performance of the inversion if statistical
interrelationships are exploited. In this paper, we address
this problem within a multitask learning setting, wherein
the mapping v_i → u_i for each task corresponds to
inferring the parameters (here, wavelet coefficients)
associated with the desired signal u_i, and a shared prior
is placed across all of the L tasks. Under this
hierarchical Bayesian modeling, data from all L tasks
contribute toward inferring a posterior on the
hyperparameters, and once the shared prior is thereby
inferred, the data from each of the L individual tasks are
then employed to estimate the task-dependent wavelet
coefficients. An empirical Bayesian procedure for the
estimation of hyperparameters is considered; two fast
inference algorithms extending the relevance vector machine
(RVM) are developed. Example results on several data sets
demonstrate the effectiveness and robustness of the
proposed algorithms. © 2008 IEEE.},
Doi = {10.1109/TSP.2008.2005866},
Key = {fds258028}
}
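%% For a single task, the inverse problem v → û named in the
%% abstract above is commonly attacked with ℓ1 regularization.
%% The sketch below recovers a sparse signal from random
%% projections by plain iterative soft-thresholding (ISTA); it
%% is a single-task ℓ1 baseline under assumed dimensions, not
%% the multitask RVM-style algorithms the paper develops.
%%
%% import numpy as np
%%
%% def ista(Phi, v, lam=0.01, n_iter=500):
%%     # minimize 0.5 * ||v - Phi w||_2^2 + lam * ||w||_1
%%     step = 1.0 / np.linalg.norm(Phi, 2) ** 2  # 1 / Lipschitz
%%     w = np.zeros(Phi.shape[1])
%%     for _ in range(n_iter):
%%         g = w + step * Phi.T @ (v - Phi @ w)  # gradient step
%%         w = np.sign(g) * np.maximum(np.abs(g) - lam * step, 0.0)
%%     return w
%%
%% rng = np.random.default_rng(0)
%% M, N, k = 256, 64, 5      # signal length, measurements, sparsity
%% u = np.zeros(M)
%% u[rng.choice(M, k, replace=False)] = rng.normal(0, 1, k)
%% Phi = rng.normal(0, 1 / np.sqrt(N), (N, M))  # random projections
%% v = Phi @ u                                  # measurements
%% u_hat = ista(Phi, v)
%% print(np.linalg.norm(u - u_hat) / np.linalg.norm(u))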
@article{fds257954,
Author = {Dunson, DB},
Title = {Bayesian nonparametric hierarchical modeling.},
Journal = {Biometrical journal. Biometrische Zeitschrift},
Volume = {51},
Number = {2},
Pages = {273-284},
Year = {2009},
Month = {April},
url = {http://www.ncbi.nlm.nih.gov/pubmed/19358217},
Abstract = {In biomedical research, hierarchical models are very widely
used to accommodate dependence in multivariate and
longitudinal data and for borrowing of information across
data from different sources. A primary concern in
hierarchical modeling is sensitivity to parametric
assumptions, such as linearity and normality of the random
effects. Parametric assumptions on latent variable
distributions can be challenging to check and are typically
unwarranted, given available prior knowledge. This article
reviews some recent developments in Bayesian nonparametric
methods motivated by complex, multivariate and functional
data collected in biomedical studies. The author provides a
brief review of flexible parametric approaches relying on
finite mixtures and latent class modeling. Dirichlet process
mixture models are motivated by the need to generalize these
approaches to avoid assuming a fixed finite number of
classes. Focusing on an epidemiology application, the author
illustrates the practical utility and potential of
nonparametric Bayes methods.},
Doi = {10.1002/bimj.200800183},
Key = {fds257954}
}
@article{fds258056,
Author = {MacLehose, RF and Dunson, DB},
Title = {Nonparametric Bayes kernel-based priors for functional data
analysis},
Journal = {Statistica Sinica},
Volume = {19},
Number = {2},
Pages = {611-629},
Year = {2009},
Month = {April},
ISSN = {1017-0405},
Abstract = {We focus on developing nonparametric Bayes methods for
collections of dependent random functions, allowing
individual curves to vary flexibly while adaptively
borrowing information. A prior is proposed, which is
expressed as a hierarchical mixture of weighted kernels
placed at unknown locations. The induced prior for any
individual function is shown to fall within a reproducing
kernel Hilbert space. We allow flexible borrowing of
information through the use of a hierarchical Dirichlet
process prior for the random locations, along with a
functional Dirichlet process for the weights. Theoretical
properties are considered and an efficient MCMC algorithm is
developed, relying on stick-breaking truncations. The
methods are illustrated using simulation examples and an
application to reproductive hormone data.},
Key = {fds258056}
}
@article{fds258048,
Author = {Ghosh, J and Dunson, DB},
Title = {Default Prior Distributions and Efficient Posterior
Computation in Bayesian Factor Analysis.},
Journal = {Journal of computational and graphical statistics : a joint
publication of American Statistical Association, Institute
of Mathematical Statistics, Interface Foundation of North
America},
Volume = {18},
Number = {2},
Pages = {306-320},
Year = {2009},
Month = {June},
ISSN = {1061-8600},
url = {http://dx.doi.org/10.1198/jcgs.2009.07145},
Abstract = {Factor analytic models are widely used in social sciences.
These models have also proven useful for sparse modeling of
the covariance structure in multidimensional data. Normal
prior distributions for factor loadings and inverse gamma
prior distributions for residual variances are a popular
choice because of their conditionally conjugate form.
However, such prior distributions require elicitation of
many hyperparameters and tend to result in poorly behaved
Gibbs samplers. In addition, one must choose an informative
specification, as high variance prior distributions face
problems due to impropriety of the posterior distribution.
This article proposes a default, heavy-tailed prior
distribution specification, which is induced through
parameter expansion while facilitating efficient posterior
computation. We also develop an approach to allow
uncertainty in the number of factors. The methods are
illustrated through simulated examples and epidemiology and
toxicology applications. Data sets and computer code used in
this article are available online.},
Doi = {10.1198/jcgs.2009.07145},
Key = {fds258048}
}
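%% The covariance structure referenced above comes from the
%% standard factor model y = Lambda eta + eps, with implied
%% covariance Lambda Lambda' + Sigma. The generative sketch
%% below checks that identity on simulated data; the sparsity
%% pattern, dimensions, and variance values are placeholders,
%% and the paper's parameter-expanded prior and Gibbs sampler
%% are not reproduced here.
%%
%% import numpy as np
%%
%% rng = np.random.default_rng(0)
%% p, k, n = 10, 3, 5000            # variables, factors, samples
%% Lambda = rng.normal(0, 1, (p, k)) * (rng.random((p, k)) < 0.4)
%% eta = rng.normal(0, 1, (k, n))   # latent factors
%% sig2 = rng.gamma(2.0, 0.5, p)    # residual variances
%% eps = rng.normal(0.0, np.sqrt(sig2)[:, None], (p, n))
%% Y = Lambda @ eta + eps
%% # sample covariance approaches Lambda Lambda' + diag(sig2)
%% print(np.round(np.cov(Y) - (Lambda @ Lambda.T + np.diag(sig2)), 1))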
@article{fds257955,
Author = {Baird, DD and Travlos, G and Wilson, R and Dunson, DB and Hill, MC and D'Aloisio, AA and London, SJ and Schectman, JM},
Title = {Uterine leiomyomata in relation to insulin-like growth
factor-I, insulin, and diabetes.},
Journal = {Epidemiology (Cambridge, Mass.)},
Volume = {20},
Number = {4},
Pages = {604-610},
Year = {2009},
Month = {July},
ISSN = {1044-3983},
url = {http://dx.doi.org/10.1097/ede.0b013e31819d8d3f},
Abstract = {Background: Insulin-like growth factor-I (IGF-I) and
insulin stimulate cell proliferation in uterine leiomyoma
(fibroid) tissue. We hypothesized that circulating levels of
these proteins would be associated with increased prevalence
and size of uterine fibroids. Methods: Participants
were 35-49-year-old, randomly selected members of an urban
health plan who were enrolled in the study in 1996-1999.
Premenopausal participants were screened for fibroids with
ultrasound. Fasting blood samples were collected.
Associations between fibroids and diabetes, plasma IGF-I,
IGF binding protein 3 (BP3), and insulin were evaluated for
blacks (n = 585) and whites (n = 403) by using multiple
logistic regression. Results: IGF-I showed no
association with fibroids in blacks, but in whites the
adjusted odds ratios (aORs) for both mid and upper tertiles
compared with the lowest tertile were 0.6 (95% confidence
intervals [CI] = 0.3-1.0 and 0.4-1.1, respectively). Insulin
and diabetes both tended to be inversely associated with
fibroids in blacks. The insulin association was with large
fibroids; aOR for the upper insulin tertile relative to the
lowest was 0.4 (0.2-0.9). The aOR for diabetes was 0.5
(0.2-1.0). Associations of insulin and diabetes with
fibroids were weak for whites. Binding protein 3 showed no
association with fibroids. Conclusions: Contrary to
our hypothesis, high circulating IGF-I and insulin were not
related to increased fibroid prevalence. Instead, there was
suggestion of the opposite. The inverse association with
diabetes, although based on small numbers, is consistent
with previously reported findings. Future studies might
investigate vascular dysfunction as a mediator between
hyperinsulinemia or diabetes and possible reduced risk of
fibroids.},
Doi = {10.1097/ede.0b013e31819d8d3f},
Key = {fds257955}
}
@article{fds258049,
Author = {Scarpa, B and Dunson, DB},
Title = {Bayesian hierarchical functional data analysis via
contaminated informative priors.},
Journal = {Biometrics},
Volume = {65},
Number = {3},
Pages = {772-780},
Year = {2009},
Month = {September},
ISSN = {0006-341X},
url = {http://dx.doi.org/10.1111/j.1541-0420.2008.01163.x},
Abstract = {A variety of flexible approaches have been proposed for
functional data analysis, allowing both the mean curve and
the distribution about the mean to be unknown. Such methods
are most useful when there is limited prior information.
Motivated by applications to modeling of temperature curves
in the menstrual cycle, this article proposes a flexible
approach for incorporating prior information in
semiparametric Bayesian analyses of hierarchical functional
data. The proposed approach is based on specifying the
distribution of functions as a mixture of a parametric
hierarchical model and a nonparametric contamination. The
parametric component is chosen based on prior knowledge,
while the contamination is characterized as a functional
Dirichlet process. In the motivating application, the
contamination component allows unanticipated curve shapes in
unhealthy menstrual cycles. Methods are developed for
posterior computation, and the approach is applied to data
from a European fecundability study.},
Doi = {10.1111/j.1541-0420.2008.01163.x},
Key = {fds258049}
}
@article{fds257984,
Author = {Ren, L and Dunson, DB and Lindroth, S and Carin, L},
Title = {Music analysis with a Bayesian dynamic model},
Journal = {ICASSP, IEEE International Conference on Acoustics, Speech
and Signal Processing - Proceedings},
Pages = {1681-1684},
Publisher = {IEEE},
Year = {2009},
Month = {September},
ISSN = {1520-6149},
url = {http://dx.doi.org/10.1109/ICASSP.2009.4959925},
Abstract = {A Bayesian dynamic model is developed to model complex
sequential data, with a focus on audio signals from music.
The music is represented in terms of a sequence of discrete
observations, and the sequence is modeled using a hidden
Markov model (HMM) with time-evolving parameters. The model
imposes the belief that observations that are temporally
proximate are more likely to be drawn from HMMs with similar
parameters, while also allowing for "innovation" associated
with abrupt changes in the music texture. Segmentation of a
given musical piece is constituted via the model inference
and the results are compared with other models and also to a
conventional music-theoretic analysis. ©2009
IEEE.},
Doi = {10.1109/ICASSP.2009.4959925},
Key = {fds257984}
}
@article{fds257985,
Author = {Wang, C and An, Q and Carin, L and Dunson, DB},
Title = {Multi-task classification with infinite local
experts},
Journal = {ICASSP, IEEE International Conference on Acoustics, Speech
and Signal Processing - Proceedings},
Pages = {1569-1572},
Publisher = {IEEE},
Year = {2009},
Month = {September},
ISSN = {1520-6149},
url = {http://dx.doi.org/10.1109/ICASSP.2009.4959897},
Abstract = {We propose a multi-task learning (MTL) framework for
nonlinear classification, based on an infinite set of local
experts in feature space. The usage of local experts enables
sharing at the expert-level, encouraging the borrowing of
information even if tasks are similar only in subregions of
feature space. A kernel stick-breaking process (KSBP) prior
is imposed on the underlying distribution of class labels,
so that the number of experts is inferred in the posterior
and thus model selection issues are avoided. The MTL is
implemented by imposing a Dirichlet process (DP) prior on a
layer above the task-dependent KSBPs. ©2009
IEEE.},
Doi = {10.1109/ICASSP.2009.4959897},
Key = {fds257985}
}
@article{fds258024,
Author = {Dunson, DB},
Title = {Comment on article by Craigmile et al.},
Journal = {Bayesian Analysis},
Volume = {4},
Number = {1},
Pages = {41-44},
Publisher = {Institute of Mathematical Statistics},
Year = {2009},
Month = {December},
ISSN = {1936-0975},
url = {http://dx.doi.org/10.1214/09-BA401B},
Doi = {10.1214/09-BA401B},
Key = {fds258024}
}
@article{fds258044,
Author = {Chung, Y and Dunson, DB},
Title = {Nonparametric Bayes Conditional Distribution Modeling With
Variable Selection.},
Journal = {Journal of the American Statistical Association},
Volume = {104},
Number = {488},
Pages = {1646-1660},
Year = {2009},
Month = {December},
ISSN = {0162-1459},
url = {http://hdl.handle.net/10161/4398},
Abstract = {This article considers a methodology for flexibly
characterizing the relationship between a response and
multiple predictors. Goals are (1) to estimate the
conditional response distribution addressing the
distributional changes across the predictor space, and (2)
to identify important predictors for the response
distribution change both within local regions and globally.
We first introduce the probit stick-breaking process (PSBP)
as a prior for an uncountable collection of
predictor-dependent random distributions and propose a PSBP
mixture (PSBPM) of normal regressions for modeling the
conditional distributions. A global variable selection
structure is incorporated to discard unimportant predictors,
while allowing estimation of posterior inclusion
probabilities. Local variable selection is conducted relying
on the conditional distribution estimates at different
predictor points. An efficient stochastic search sampling
algorithm is proposed for posterior computation. The methods
are illustrated through simulation and applied to an
epidemiologic study.},
Doi = {10.1198/jasa.2009.tm08302},
Key = {fds258044}
}
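%% The probit stick-breaking process (PSBP) above replaces beta
%% sticks with probit-transformed functions of the predictors:
%% pi_h(x) = Phi(alpha_h(x)) prod_{l<h} [1 - Phi(alpha_l(x))],
%% with Phi the standard normal cdf. A minimal sketch with
%% linear alpha_h(x) follows; the linear form, Gaussian
%% coefficient draws, and truncation level are assumptions made
%% for illustration.
%%
%% import numpy as np
%% from scipy.stats import norm
%%
%% rng = np.random.default_rng(0)
%% H = 25                          # truncation level (assumed)
%% mu = rng.normal(0, 1, H)        # per-atom intercepts
%% beta = rng.normal(0, 1, H)      # per-atom slopes
%%
%% def psbp_weights(x):
%%     p = norm.cdf(mu + beta * x)  # probit sticks at predictor x
%%     surv = np.cumprod(np.concatenate(([1.0], (1.0 - p)[:-1])))
%%     return p * surv
%%
%% for x in (-1.0, 0.0, 1.0):
%%     w = psbp_weights(x)
%%     print(x, int(np.argmax(w)), float(w.sum()))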
@article{fds257956,
Author = {Mitra, R and Dunson, D},
Title = {Two-level stochastic search variable selection in GLMs with
missing predictors.},
Journal = {The international journal of biostatistics},
Volume = {6},
Number = {1},
Pages = {Article-33},
Year = {2010},
Month = {January},
ISSN = {1557-4679},
url = {http://dx.doi.org/10.2202/1557-4679.1173},
Abstract = {Stochastic search variable selection (SSVS) algorithms
provide an appealing and widely used approach for searching
for good subsets of predictors while simultaneously
estimating posterior model probabilities and model-averaged
predictive distributions. This article proposes a two-level
generalization of SSVS to account for missing predictors
while accommodating uncertainty in the relationships between
these predictors. Bayesian approaches for allowing
predictors that are missing at random require a model on the
joint distribution of the predictors. We show that
predictive performance can be improved by allowing
uncertainty in the specification of predictor relationships
in this model. The methods are illustrated through
simulation studies and analysis of an epidemiologic data
set.},
Doi = {10.2202/1557-4679.1173},
Key = {fds257956}
}
@article{fds257991,
Author = {Wang, C and Liao, X and Carin, L and Dunson, DB},
Title = {Classification with Incomplete Data Using Dirichlet Process
Priors.},
Journal = {Journal of machine learning research : JMLR},
Volume = {11},
Pages = {3269-3311},
Year = {2010},
Month = {March},
ISSN = {1532-4435},
Abstract = {A non-parametric hierarchical Bayesian framework is
developed for designing a classifier, based on a mixture of
simple (linear) classifiers. Each simple classifier is
termed a local "expert", and the number of experts and their
construction are manifested via a Dirichlet process
formulation. The simple form of the "experts" allows
analytical handling of incomplete data. The model is
extended to allow simultaneous design of classifiers on
multiple data sets, termed multi-task learning, with this
also performed non-parametrically via the Dirichlet process.
Fast inference is performed using variational Bayesian (VB)
analysis, and example results are presented for several data
sets. We also perform inference via Gibbs sampling, to which
we compare the VB results.},
Key = {fds257991}
}
@article{fds258037,
Author = {Rodríguez, A and Dunson, DB and Gelfand, AE},
Title = {Latent Stick-Breaking Processes.},
Journal = {Journal of the American Statistical Association},
Volume = {105},
Number = {490},
Pages = {647-659},
Year = {2010},
Month = {April},
ISSN = {0162-1459},
url = {http://www.ncbi.nlm.nih.gov/pubmed/23559690},
Abstract = {We develop a model for stochastic processes with random
marginal distributions. Our model relies on a stick-breaking
construction for the marginal distribution of the process,
and introduces dependence across locations by using a latent
Gaussian copula model as the mechanism for selecting the
atoms. The resulting latent stick-breaking process (LaSBP)
induces a random partition of the index space, with points
closer in space having a higher probability of being in the
same cluster. We develop an efficient and straightforward
Markov chain Monte Carlo (MCMC) algorithm for computation
and discuss applications in financial econometrics and
ecology. This article has supplementary material
online.},
Doi = {10.1198/jasa.2010.tm08241},
Key = {fds258037}
}
@article{fds258041,
Author = {Maclehose, RF and Dunson, DB},
Title = {Bayesian semiparametric multiple shrinkage.},
Journal = {Biometrics},
Volume = {66},
Number = {2},
Pages = {455-462},
Year = {2010},
Month = {June},
ISSN = {0006-341X},
url = {http://dx.doi.org/10.1111/j.1541-0420.2009.01275.x},
Abstract = {High-dimensional and highly correlated data leading to non-
or weakly identified effects are commonplace. Maximum
likelihood will typically fail in such situations and a
variety of shrinkage methods have been proposed. Standard
techniques, such as ridge regression or the lasso, shrink
estimates toward zero, with some approaches allowing
coefficients to be selected out of the model by achieving a
value of zero. When substantive information is available,
estimates can be shrunk to nonnull values; however, such
information may not be available. We propose a Bayesian
semiparametric approach that allows shrinkage to multiple
locations. Coefficients are given a mixture of heavy-tailed
double exponential priors, with location and scale
parameters assigned Dirichlet process hyperpriors to allow
groups of coefficients to be shrunk toward the same,
possibly nonzero, mean. Our approach favors sparse, but
flexible, structure by shrinking toward a small number of
random locations. The methods are illustrated using a study
of genetic polymorphisms and Parkinson's
disease.},
Doi = {10.1111/j.1541-0420.2009.01275.x},
Key = {fds258041}
}
@article{fds258042,
Author = {Ren, L and Dunson, D and Lindroth, S and Carin, L},
Title = {Dynamic nonparametric bayesian models for analysis of
music},
Journal = {Journal of the American Statistical Association},
Volume = {105},
Number = {490},
Pages = {458-472},
Publisher = {Informa UK Limited},
Year = {2010},
Month = {June},
ISSN = {0162-1459},
url = {http://hdl.handle.net/10161/4397},
Abstract = {The dynamic hierarchical Dirichlet process (dHDP) is
developed to model complex sequential data, with a focus on
audio signals from music. The music is represented in terms
of a sequence of discrete observations, and the sequence is
modeled using a hidden Markov model (HMM) with time-evolving
parameters. The dHDP imposes the belief that observations
that are temporally proximate are more likely to be drawn
from HMMs with similar parameters, while also allowing for
"innovation" associated with abrupt changes in the music
texture. The sharing mechanisms of the time-evolving model
are derived, and for inference a relatively simple Markov
chain Monte Carlo sampler is developed. Segmentation of a
given musical piece is constituted via the model inference.
Detailed examples are presented on several pieces, with
comparisons to other models. The dHDP results are also
compared with a conventional music-theoretic analysis. All
the supplemental materials used by this paper are available
online. © 2010 American Statistical Association.},
Doi = {10.1198/jasa.2009.ap08497},
Key = {fds258042}
}
@article{fds258032,
Author = {Bornkamp, B and Ickstadt, K and Dunson, D},
Title = {Stochastically ordered multiple regression.},
Journal = {Biostatistics (Oxford, England)},
Volume = {11},
Number = {3},
Pages = {419-431},
Year = {2010},
Month = {July},
ISSN = {1465-4644},
url = {http://dx.doi.org/10.1093/biostatistics/kxq001},
Abstract = {In various application areas, prior information is available
about the direction of the effects of multiple predictors on
the conditional response distribution. For example, in
epidemiology studies of potentially adverse exposures and
continuous health responses, one can typically assume a
priori that increasing the level of an exposure does not
lead to an improvement in the health response. Such an
assumption can be formalized through a stochastic ordering
assumption in each of the exposures, leading to a
potentially large improvement in efficiency in nonparametric
modeling of the conditional response distribution. This
article proposes a Bayesian nonparametric approach to this
problem based on characterizing the conditional response
density as a Gaussian mixture, with the locations of the
Gaussian means varying flexibly with predictors subject to
minimal constraints to ensure stochastic ordering.
Theoretical properties are considered and Markov chain Monte
Carlo methods are developed for posterior computation. The
methods are illustrated using simulation examples and a
reproductive epidemiology application.},
Doi = {10.1093/biostatistics/kxq001},
Key = {fds258032}
}
@article{fds258043,
Author = {Park, JH and Dunson, DB},
Title = {Bayesian generalized product partition model},
Journal = {Statistica Sinica},
Volume = {20},
Number = {3},
Pages = {1203-1226},
Year = {2010},
Month = {July},
ISSN = {1017-0405},
url = {http://hdl.handle.net/10161/4623},
Abstract = {Starting with a carefully formulated Dirichlet process (DP)
mixture model, we derive a generalized product partition
model (GPPM) in which the partition process is
predictor-dependent. The GPPM generalizes DP clustering to
relax the exchangeability assumption through the
incorporation of predictors, resulting in a generalized
Pólya urn scheme. In addition, the GPPM can be used for
formulating flexible semiparametric Bayes models for
conditional distribution estimation, bypassing the need for
expensive computation of large numbers of unknowns
characterizing priors for dependent collections of random
probability measures. A variety of special cases are
considered, and an efficient Gibbs sampling algorithm is
developed for posterior computation. The methods are
illustrated using simulation examples and an epidemiologic
application.},
Key = {fds258043}
}
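%% Without predictors, the generalized Pólya urn above reduces
%% to the ordinary Chinese restaurant process: subject i joins
%% an existing cluster with probability proportional to its
%% size, or opens a new one with probability proportional to
%% alpha. The sketch below samples that exchangeable special
%% case; the predictor-dependent tilting that defines the GPPM
%% is omitted.
%%
%% import numpy as np
%%
%% def crp(n, alpha, rng):
%%     counts = [1]                # first subject seats alone
%%     z = [0]
%%     for _ in range(1, n):
%%         probs = np.array(counts + [alpha], dtype=float)
%%         probs /= probs.sum()
%%         k = rng.choice(len(probs), p=probs)
%%         if k == len(counts):
%%             counts.append(1)    # open a new cluster
%%         else:
%%             counts[k] += 1
%%         z.append(k)
%%     return z
%%
%% rng = np.random.default_rng(0)
%% print(crp(20, alpha=1.0, rng=rng))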
@article{fds257958,
Author = {Stanford, JB and Mikolajczyk, RT and Dunson, DB},
Title = {Are Chinese people really more fertile?},
Journal = {Fertility and sterility},
Volume = {94},
Number = {3},
Pages = {e58},
Year = {2010},
Month = {August},
ISSN = {0015-0282},
url = {http://dx.doi.org/10.1016/j.fertnstert.2010.05.004},
Doi = {10.1016/j.fertnstert.2010.05.004},
Key = {fds257958}
}
@article{fds257959,
Author = {Yang, M and Dunson, DB and Baird, D},
Title = {Semiparametric Bayes hierarchical models with mean and
variance constraints.},
Journal = {Computational statistics & data analysis},
Volume = {54},
Number = {9},
Pages = {2172-2186},
Year = {2010},
Month = {September},
ISSN = {0167-9473},
url = {http://dx.doi.org/10.1016/j.csda.2010.03.025},
Abstract = {In parametric hierarchical models, it is standard practice
to place mean and variance constraints on the latent
variable distributions for the sake of identifiability and
interpretability. Because incorporation of such constraints
is challenging in semiparametric models that allow latent
variable distributions to be unknown, previous methods
either constrain the median or avoid constraints. In this
article, we propose a centered stick-breaking process
(CSBP), which induces mean and variance constraints on an
unknown distribution in a hierarchical model. This is
accomplished by viewing an unconstrained stick-breaking
process as a parameter-expanded version of a CSBP. An
efficient blocked Gibbs sampler is developed for approximate
posterior computation. The methods are illustrated through a
simulated example and an epidemiologic application.},
Doi = {10.1016/j.csda.2010.03.025},
Key = {fds257959}
}
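%% The parameter-expansion idea above can be mimicked directly:
%% draw an unconstrained truncated stick-breaking measure, then
%% recenter and rescale its atoms so the measure has mean zero
%% and unit variance. The sketch below assumes Beta(1,1) sticks,
%% Gaussian atoms, and truncation at H; it conveys the moment
%% constraints, not the blocked Gibbs sampler.
%%
%% import numpy as np
%%
%% rng = np.random.default_rng(0)
%% H = 100                          # truncation level (assumed)
%% V = rng.beta(1.0, 1.0, H)        # stick-breaking ratios
%% w = V * np.cumprod(np.concatenate(([1.0], 1.0 - V[:-1])))
%% w /= w.sum()                     # absorb truncation remainder
%% atoms = rng.normal(0, 2, H)      # unconstrained atom locations
%%
%% m = np.sum(w * atoms)                # mean of the random measure
%% s2 = np.sum(w * (atoms - m) ** 2)    # its variance
%% atoms_c = (atoms - m) / np.sqrt(s2)  # centered, rescaled atoms
%% print(np.sum(w * atoms_c), np.sum(w * atoms_c ** 2))  # 0 and 1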
@article{fds258034,
Author = {Cai, B and Dunson, DB and Stanford, JB},
Title = {Dynamic model for multivariate markers of
fecundability.},
Journal = {Biometrics},
Volume = {66},
Number = {3},
Pages = {905-913},
Year = {2010},
Month = {September},
ISSN = {0006-341X},
url = {http://dx.doi.org/10.1111/j.1541-0420.2009.01327.x},
Abstract = {Dynamic latent class models provide a flexible framework for
studying biologic processes that evolve over time. Motivated
by studies of markers of the fertile days of the menstrual
cycle, we propose a discrete-time dynamic latent class
framework, allowing change points to depend on time, fixed
predictors, and random effects. Observed data consist of
multivariate categorical indicators, which change
dynamically in a flexible manner according to latent class
status. Given the flexibility of the framework, which
incorporates semi-parametric components using mixtures of
betas, identifiability constraints are needed to define the
latent classes. Such constraints are most appropriately
based on the known biology of the process. The Bayesian
method is developed particularly for analyzing mucus symptom
data from a study of women using natural family
planning.},
Doi = {10.1111/j.1541-0420.2009.01327.x},
Key = {fds258034}
}
@article{fds258046,
Author = {Dunson, DB},
Title = {MULTIVARIATE KERNEL PARTITION PROCESS MIXTURES.},
Journal = {Statistica Sinica},
Volume = {20},
Number = {4},
Pages = {1395-1422},
Year = {2010},
Month = {October},
ISSN = {1017-0405},
Abstract = {Mixtures provide a useful approach for relaxing parametric
assumptions. Discrete mixture models induce clusters,
typically with the same cluster allocation for each
parameter in multivariate cases. As a more flexible approach
that facilitates sparse nonparametric modeling of
multivariate random effects distributions, this article
proposes a kernel partition process (KPP) in which the
cluster allocation varies for different parameters. The KPP
is shown to be the driving measure for a multivariate
ordered Chinese restaurant process that induces a
highly-flexible dependence structure in local clustering.
This structure allows the relative locations of the random
effects to inform the clustering process, with
spatially-proximal random effects likely to be assigned the
same cluster index. An exact block Gibbs sampler is
developed for posterior computation, avoiding truncation of
the infinite measure. The methods are applied to hormone
curve data, and a dependent KPP is proposed for
classification from functional predictors.},
Key = {fds258046}
}
@article{fds257988,
Author = {Blei, D and Carin, L and Dunson, D},
Title = {Probabilistic Topic Models: A focus on graphical model
design and applications to document and image
analysis.},
Journal = {IEEE signal processing magazine},
Volume = {27},
Number = {6},
Pages = {55-65},
Year = {2010},
Month = {November},
ISSN = {1053-5888},
url = {http://dx.doi.org/10.1109/msp.2010.938079},
Abstract = {In this article, we review probabilistic topic models:
graphical models that can be used to summarize a large
collection of documents with a smaller number of
distributions over words. Those distributions are called
"topics" because, when fit to data, they capture the
salient themes that run through the collection. We describe
both finite-dimensional parametric topic models and their
Bayesian nonparametric counterparts, which are based on the
hierarchical Dirichlet process (HDP). We discuss two
extensions of topic models to time-series data: one that
lets the topics slowly change over time and one that lets
the assumed prevalence of the topics change. Finally, we
illustrate the application of topic models to nontext data,
summarizing some recent research results in image analysis.
© 2010 IEEE.},
Doi = {10.1109/msp.2010.938079},
Key = {fds257988}
}
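%% The finite parametric topic model reviewed above has a short
%% generative story: draw topic-word distributions, then per
%% document draw topic proportions, per-word topic labels, and
%% words. The sketch below samples from exactly that process
%% with made-up dimensions and Dirichlet hyperparameters;
%% inference, and the HDP extension, are separate matters.
%%
%% import numpy as np
%%
%% rng = np.random.default_rng(0)
%% V, K, D, Nd = 50, 3, 5, 40       # vocab, topics, docs, words/doc
%% topics = rng.dirichlet(np.full(V, 0.1), size=K)  # topic-word dists
%% for d in range(D):
%%     theta = rng.dirichlet(np.full(K, 0.5))       # doc proportions
%%     z = rng.choice(K, size=Nd, p=theta)          # topic per word
%%     words = [int(rng.choice(V, p=topics[zi])) for zi in z]
%%     print(d, np.bincount(z, minlength=K), words[:8])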
@article{fds257990,
Author = {Chen, B and Chen, M and Paisley, J and Zaas, A and Woods, C and Ginsburg,
GS and Hero, A and Lucas, J and Dunson, D and Carin,
L},
Title = {Bayesian inference of the number of factors in
gene-expression analysis: application to human virus
challenge studies.},
Journal = {BMC Bioinformatics},
Volume = {11},
Pages = {552},
Year = {2010},
Month = {November},
url = {http://www.ncbi.nlm.nih.gov/pubmed/21062443},
Abstract = {BACKGROUND: Nonparametric Bayesian techniques have been
developed recently to extend the sophistication of factor
models, allowing one to infer the number of appropriate
factors from the observed data. We consider such techniques
for sparse factor analysis, with application to
gene-expression data from three virus challenge studies.
Particular attention is placed on employing the Beta Process
(BP), the Indian Buffet Process (IBP), and related
sparseness-promoting techniques to infer a proper number of
factors. The posterior density function on the model
parameters is computed using Gibbs sampling and variational
Bayesian (VB) analysis. RESULTS: Time-evolving
gene-expression data are considered for respiratory
syncytial virus (RSV), Rhino virus, and influenza, using
blood samples from healthy human subjects. These data were
acquired in three challenge studies, each executed after
receiving institutional review board (IRB) approval from
Duke University. Comparisons are made between several
alternative means of performing nonparametric factor
analysis on these data, with comparisons as well to
sparse-PCA and Penalized Matrix Decomposition (PMD), closely
related non-Bayesian approaches. CONCLUSIONS: Applying the
Beta Process to the factor scores, or to the singular values
of a pseudo-SVD construction, the proposed algorithms infer
the number of factors in gene-expression data. For real data
the "true" number of factors is unknown; in our simulations
we consider a range of noise variances, and the proposed
Bayesian models inferred the number of factors accurately
relative to other methods in the literature, such as
sparse-PCA and PMD. We have also identified a "pan-viral"
factor of importance for each of the three viruses
considered in this study. We have identified a set of genes
associated with this pan-viral factor, of interest for early
detection of such viruses based upon the host response, as
quantified via gene-expression data.},
Doi = {10.1186/1471-2105-11-552},
Key = {fds257990}
}
@article{fds257963,
Author = {Gordon, GJ and Dunson, D},
Title = {Preface to the Proceedings of AISTATS 2011},
Journal = {Journal of Machine Learning Research},
Volume = {9},
Pages = {1-2},
Year = {2010},
Month = {December},
ISSN = {1532-4435},
Key = {fds257963}
}
@article{fds257987,
Author = {Wang, E and Liu, D and Silva, J and Dunson, D and Carin,
L},
Title = {Joint analysis of time-evolving binary matrices and
associated documents},
Journal = {Advances in Neural Information Processing Systems 23: 24th
Annual Conference on Neural Information Processing Systems
2010, NIPS 2010},
Year = {2010},
Month = {December},
Abstract = {We consider problems for which one has incomplete binary
matrices that evolve with time (e.g., the votes of
legislators on particular legislation, with each year
characterized by a different such matrix). An objective of
such analysis is to infer structure and inter-relationships
underlying the matrices, here defined by latent features
associated with each axis of the matrix. In addition, it is
assumed that documents are available for the entities
associated with at least one of the matrix axes. By jointly
analyzing the matrices and documents, one may be used to
inform the other within the analysis, and the model offers
the opportunity to predict matrix values (e.g., votes) based
only on an associated document (e.g., legislation). The
research presented here merges two areas of machine-learning
that have previously been investigated separately:
incomplete-matrix analysis and topic modeling. The analysis
is performed from a Bayesian perspective, with efficient
inference constituted via Gibbs sampling. The framework is
demonstrated by considering all voting data and available
documents (legislation) during the 220-year lifetime of the
United States Senate and House of Representatives.},
Key = {fds257987}
}
@article{fds257989,
Author = {Chen, M and Silva, J and Paisley, J and Wang, C and Dunson, D and Carin,
L},
Title = {Compressive Sensing on Manifolds Using a Nonparametric
Mixture of Factor Analyzers: Algorithm and Performance
Bounds.},
Journal = {IEEE transactions on signal processing : a publication of
the IEEE Signal Processing Society},
Volume = {58},
Number = {12},
Pages = {6140-6155},
Year = {2010},
Month = {December},
ISSN = {1053-587X},
url = {http://dx.doi.org/10.1109/tsp.2010.2070796},
Abstract = {Nonparametric Bayesian methods are employed to constitute a
mixture of low-rank Gaussians, for data x ∈ ℝ^N that are
of high dimension N but are constrained to reside in a
low-dimensional subregion of ℝ^N. The number of mixture
components and their rank are inferred automatically from
the data. The resulting algorithm can be used for learning
manifolds and for reconstructing signals from manifolds,
based on compressive sensing (CS) projection measurements.
The statistical CS inversion is performed analytically. We
derive the required number of CS random measurements needed
for successful reconstruction, based on easily-computed
quantities, drawing on block-sparsity properties. The
proposed methodology is validated on several synthetic and
real datasets.},
Doi = {10.1109/tsp.2010.2070796},
Key = {fds257989}
}
@article{fds258036,
Author = {Bhattacharya, A and Dunson, DB},
Title = {Nonparametric Bayesian density estimation on manifolds with
applications to planar shapes.},
Journal = {Biometrika},
Volume = {97},
Number = {4},
Pages = {851-865},
Year = {2010},
Month = {December},
ISSN = {0006-3444},
url = {http://www.ncbi.nlm.nih.gov/pubmed/22822255},
Abstract = {Statistical analysis on landmark-based shape spaces has
diverse applications in morphometrics, medical diagnostics,
machine vision and other areas. These shape spaces are
non-Euclidean quotient manifolds. To conduct nonparametric
inferences, one may define notions of centre and spread on
this manifold and work with their estimates. However, it is
useful to consider full likelihood-based methods, which
allow nonparametric estimation of the probability density.
This article proposes a broad class of mixture models
constructed using suitable kernels on a general compact
metric space and then on the planar shape space in
particular. Following a Bayesian approach with a
nonparametric prior on the mixing distribution, conditions
are obtained under which the Kullback-Leibler property
holds, implying large support and weak posterior
consistency. Gibbs sampling methods are developed for
posterior computation, and the methods are applied to
problems in density estimation and classification with
shape-based predictors. Simulation studies show improved
estimation performance relative to existing
approaches.},
Doi = {10.1093/biomet/asq044},
Key = {fds258036}
}
@article{fds257986,
Author = {Zhou, M and Wang, C and Chen, M and Paisley, J and Dunson, D and Carin,
L},
Title = {Nonparametric bayesian matrix completion},
Journal = {2010 IEEE Sensor Array and Multichannel Signal Processing
Workshop, SAM 2010},
Pages = {213-216},
Publisher = {IEEE},
Year = {2010},
Month = {December},
url = {http://dx.doi.org/10.1109/SAM.2010.5606741},
Abstract = {The Beta-Binomial processes are considered for inferring
missing values in matrices. The model moves beyond the
low-rank assumption, modeling the matrix columns as residing
in a nonlinear subspace. Large-scale problems are considered
via efficient Gibbs sampling, yielding predictions as well
as a measure of confidence in each prediction. Algorithm
performance is considered for several datasets, with
encouraging performance relative to existing approaches. ©
2010 IEEE.},
Doi = {10.1109/SAM.2010.5606741},
Key = {fds257986}
}
@article{fds257978,
Author = {Armagan, A and Dunson, DB and Clyde, MA},
Title = {Generalized Beta Mixtures of Gaussians},
Journal = {Advances in Neural Information Processing
Systems},
Volume = {24},
Pages = {523-531},
Publisher = {Neural Information Processing Systems Foundation,
Inc},
Editor = {Shawe-Taylor, J and Zemel, RS and Bartlett, PL},
Year = {2011},
ISBN = {9781618395993},
Abstract = {In recent years, a rich variety of shrinkage priors have
been proposed that have great promise in addressing massive
regression problems. In general, these new priors can be
expressed as scale mixtures of normals, but have more
complex forms and better properties than traditional Cauchy
and double exponential priors. We first propose a new class
of normal scale mixtures through a novel generalized beta
distribution that encompasses many interesting priors as
special cases. This encompassing framework should prove
useful in comparing competing priors, considering properties
and revealing close connections. We then develop a class of
variational Bayes approximations through the new hierarchy
presented that will scale more efficiently to the types of
truly massive data sets that are now encountered
routinely.},
Key = {fds257978}
}
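%% Sampling from the scale-mixture representation above is
%% direct: draw a local variance from a beta-prime law via
%% rho ~ Beta(a, b) and lam2 = (1 - rho) / rho, then draw a
%% normal with that variance. With a = b = 1/2 this recovers
%% the horseshoe prior as a special case; the sketch below uses
%% that setting as one assumed illustration of the encompassing
%% family.
%%
%% import numpy as np
%%
%% rng = np.random.default_rng(0)
%%
%% def gb_normal_mixture(n, a=0.5, b=0.5, tau=1.0):
%%     rho = rng.beta(a, b, n)
%%     lam2 = (1.0 - rho) / rho     # beta-prime local variances
%%     return rng.normal(0.0, tau * np.sqrt(lam2))
%%
%% draws = gb_normal_mixture(100_000)
%% # heavy tails plus a spike near zero, as shrinkage requires
%% print(np.mean(np.abs(draws) < 0.1), np.mean(np.abs(draws) > 10))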
@article{fds257992,
Author = {Ren, L and Du, L and Carin, L and Dunson, DB},
Title = {Logistic Stick-Breaking Process.},
Journal = {Journal of machine learning research : JMLR},
Volume = {12},
Number = {Jan},
Pages = {203-239},
Year = {2011},
Month = {January},
ISSN = {1532-4435},
Abstract = {A logistic stick-breaking process (LSBP) is proposed for
non-parametric clustering of general spatially- or
temporally-dependent data, imposing the belief that
proximate data are more likely to be clustered together. The
sticks in the LSBP are realized via multiple logistic
regression functions, with shrinkage priors employed to
favor contiguous and spatially localized segments. The LSBP
is also extended for the simultaneous processing of multiple
data sets, yielding a hierarchical logistic stick-breaking
process (H-LSBP). The model parameters (atoms) within the
H-LSBP are shared across the multiple learning tasks.
Efficient variational Bayesian inference is derived, and
comparisons are made to related techniques in the
literature. Experimental analysis is performed for audio
waveforms and images, and it is demonstrated that for
segmentation applications the LSBP yields generally
homogeneous segments with sharp boundaries.},
Key = {fds257992}
}
@article{fds257995,
Author = {Chen, H and Dunson, DB and Carin, L},
Title = {Topic Modeling with Nonparametric Markov
Tree.},
Journal = {Proceedings of the ... International Conference on Machine
Learning. International Conference on Machine
Learning},
Volume = {2011},
Pages = {377-384},
Year = {2011},
Month = {January},
Abstract = {A new hierarchical tree-based topic model is developed,
based on nonparametric Bayesian techniques. The model has
two unique attributes: (<i>i</i>) a child node in the tree
may have more than one parent, with the goal of eliminating
redundant sub-topics deep in the tree; and (<i>ii</i>)
parsimonious sub-topics are manifested, by removing
redundant usage of words at multiple scales. The depth and
width of the tree are unbounded within the prior, with a
retrospective sampler employed to adaptively infer the
appropriate tree size based upon the corpus under study.
Excellent quantitative results are manifested on five
standard data sets, and the inferred tree structure is also
found to be highly interpretable.},
Key = {fds257995}
}
@article{fds257996,
Author = {Zhang, X and Dunson, DB and Carin, L},
Title = {Tree-Structured Infinite Sparse Factor Model.},
Journal = {Proceedings of the ... International Conference on Machine
Learning. International Conference on Machine
Learning},
Volume = {2011},
Pages = {785-792},
Year = {2011},
Month = {January},
Abstract = {A tree-structured multiplicative gamma process (TMGP) is
developed, for inferring the depth of a tree-based
factor-analysis model. This new model is coupled with the
nested Chinese restaurant process, to nonparametrically
infer the depth and width (structure) of the tree. In
addition to developing the model, theoretical properties of
the TMGP are addressed, and a novel MCMC sampler is
developed. The structure of the inferred tree is used to
learn relationships between high-dimensional data, and the
model is also applied to compressive sensing and
interpolation of incomplete images.},
Key = {fds257996}
}
@article{fds258002,
Author = {Chen, M and Zaas, A and Woods, C and Ginsburg, GS and Lucas, J and Dunson,
D and Carin, L},
Title = {Predicting Viral Infection From High-Dimensional Biomarker
Trajectories.},
Journal = {J Am Stat Assoc},
Volume = {106},
Number = {496},
Pages = {1259-1279},
Year = {2011},
Month = {January},
ISSN = {0162-1459},
url = {http://dx.doi.org/10.1198/jasa.2011.ap10611},
Abstract = {There is often interest in predicting an individual's latent
health status based on high-dimensional biomarkers that vary
over time. Motivated by time-course gene expression array
data that we have collected in two influenza challenge
studies performed with healthy human volunteers, we develop
a novel time-aligned Bayesian dynamic factor analysis
methodology. The time course trajectories in the gene
expressions are related to a relatively low-dimensional
vector of latent factors, which vary dynamically starting at
the latent initiation time of infection. Using a
nonparametric cure rate model for the latent initiation
times, we allow selection of the genes in the viral response
pathway, variability among individuals in infection times,
and a subset of individuals who are not infected. As we
demonstrate using held-out data, this statistical framework
allows accurate predictions of infected individuals in
advance of the development of clinical symptoms, without
labeled data and even when the number of biomarkers vastly
exceeds the number of individuals under study. Biological
interpretation of several of the inferred pathways (factors)
is provided.},
Doi = {10.1198/jasa.2011.ap10611},
Key = {fds258002}
}
@article{fds376572,
Author = {Armagan, A and Dunson, DB and Clyde, M},
Title = {Generalized beta mixtures of Gaussians},
Journal = {Advances in Neural Information Processing Systems 24: 25th
Annual Conference on Neural Information Processing Systems
2011, NIPS 2011},
Year = {2011},
Month = {January},
ISBN = {9781618395993},
Abstract = {In recent years, a rich variety of shrinkage priors have
been proposed that have great promise in addressing massive
regression problems. In general, these new priors can be
expressed as scale mixtures of normals, but have more
complex forms and better properties than traditional Cauchy
and double exponential priors. We first propose a new class
of normal scale mixtures through a novel generalized beta
distribution that encompasses many interesting priors as
special cases. This encompassing framework should prove
useful in comparing competing priors, considering properties
and revealing close connections. We then develop a class of
variational Bayes approximations through the new hierarchy
presented that will scale more efficiently to the types of
truly massive data sets that are now encountered
routinely.},
Key = {fds376572}
}
@article{fds371473,
Author = {Ren, L and Wang, Y and Dunson, D and Carin, L},
Title = {The kernel beta process},
Journal = {Advances in Neural Information Processing Systems 24: 25th
Annual Conference on Neural Information Processing Systems
2011, NIPS 2011},
Year = {2011},
Month = {January},
ISBN = {9781618395993},
Abstract = {A new Lévy process prior is proposed for an uncountable
collection of covariate-dependent feature-learning measures;
the model is called the kernel beta process (KBP). Available
covariates are handled efficiently via the kernel
construction, with covariates assumed observed with each
data sample ("customer"), and latent covariates learned for
each feature ("dish"). Each customer selects dishes from an
infinite buffet, in a manner analogous to the beta process,
with the added constraint that a customer first decides
probabilistically whether to "consider" a dish, based on the
distance in covariate space between the customer and dish.
If a customer does consider a particular dish, that dish is
then selected probabilistically as in the beta process. The
beta process is recovered as a limiting case of the KBP. An
efficient Gibbs sampler is developed for computations, and
state-of-the-art results are presented for image processing
and music analysis tasks.},
Key = {fds371473}
}
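The two-stage draw described in the abstract above can be
simulated directly under truncation: a customer first
"considers" a dish with a kernel probability that decays with
covariate distance, then selects a considered dish with its
beta-process weight. The sketch below is an illustration under
assumptions (squared-exponential kernel, arbitrary bandwidth,
truncated beta-process weights), not the authors' code.

# Truncated kernel-beta-process-style feature usage.
import numpy as np

rng = np.random.default_rng(1)
n, K = 10, 30                            # customers, truncated dishes
x_cust = rng.uniform(0.0, 1.0, n)        # observed customer covariates
x_dish = rng.uniform(0.0, 1.0, K)        # latent dish covariates
pi = rng.beta(1.0, 5.0, K)               # dish weights (truncated BP draw)
bw = 0.2                                 # kernel bandwidth (assumption)
kern = np.exp(-((x_cust[:, None] - x_dish[None, :]) / bw) ** 2)
consider = rng.random((n, K)) < kern     # stage 1: consider the dish?
select = rng.random((n, K)) < pi         # stage 2: beta-process selection
Z = consider & select                    # binary feature-usage matrix
print(Z.sum(axis=1))                     # dishes used by each customer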
@article{fds371474,
Author = {Zhang, XX and Dunson, DB and Carin, L},
Title = {Hierarchical topic modeling for analysis of time-evolving
personal choices},
Journal = {Advances in Neural Information Processing Systems 24: 25th
Annual Conference on Neural Information Processing Systems
2011, NIPS 2011},
Year = {2011},
Month = {January},
ISBN = {9781618395993},
Abstract = {The nested Chinese restaurant process is extended to design
a nonparametric topic-model tree for representation of human
choices. Each tree path corresponds to a type of person, and
each node (topic) has a corresponding probability vector
over items that may be selected. The observed data are
assumed to have associated temporal covariates
(corresponding to the time at which choices are made), and
we wish to impose that with increasing time it is more
probable that topics deeper in the tree are utilized. This
structure is imposed by developing a new "change point"
stick-breaking model that is coupled with a Poisson and
product-of-gammas construction. To share topics across the
tree nodes, topic distributions are drawn from a Dirichlet
process. As a demonstration of this concept, we analyze real
data on course selections of undergraduate students at Duke
University, with the goal of uncovering and concisely
representing structure in the curriculum and in the
characteristics of the student body.},
Key = {fds371474}
}
@article{fds258030,
Author = {Shi, M and Dunson, DB},
Title = {Bayesian Variable Selection via Particle Stochastic
Search.},
Journal = {Statistics & Probability Letters},
Volume = {81},
Number = {2},
Pages = {283-291},
Year = {2011},
Month = {February},
ISSN = {0167-7152},
url = {http://www.ncbi.nlm.nih.gov/pubmed/21278860},
Abstract = {We focus on Bayesian variable selection in regression
models. One challenge is to search the huge model space
adequately, while identifying high posterior probability
regions. In the past decades, the main focus has been on the
use of Markov chain Monte Carlo (MCMC) algorithms for these
purposes. In this article, we propose a new computational
approach based on sequential Monte Carlo (SMC), which we
refer to as particle stochastic search (PSS). We illustrate
PSS through applications to linear regression and probit
models.},
Doi = {10.1016/j.spl.2010.10.011},
Key = {fds258030}
}
@article{fds258055,
Author = {Chung, Y and Dunson, DB},
Title = {The local Dirichlet process.},
Journal = {Annals of the Institute of Statistical Mathematics},
Volume = {63},
Number = {1},
Pages = {59-80},
Year = {2011},
Month = {February},
ISSN = {0020-3157},
url = {http://dx.doi.org/10.1007/s10463-008-0218-9},
Abstract = {As a generalization of the Dirichlet process (DP) to allow
predictor dependence, we propose a local Dirichlet process
(lDP). The lDP provides a prior distribution for a
collection of random probability measures indexed by
predictors. This is accomplished by assigning stick-breaking
weights and atoms to random locations in a predictor space.
The probability measure at a given predictor value is then
formulated using the weights and atoms located in a
neighborhood about that predictor value. This construction
results in a marginal DP prior for the random measure at any
specific predictor value. Dependence is induced through
local sharing of random components. Theoretical properties
are considered and a blocked Gibbs sampler is proposed for
posterior computation in lDP mixture models. The methods are
illustrated using simulated examples and an epidemiologic
application.},
Doi = {10.1007/s10463-008-0218-9},
Key = {fds258055}
}
@article{fds257960,
Author = {Rodríguez, A and Dunson, DB},
Title = {Nonparametric Bayesian models through probit stick-breaking
processes.},
Journal = {Bayesian Analysis},
Volume = {6},
Number = {1},
Pages = {145-178},
Year = {2011},
Month = {March},
ISSN = {1936-0975},
url = {http://dx.doi.org/10.1214/11-ba605},
Abstract = {We describe a novel class of Bayesian nonparametric priors
based on stick-breaking constructions where the weights of
the process are constructed as probit transformations of
normal random variables. We show that these priors are
extremely flexible, allowing us to generate a great variety
of models while preserving computational simplicity.
Particular emphasis is placed on the construction of rich
temporal and spatial processes, which are applied to two
problems in finance and ecology.},
Doi = {10.1214/11-ba605},
Key = {fds257960}
}
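The probit stick-breaking weights described in the abstract
above are easy to simulate under truncation, since each stick
is the standard normal cdf applied to a normal variate. A
minimal sketch, with the truncation level and the normal
location and scale chosen arbitrarily:

# Truncated probit stick-breaking: V_h = Phi(alpha_h),
# pi_h = V_h * prod_{l<h}(1 - V_l); the last weight absorbs the remainder.
import numpy as np
from scipy.stats import norm

rng = np.random.default_rng(2)
H = 20                                   # truncation level (assumption)
alpha = rng.normal(0.0, 1.0, H)
V = norm.cdf(alpha)                      # probit-transformed sticks
pi = V * np.concatenate(([1.0], np.cumprod(1.0 - V[:-1])))
pi[-1] = 1.0 - pi[:-1].sum()             # renormalize at the truncation
print(pi.sum())                          # exactly 1.0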
@article{fds257964,
Author = {Pati, D and Reich, BJ and Dunson, DB},
Title = {Bayesian geostatistical modelling with informative sampling
locations.},
Journal = {Biometrika},
Volume = {98},
Number = {1},
Pages = {35-48},
Year = {2011},
Month = {March},
ISSN = {0006-3444},
url = {http://dx.doi.org/10.1093/biomet/asq067},
Abstract = {We consider geostatistical models that allow the locations
at which data are collected to be informative about the
outcomes. A Bayesian approach is proposed, which models the
locations using a log Gaussian Cox process, while modelling
the outcomes conditionally on the locations as Gaussian with
a Gaussian process spatial random effect and adjustment for
the location intensity process. We prove posterior propriety
under an improper prior on the parameter controlling the
degree of informative sampling, demonstrating that the data
are informative. In addition, we show that the density of
the locations and mean function of the outcome process can
be estimated consistently under mild assumptions. The
methods show significant evidence of informative sampling
when applied to ozone data over Eastern U.S.A.},
Doi = {10.1093/biomet/asq067},
Key = {fds257964}
}
@article{fds258000,
Author = {Chen, M and Silva, J and Paisley, J and Wang, C and Dunson, D and Carin,
L},
Title = {Erratum: Compressive sensing on manifolds using a
nonparametric mixture of factor analyzers: Algorithm and
performance bounds (IEEE Transactions on Signal Processing
(2011) 58:12 (6140-6155))},
Journal = {IEEE Transactions on Signal Processing},
Volume = {59},
Number = {3},
Pages = {1329},
Publisher = {Institute of Electrical and Electronics Engineers
(IEEE)},
Year = {2011},
Month = {March},
ISSN = {1053-587X},
url = {http://dx.doi.org/10.1109/TSP.2011.2107810},
Doi = {10.1109/TSP.2011.2107810},
Key = {fds258000}
}
@article{fds258001,
Author = {Carin, L and Baraniuk, RG and Cevher, V and Dunson, D and Jordan, MI and Sapiro, G and Wakin, MB},
Title = {Learning Low-Dimensional Signal Models: A Bayesian approach
based on incomplete measurements.},
Journal = {IEEE Signal Processing Magazine},
Volume = {28},
Number = {2},
Pages = {39-51},
Year = {2011},
Month = {March},
ISSN = {1053-5888},
url = {http://dx.doi.org/10.1109/msp.2010.939733},
Abstract = {Sampling, coding, and streaming even the most essential
data, e.g., in medical imaging and weather-monitoring
applications, produce a data deluge that severely stresses
the available analog-to-digital converter, communication
bandwidth, and digital-storage resources. Surprisingly,
while the ambient data dimension is large in many problems,
the relevant information in the data can reside in a much
lower dimensional space.},
Doi = {10.1109/msp.2010.939733},
Key = {fds258001}
}
@article{fds258038,
Author = {Reich, BJ and Fuentes, M and Dunson, DB},
Title = {Bayesian Spatial Quantile Regression.},
Journal = {Journal of the American Statistical Association},
Volume = {106},
Number = {493},
Pages = {6-20},
Year = {2011},
Month = {March},
ISSN = {0162-1459},
url = {http://hdl.handle.net/10161/2981},
Abstract = {Tropospheric ozone is one of the six criteria pollutants
regulated by the United States Environmental Protection
Agency under the Clean Air Act and has been linked with
several adverse health effects, including mortality. Due to
the strong dependence on weather conditions, ozone may be
sensitive to climate change and there is great interest in
studying the potential effect of climate change on ozone,
and how this change may affect public health. In this paper
we develop a Bayesian spatial model to predict ozone under
different meteorological conditions, and use this model to
study spatial and temporal trends and to forecast ozone
concentrations under different climate scenarios. We develop
a spatial quantile regression model that does not assume
normality and allows the covariates to affect the entire
conditional distribution, rather than just the mean. The
conditional distribution is allowed to vary from
site-to-site and is smoothed with a spatial prior. For
extremely large datasets our model is computationally
infeasible, and we develop an approximate method. We apply
the approximate version of our model to summer ozone from
1997-2005 in the Eastern U.S., and use deterministic climate
models to project ozone under future climate conditions. Our
analysis suggests that holding all other factors fixed, an
increase in daily average temperature will lead to the
largest increase in ozone in the Industrial Midwest and
Northeast.},
Doi = {10.1198/jasa.2010.ap09237},
Key = {fds258038}
}
@article{fds257965,
Author = {Dzirasa, K and McGarity, DL and Bhattacharya, A and Kumar, S and Takahashi, JS and Dunson, D and McClung, CA and Nicolelis,
MAL},
Title = {Impaired limbic gamma oscillatory synchrony during
anxiety-related behavior in a genetic mouse model of bipolar
mania.},
Journal = {J Neurosci},
Volume = {31},
Number = {17},
Pages = {6449-6456},
Year = {2011},
Month = {April},
url = {http://www.ncbi.nlm.nih.gov/pubmed/21525286},
Abstract = {Alterations in anxiety-related processing are observed
across many neuropsychiatric disorders, including bipolar
disorder. Though polymorphisms in a number of circadian
genes confer risk for this disorder, little is known about
how changes in circadian gene function disrupt brain
circuits critical for anxiety-related processing. Here we
characterize neurophysiological activity simultaneously
across five limbic brain areas (nucleus accumbens, amygdala,
prelimbic cortex, ventral hippocampus, and ventral tegmental
area) as wild-type (WT) mice and mice with a mutation in the
circadian gene, CLOCK (Clock-Δ19 mice) perform an elevated
zero maze task. In WT mice, basal limbic gamma oscillatory
synchrony observed before task performance predicted future
anxiety-related behaviors. Additionally, dynamic changes in
limbic gamma oscillatory synchrony were observed based on
the position of WT mice in the zero maze. Clock-Δ19 mice,
which displayed an increased propensity to enter the open
section of the elevated maze, showed profound deficits in
these anxiety-related circuit processes. Thus, our findings
link the anxiety-related behavioral deficits observed in
Clock-Δ19 mice with dysfunctional gamma oscillatory tuning
across limbic circuits and suggest that alterations in
limbic oscillatory circuit function induced by circadian
gene polymorphisms may contribute to the behavioral
manifestations seen in bipolar mania.},
Doi = {10.1523/JNEUROSCI.6144-10.2011},
Key = {fds257965}
}
@article{fds257967,
Author = {Page, GL and Dunson, DB},
Title = {Bayesian Local Contamination Models for Multivariate
Outliers.},
Journal = {Technometrics: A Journal of Statistics for the Physical,
Chemical, and Engineering Sciences},
Volume = {53},
Number = {2},
Pages = {152-162},
Year = {2011},
Month = {May},
ISSN = {0040-1706},
url = {http://dx.doi.org/10.1198/tech.2011.10041},
Abstract = {In studies where data are generated from multiple locations
or sources it is common for there to exist observations that
are quite unlike the majority. Motivated by the application
of establishing a reference value in an inter-laboratory
setting when outlying labs are present, we propose a local
contamination model that is able to accommodate unusual
multivariate realizations in a flexible way. The proposed
method models the process level of a hierarchical model
using a mixture with a parametric component and a possibly
nonparametric contamination. Much of the flexibility in the
methodology is achieved by allowing varying random subsets
of the elements in the lab-specific mean vectors to be
allocated to the contamination component. Computational
methods are developed and the methodology is compared to
three other possible approaches using a simulation study. We
apply the proposed method to a NIST/NOAA sponsored
inter-laboratory study which motivated the methodological
development.},
Doi = {10.1198/tech.2011.10041},
Key = {fds257967}
}
@article{fds258033,
Author = {Crandell, JL and Dunson, DB},
Title = {Posterior simulation across nonparametric models for
functional clustering},
Journal = {Sankhya B},
Volume = {73},
Number = {1},
Pages = {42-61},
Publisher = {Springer Nature},
Year = {2011},
Month = {May},
ISSN = {0972-7671},
url = {http://dx.doi.org/10.1007/s13571-011-0014-z},
Abstract = {By choosing a species sampling random probability measure
for the distribution of the basis coefficients, a general
class of nonparametric Bayesian methods for clustering of
functional data is developed. Allowing the basis functions
to be unknown, one faces the problem of posterior simulation
over a high-dimensional space of semiparametric models. To
address this problem, we propose a novel Metropolis-Hastings
algorithm for moving between models, with a nested
generalized collapsed Gibbs sampler for updating the model
parameters. Focusing on Dirichlet process priors for the
distribution of the basis coefficients in multivariate
linear spline models, we apply the approach to the problem
of clustering of hormone trajectories. This approach allows
the number of clusters and the shape of the trajectories
within each cluster to be unknown. The methodology can be
applied broadly to allow uncertainty in variable selection
in semiparametric Bayes hierarchical models.},
Doi = {10.1007/s13571-011-0014-z},
Key = {fds258033}
}
@article{fds257966,
Author = {Bhattacharya, A and Dunson, DB},
Title = {Sparse Bayesian infinite factor models.},
Journal = {Biometrika},
Volume = {98},
Number = {2},
Pages = {291-306},
Year = {2011},
Month = {June},
ISSN = {0006-3444},
url = {http://www.ncbi.nlm.nih.gov/pubmed/23049129},
Abstract = {We focus on sparse modelling of high-dimensional covariance
matrices using Bayesian latent factor models. We propose a
multiplicative gamma process shrinkage prior on the factor
loadings which allows introduction of infinitely many
factors, with the loadings increasingly shrunk towards zero
as the column index increases. We use our prior on a
parameter-expanded loading matrix to avoid the order
dependence typical in factor analysis models and develop an
efficient Gibbs sampler that scales well as data
dimensionality increases. The gain in efficiency is achieved
by the joint conjugacy property of the proposed prior, which
allows block updating of the loadings matrix. We propose an
adaptive Gibbs sampler for automatically truncating the
infinite loading matrix through selection of the number of
important factors. Theoretical results are provided on the
support of the prior and truncation approximation bounds. A
fast algorithm is proposed to produce approximate Bayes
estimates. Latent factor regression methods are developed
for prediction and variable selection in applications with
high-dimensional correlated predictors. Operating
characteristics are assessed through simulation studies, and
the approach is applied to predict survival times from gene
expression data.},
Doi = {10.1093/biomet/asr013},
Key = {fds257966}
}
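The multiplicative gamma process prior in the abstract above
admits a simple forward simulation: column precisions are
cumulative products of gamma variates, so loadings are
stochastically shrunk toward zero as the column index grows.
A sketch with illustrative hyperparameter values (the notation
loosely follows the abstract; this is not the authors' code):

# p x H loadings under a multiplicative gamma process prior:
# lambda_{jh} ~ N(0, 1/(phi_{jh} * tau_h)), tau_h = prod_{l<=h} delta_l.
import numpy as np

rng = np.random.default_rng(3)
p, H = 50, 15
a1, a2, nu = 2.0, 3.0, 3.0               # illustrative hyperparameters
delta = np.concatenate((rng.gamma(a1, 1.0, 1), rng.gamma(a2, 1.0, H - 1)))
tau = np.cumprod(delta)                  # increasing column precisions
phi = rng.gamma(nu / 2.0, 2.0 / nu, (p, H))   # local precisions
lam = rng.normal(0.0, 1.0 / np.sqrt(phi * tau[None, :]), (p, H))
print(np.abs(lam).mean(axis=0))          # column magnitudes decay on average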
@article{fds258035,
Author = {Liu, F and Dunson, D and Zou, F},
Title = {High-dimensional variable selection in meta-analysis for
censored data.},
Journal = {Biometrics},
Volume = {67},
Number = {2},
Pages = {504-512},
Year = {2011},
Month = {June},
ISSN = {0006-341X},
url = {http://dx.doi.org/10.1111/j.1541-0420.2010.01466.x},
Abstract = {This article considers the problem of selecting predictors
of time to an event from a high-dimensional set of candidate
predictors using data from multiple studies. As an
alternative to the current multistage testing approaches, we
propose to model the study-to-study heterogeneity explicitly
using a hierarchical model to borrow strength. Our method
incorporates censored data through an accelerated failure
time model. Using a carefully formulated prior
specification, we develop a fast approach to predictor
selection and shrinkage estimation for high-dimensional
predictors. For model fitting, we develop a Monte Carlo
expectation maximization (MC-EM) algorithm to accommodate
censored data. The proposed approach, which is related to
the relevance vector machine (RVM), relies on maximum a
posteriori estimation to rapidly obtain a sparse estimate.
As for the typical RVM, there is an intrinsic thresholding
property in which unimportant predictors tend to have their
coefficients shrunk to zero. We compare our method with some
commonly used procedures through simulation studies. We also
illustrate the method using the gene expression barcode data
from three breast cancer studies.},
Doi = {10.1111/j.1541-0420.2010.01466.x},
Key = {fds258035}
}
@article{fds257968,
Author = {Xing, C and Dunson, DB},
Title = {Bayesian inference for genomic data integration reduces
misclassification rate in predicting protein-protein
interactions.},
Journal = {PLoS computational biology},
Volume = {7},
Number = {7},
Pages = {e1002110},
Year = {2011},
Month = {July},
url = {http://www.ncbi.nlm.nih.gov/pubmed/21829334},
Abstract = {Protein-protein interactions (PPIs) are essential to most
fundamental cellular processes. There has been increasing
interest in reconstructing PPI networks. However, several
critical difficulties exist in obtaining reliable
predictions. Notably, false positive rates can be as high
as >80%. Error correction from each generating source can be
both time-consuming and inefficient due to the difficulty of
covering the errors from multiple levels of data processing
procedures within a single test. We propose a novel Bayesian
integration method, deemed nonparametric Bayes ensemble
learning (NBEL), to lower the misclassification rate (both
false positives and negatives) through automatically
up-weighting data sources that are most informative, while
down-weighting less informative and biased sources.
Extensive studies indicate that NBEL is significantly more
robust than the classic naïve Bayes to unreliable,
error-prone and contaminated data. On a large human data set
our NBEL approach predicts many more PPIs than naïve Bayes.
This suggests that previous studies may have large numbers
of not only false positives but also false negatives.
Validation on two high-quality human PPI datasets
supports our observations. Our experiments demonstrate that
it is feasible to predict high-throughput PPIs
computationally with substantially reduced false positives
and false negatives. The ability to predict large numbers
of PPIs both reliably and automatically may encourage the use
of computational approaches to correct data errors in
general, and may speed up high-quality PPI prediction.
Such reliable predictions may provide a solid platform for
other studies, such as protein function prediction and the
roles of PPIs in disease susceptibility.},
Doi = {10.1371/journal.pcbi.1002110},
Key = {fds257968}
}
@article{fds257969,
Author = {Armagan, A and Dunson, D},
Title = {Sparse variational analysis of linear mixed models for large
data sets},
Journal = {Statistics and Probability Letters},
Volume = {81},
Number = {8},
Pages = {1056-1062},
Publisher = {Elsevier BV},
Year = {2011},
Month = {August},
ISSN = {0167-7152},
url = {http://dx.doi.org/10.1016/j.spl.2011.02.029},
Abstract = {It is increasingly common to be faced with longitudinal or
multi-level data sets that have large numbers of predictors
and/or a large sample size. Current methods of fitting and
inference for mixed effects models tend to perform poorly in
such settings. When there are many variables, it is
appealing to allow uncertainty in subset selection and to
obtain a sparse characterization of the data. Bayesian
methods are available to address these goals using Markov
chain Monte Carlo (MCMC), but MCMC is very computationally
expensive and can be infeasible in large p and/or large n
problems. As a fast approximate Bayes solution, we recommend
a novel approximation to the posterior relying on
variational methods. Variational methods are used to
approximate the posterior of the parameters in a
decomposition of the variance components, with priors chosen
to obtain a sparse solution that allows selection of random
effects. The method is evaluated through a simulation study,
and applied to an epidemiological application. © 2011
Elsevier B.V.},
Doi = {10.1016/j.spl.2011.02.029},
Key = {fds257969}
}
@article{fds257993,
Author = {Zhou, M and Yang, H and Sapiro, G and Dunson, D and Carin,
L},
Title = {Covariate-dependent dictionary learning and sparse
coding},
Journal = {ICASSP, IEEE International Conference on Acoustics, Speech
and Signal Processing - Proceedings},
Pages = {5824-5827},
Publisher = {IEEE},
Year = {2011},
Month = {August},
ISSN = {1520-6149},
url = {http://dx.doi.org/10.1109/ICASSP.2011.5947685},
Abstract = {A dependent hierarchical beta process (dHBP) is developed as
a prior for data that may be represented in terms of a
sparse set of latent features (dictionary elements), with
covariate-dependent feature usage. The dHBP is applicable to
general covariates and data models, imposing that signals
with similar covariates are likely to be manifested in terms
of similar features. As an application, we consider the
simultaneous sparse modeling of multiple images, with the
covariate of a given image linked to its similarity to all
other images (as applied in manifold learning). Efficient
inference is performed using hybrid Gibbs,
Metropolis-Hastings and slice sampling. © 2011
IEEE.},
Doi = {10.1109/ICASSP.2011.5947685},
Key = {fds257993}
}
@article{fds257962,
Author = {Yang, H and O'Brien, S and Dunson, DB},
Title = {Nonparametric Bayes Stochastically Ordered Latent Class
Models.},
Journal = {J Am Stat Assoc},
Volume = {106},
Number = {495},
Pages = {807-817},
Year = {2011},
Month = {September},
ISSN = {0162-1459},
url = {http://www.ncbi.nlm.nih.gov/pubmed/22505787},
Abstract = {Latent class models (LCMs) are used increasingly for
addressing a broad variety of problems, including sparse
modeling of multivariate and longitudinal data, model-based
clustering, and flexible inferences on predictor effects.
Typical frequentist LCMs require estimation of a single
finite number of classes, which does not increase with the
sample size, and have a well-known sensitivity to parametric
assumptions on the distributions within a class. Bayesian
nonparametric methods have been developed to allow an
infinite number of classes in the general population, with
the number represented in a sample increasing with sample
size. In this article, we propose a new nonparametric Bayes
model that allows predictors to flexibly impact the
allocation to latent classes, while limiting sensitivity to
parametric assumptions by allowing class-specific
distributions to be unknown subject to a stochastic ordering
constraint. An efficient MCMC algorithm is developed for
posterior computation. The methods are validated using
simulation studies and applied to the problem of ranking
medical procedures in terms of the distribution of patient
morbidity.},
Doi = {10.1198/jasa.2011.ap10058},
Key = {fds257962}
}
@article{fds257970,
Author = {Wang, L and Dunson, DB},
Title = {Semiparametric Bayes' proportional odds models for current
status data with underreporting.},
Journal = {Biometrics},
Volume = {67},
Number = {3},
Pages = {1111-1118},
Year = {2011},
Month = {September},
ISSN = {0006-341X},
url = {http://dx.doi.org/10.1111/j.1541-0420.2010.01532.x},
Abstract = {Current status data are a type of interval-censored event
time data in which all the individuals are either left or
right censored. For example, our motivation is drawn from a
cross-sectional study, which measured whether or not fibroid
onset had occurred by the age of an ultrasound exam for each
woman. We propose a semiparametric Bayesian proportional
odds model in which the baseline event time distribution is
estimated nonparametrically by using adaptive monotone
splines in a logistic regression model and the potential
risk factors are included in the parametric part of the mean
structure. The proposed approach has the advantage of being
straightforward to implement using a simple and efficient
Gibbs sampler, whereas alternative semiparametric Bayes'
event time models encounter problems for current status
data. The model is generalized to allow systematic
underreporting in a subset of the data, and the methods are
applied to an epidemiologic study of uterine
fibroids.},
Doi = {10.1111/j.1541-0420.2010.01532.x},
Key = {fds257970}
}
@article{fds257972,
Author = {Wang, L and Dunson, DB},
Title = {Bayesian isotonic density regression.},
Journal = {Biometrika},
Volume = {98},
Number = {3},
Pages = {537-551},
Year = {2011},
Month = {September},
ISSN = {0006-3444},
url = {http://dx.doi.org/10.1093/biomet/asr025},
Abstract = {Density regression models allow the conditional distribution
of the response given predictors to change flexibly over the
predictor space. Such models are much more flexible than
nonparametric mean regression models with nonparametric
residual distributions, and are well supported in many
applications. A rich variety of Bayesian methods have been
proposed for density regression, but it is not clear whether
such priors have full support so that any true
data-generating model can be accurately approximated. This
article develops a new class of density regression models
that incorporate stochastic-ordering constraints which are
natural when a response tends to increase or decrease
monotonely with a predictor. Theory is developed showing
large support. Methods are developed for hypothesis testing,
with posterior computation relying on a simple Gibbs
sampler. Frequentist properties are illustrated in a
simulation study, and an epidemiology application is
considered.},
Doi = {10.1093/biomet/asr025},
Key = {fds257972}
}
@article{fds257961,
Author = {Hannah, LA and Dunson, DB},
Title = {Approximate dynamic programming for storage
problems},
Journal = {Proceedings of the 28th International Conference on Machine
Learning, ICML 2011},
Pages = {337-344},
Year = {2011},
Month = {October},
Abstract = {Storage problems are an important subclass of stochastic
control problems. This paper presents a new method,
approximate dynamic programming for storage (ADPS), to solve
storage problems with continuous, convex decision sets.
Unlike other solution procedures, ADPS allows math
programming to be used to make decisions in each time period,
even in the presence of large state variables. We test ADPS
on the day ahead wind commitment problem with storage.
Copyright 2011 by the author(s)/owner(s).},
Key = {fds257961}
}
@article{fds257994,
Author = {Chen, B and Polatkan, G and Sapiro, G and Dunson, DB and Carin,
L},
Title = {The hierarchical beta process for convolutional factor
analysis and deep learning},
Journal = {Proceedings of the 28th International Conference on Machine
Learning, ICML 2011},
Pages = {361-368},
Year = {2011},
Month = {October},
Abstract = {A convolutional factor-analysis model is developed, with the
number of filters (factors) inferred via the beta process
(BP) and hierarchical BP, for single-task and multi-task
learning, respectively. The computation of the model
parameters is implemented within a Bayesian setting,
employing Gibbs sampling; we explicitly exploit the
convolutional nature of the expansion to accelerate
computations. The model is used in a multi-level ("deep")
analysis of general data, with specific results presented
for image-processing data sets, e.g., classification.
Copyright 2011 by the author(s)/owner(s).},
Key = {fds257994}
}
@article{fds257957,
Author = {Gordon, GJ and Dunson, D},
Title = {Preface to the proceedings of AISTATS 2011},
Journal = {Journal of Machine Learning Research},
Volume = {15},
Pages = {1-2},
Year = {2011},
Month = {December},
ISSN = {1532-4435},
Key = {fds257957}
}
@article{fds257971,
Author = {Canale, A and Dunson, DB},
Title = {Bayesian Kernel Mixtures for Counts.},
Journal = {Journal of the American Statistical Association},
Volume = {106},
Number = {496},
Pages = {1528-1539},
Year = {2011},
Month = {December},
ISSN = {0162-1459},
url = {http://dx.doi.org/10.1198/jasa.2011.tm10552},
Abstract = {Although Bayesian nonparametric mixture models for
continuous data are well developed, there is a limited
literature on related approaches for count data. A common
strategy is to use a mixture of Poissons, which
unfortunately is quite restrictive in not accounting for
distributions having variance less than the mean. Other
approaches include mixing multinomials, which requires
finite support, and using a Dirichlet process prior with a
Poisson base measure, which does not allow smooth deviations
from the Poisson. As a broad class of alternative models, we
propose to use nonparametric mixtures of rounded continuous
kernels. An efficient Gibbs sampler is developed for
posterior computation, and a simulation study is performed
to assess performance. Focusing on the rounded Gaussian
case, we generalize the modeling framework to account for
multivariate count data, joint modeling with continuous and
categorical variables, and other complications. The methods
are illustrated through applications to a developmental
toxicity study and marketing data. This article has
supplementary material online.},
Doi = {10.1198/jasa.2011.tm10552},
Key = {fds257971}
}
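The rounded-kernel idea in the abstract above is easy to see
generatively: draw a latent continuous variable from a kernel
mixture and round it onto the nonnegative integers. A minimal
sketch; the threshold rule used here (y = 0 if y* < 0, else
floor(y*) + 1) is one simple choice, and the paper treats
thresholds more generally:

# Counts from rounding a latent two-component Gaussian mixture.
import numpy as np

rng = np.random.default_rng(5)
n = 1000
comp = rng.integers(0, 2, n)                # mixture component labels
ystar = np.where(comp == 0,
                 rng.normal(1.0, 0.5, n),   # low-count component
                 rng.normal(6.0, 2.0, n))   # high-count component
y = np.where(ystar < 0, 0, np.floor(ystar) + 1).astype(int)
print(np.bincount(y))                       # induced count distribution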
@article{fds257999,
Author = {Zhou, M and Carin, L and Yang, H and Dunson, D and Sapiro,
G},
Title = {Dependent hierarchical beta process for image interpolation
and denoising},
Journal = {Journal of Machine Learning Research},
Volume = {15},
Pages = {883-891},
Year = {2011},
Month = {December},
ISSN = {1532-4435},
Abstract = {A dependent hierarchical beta process (dHBP) is developed as
a prior for data that may be represented in terms of a
sparse set of latent features, with covariate-dependent
feature usage. The dHBP is applicable to general covariates
and data models, imposing that signals with similar
covariates are likely to be manifested in terms of similar
features. Coupling the dHBP with the Bernoulli process, and
upon marginalizing out the dHBP, the model may be
interpreted as a covariate-dependent hierarchical Indian
buffet process. As applications, we consider interpolation
and denoising of an image, with covariates defined by the
location of image patches within an image. Two types of
noise models are considered: (i) typical white Gaussian
noise; and (ii) spiky noise of arbitrary amplitude,
distributed uniformly at random. In these examples, the
features correspond to the atoms of a dictionary, learned
based upon the data under test (without a priori training
data). State-of-the-art performance is demonstrated, with
efficient inference using hybrid Gibbs, Metropolis-Hastings
and slice sampling. Copyright 2011 by the
authors.},
Key = {fds257999}
}
@article{fds258003,
Author = {Zhou, M and Chen, H and Paisley, J and Ren, L and Li, L and Xing, Z and Dunson, D and Sapiro, G and Carin, L},
Title = {Nonparametric Bayesian dictionary learning for analysis of
noisy and incomplete images.},
Journal = {IEEE Transactions on Image Processing: A Publication of the
IEEE Signal Processing Society},
Volume = {21},
Number = {1},
Pages = {130-144},
Year = {2012},
Month = {January},
url = {http://www.ncbi.nlm.nih.gov/pubmed/21693421},
Abstract = {Nonparametric Bayesian methods are considered for recovery
of imagery based upon compressive, incomplete, and/or noisy
measurements. A truncated beta-Bernoulli process is employed
to infer an appropriate dictionary for the data under test
and also for image recovery. In the context of compressive
sensing, significant improvements in image recovery are
manifested using learned dictionaries, relative to using
standard orthonormal image expansions. The
compressive-measurement projections are also optimized for
the learned dictionary. Additionally, we consider simpler
(incomplete) measurements, defined by measuring a subset of
image pixels, uniformly selected at random. Spatial
interrelationships within imagery are exploited through use
of the Dirichlet and probit stick-breaking processes.
Several example results are presented, with comparisons to
other methods in the literature.},
Doi = {10.1109/tip.2011.2160072},
Key = {fds258003}
}
@article{fds258045,
Author = {Dunson, DB and Xing, C},
Title = {Nonparametric Bayes Modeling of Multivariate Categorical
Data.},
Journal = {Journal of the American Statistical Association},
Volume = {104},
Number = {487},
Pages = {1042-1051},
Year = {2009},
Month = {September},
ISSN = {0162-1459},
url = {http://dx.doi.org/10.1198/jasa.2009.tm08439},
Abstract = {Modeling of multivariate unordered categorical (nominal)
data is a challenging problem, particularly in high
dimensions and cases in which one wishes to avoid strong
assumptions about the dependence structure. Commonly used
approaches rely on the incorporation of latent Gaussian
random variables or parametric latent class models. The goal
of this article is to develop a nonparametric Bayes
approach, which defines a prior with full support on the
space of distributions for multiple unordered categorical
variables. This support condition ensures that we are not
restricting the dependence structure a priori. We show this
can be accomplished through a Dirichlet process mixture of
product multinomial distributions, which is also a
convenient form for posterior computation. Methods for
nonparametric testing of violations of independence are
proposed, and the methods are applied to model positional
dependence within transcription factor binding
motifs.},
Doi = {10.1198/jasa.2009.tm08439},
Key = {fds258045}
}
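The Dirichlet process mixture of product multinomials in the
abstract above has a transparent finite-truncation analogue:
mixture weights combined with independent multinomial kernels
for each variable. A sketch under assumptions (truncated to H
components, uniform Dirichlet hyperparameters):

# Multivariate categorical draws from a truncated mixture of
# product-multinomial kernels.
import numpy as np

rng = np.random.default_rng(6)
p, d, H, n = 4, 3, 5, 200          # variables, categories, components, rows
nu = rng.dirichlet(np.ones(H))               # mixture weights
psi = rng.dirichlet(np.ones(d), size=(H, p)) # component-wise marginals
z = rng.choice(H, size=n, p=nu)              # component labels
y = np.stack([[rng.choice(d, p=psi[z[i], j]) for j in range(p)]
              for i in range(n)])            # n x p categorical matrix
print(y[:5])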
@article{fds323139,
Author = {Carin, L and Hero, A and Lucas, J and Dunson, D and Chen, M and Henao, R and Tibau-Puig, A and Zaas, A and Woods, CW and Ginsburg,
GS},
Title = {High-Dimensional Longitudinal Genomic Data: An analysis used
for monitoring viral infections.},
Journal = {IEEE Signal Process Mag},
Volume = {29},
Number = {1},
Pages = {108-123},
Year = {2012},
Month = {January},
url = {http://dx.doi.org/10.1109/MSP.2011.943009},
Doi = {10.1109/MSP.2011.943009},
Key = {fds323139}
}
@article{fds322562,
Author = {Fyshe, A and Fox, E and Dunson, D and Mitchell, T},
Title = {Hierarchical latent dictionaries for models of brain
activation},
Journal = {Journal of Machine Learning Research},
Volume = {22},
Pages = {409-421},
Year = {2012},
Month = {January},
Abstract = {In this work, we propose a hierarchical latent dictionary
approach to estimate the time-varying mean and covariance of
a process for which we have only limited noisy samples. We
fully leverage the limited sample size and redundancy in
sensor measurements by transferring knowledge through a
hierarchy of lower dimensional latent processes. As a case
study, we utilize Magnetoencephalography (MEG) recordings of
brain activity to identify the word being viewed by a human
subject. Specifically, we identify the word category for a
single noisy MEG recording, when only given limited noisy
samples on which to train.},
Key = {fds322562}
}
@article{fds323268,
Author = {Zhou, M and Hannah, LA and Dunson, DB and Carin, L},
Title = {Beta-negative binomial process and poisson factor
analysis},
Journal = {Journal of Machine Learning Research},
Volume = {22},
Pages = {1462-1471},
Year = {2012},
Month = {January},
Abstract = {A beta-negative binomial (BNB) process is proposed, leading
to a beta-gamma-Poisson process, which may be viewed as a
"multiscoop" generalization of the beta-Bernoulli process.
The BNB process is augmented into a beta-gamma-gamma-Poisson
hierarchical structure, and applied as a nonparametric
Bayesian prior for an infinite Poisson factor analysis
model. A finite approximation for the beta process Lévy
random measure is constructed for convenient implementation.
Efficient MCMC computations are performed with data
augmentation and marginalization techniques. Encouraging
results are shown on document count matrix
factorization.},
Key = {fds323268}
}
@article{fds257973,
Author = {Bhattacharya, A and Dunson, DB},
Title = {Simplex Factor Models for Multivariate Unordered Categorical
Data.},
Journal = {Journal of the American Statistical Association},
Volume = {107},
Number = {497},
Pages = {362-377},
Year = {2012},
Month = {March},
ISSN = {0162-1459},
url = {http://dx.doi.org/10.1080/01621459.2011.646934},
Abstract = {Gaussian latent factor models are routinely used for
modeling of dependence in continuous, binary, and ordered
categorical data. For unordered categorical variables,
Gaussian latent factor models lead to challenging
computation and complex modeling structures. As an
alternative, we propose a novel class of simplex factor
models. In the single-factor case, the model treats the
different categorical outcomes as independent with unknown
marginals. The model can characterize flexible dependence
structures parsimoniously with few factors, and as factors
are added, any multivariate categorical data distribution
can be accurately approximated. Using a Bayesian approach
for computation and inferences, a Markov chain Monte Carlo
(MCMC) algorithm is proposed that scales well with
increasing dimension, with the number of factors treated as
unknown. We develop an efficient proposal for updating the
base probability vector in hierarchical Dirichlet models.
Theoretical properties are described, and we evaluate the
approach through simulation examples. Applications are
described for modeling dependence in nucleotide sequences
and prediction from high-dimensional categorical
features.},
Doi = {10.1080/01621459.2011.646934},
Key = {fds257973}
}
@article{fds257975,
Author = {Bhattacharya, A and Dunson, DB},
Title = {Strong consistency of nonparametric Bayes density estimation
on compact metric spaces with applications to specific
manifolds.},
Journal = {Annals of the Institute of Statistical Mathematics},
Volume = {64},
Number = {4},
Pages = {687-714},
Year = {2012},
Month = {August},
ISSN = {0020-3157},
url = {http://dx.doi.org/10.1007/s10463-011-0341-x},
Abstract = {This article considers a broad class of kernel mixture
density models on compact metric spaces and manifolds.
Following a Bayesian approach with a nonparametric prior on
the location mixing distribution, sufficient conditions are
obtained on the kernel, prior and the underlying space for
strong posterior consistency at any continuous density. The
prior is also allowed to depend on the sample size n and
sufficient conditions are obtained for weak and strong
consistency. These conditions are verified on compact
Euclidean spaces using multivariate Gaussian kernels, on the
hypersphere using a von Mises-Fisher kernel and on the
planar shape space using complex Watson kernels.},
Doi = {10.1007/s10463-011-0341-x},
Key = {fds257975}
}
@article{fds257974,
Author = {Bhattacharya, A and Dunson, D},
Title = {Nonparametric Bayes classification and hypothesis testing on
manifolds},
Journal = {Journal of Multivariate Analysis},
Volume = {111},
Pages = {1-19},
Publisher = {Elsevier BV},
Year = {2012},
Month = {October},
ISSN = {0047-259X},
url = {http://dx.doi.org/10.1016/j.jmva.2012.02.020},
Abstract = {Our first focus is prediction of a categorical response
variable using features that lie on a general manifold. For
example, the manifold may correspond to the surface of a
hypersphere. We propose a general kernel mixture model for
the joint distribution of the response and predictors, with
the kernel expressed in product form and dependence induced
through the unknown mixing measure. We provide simple
sufficient conditions for large support and weak and strong
posterior consistency in estimating both the joint
distribution of the response and predictors and the
conditional distribution of the response. Focusing on a
Dirichlet process prior for the mixing measure, these
conditions hold using von Mises-Fisher kernels when the
manifold is the unit hypersphere. In this case, Bayesian
methods are developed for efficient posterior computation
using slice sampling. Next we develop Bayesian nonparametric
methods for testing whether there is a difference in
distributions between groups of observations on the manifold
having unknown densities. We prove consistency of the Bayes
factor and develop efficient computational methods for its
calculation. The proposed classification and testing methods
are evaluated using simulation examples and applied to
spherical data applications. © 2012 Elsevier
Inc.},
Doi = {10.1016/j.jmva.2012.02.020},
Key = {fds257974}
}
@article{fds257977,
Author = {Hua, Z and Dunson, DB and Gilmore, JH and Styner, MA and Zhu,
H},
Title = {Semiparametric Bayesian local functional models for
diffusion tensor tract statistics.},
Journal = {NeuroImage},
Volume = {63},
Number = {1},
Pages = {460-474},
Year = {2012},
Month = {October},
ISSN = {1053-8119},
url = {http://dx.doi.org/10.1016/j.neuroimage.2012.06.027},
Abstract = {We propose a semiparametric Bayesian local functional model
(BFM) for the analysis of multiple diffusion properties
(e.g., fractional anisotropy) along white matter fiber
bundles with a set of covariates of interest, such as age
and gender. BFM accounts for heterogeneity in the shape of
the fiber bundle diffusion properties among subjects, while
allowing the impact of the covariates to vary across
subjects. A nonparametric Bayesian LPP2 prior facilitates
global and local borrowings of information among subjects,
while an infinite factor model flexibly represents
low-dimensional structure. Local hypothesis testing and
credible bands are developed to identify fiber segments,
along which multiple diffusion properties are significantly
associated with covariates of interest, while controlling
for multiple comparisons. Moreover, BFM naturally groups
subjects into more homogeneous clusters. Posterior
computation proceeds via an efficient Markov chain Monte
Carlo algorithm. A simulation study is performed to evaluate
the finite sample performance of BFM. We apply BFM to
investigate the development of white matter diffusivities
along the splenium of the corpus callosum tract and the
right internal capsule tract in a clinical study of
neurodevelopment in new born infants.},
Doi = {10.1016/j.neuroimage.2012.06.027},
Key = {fds257977}
}
@article{fds258025,
Author = {Hannah, LA and Dunson, DB},
Title = {Ensemble methods for convex regression with applications to
geometric programming based circuit design},
Journal = {Proceedings of the 29th International Conference on Machine
Learning, ICML 2012},
Volume = {1},
Pages = {369-376},
Year = {2012},
Month = {October},
Abstract = {Convex regression is a promising area for bridging
statistical estimation and deterministic convex
optimization. New piecewise linear convex regression methods
(Hannah and Dunson, 2011; Magnani and Boyd, 2009) are fast
and scalable, but can have instability when used to
approximate constraints or objective functions for
optimization. Ensemble methods, like bagging, smearing and
random partitioning, can alleviate this problem and maintain
the theoretical properties of the underlying estimator. We
empirically examine the performance of ensemble methods for
prediction and optimization, and then apply them to device
modeling and constraint approximation for geometric
programming based circuit design. Copyright 2012 by the
author(s)/owner(s).},
Key = {fds258025}
}
@article{fds258026,
Author = {Shterev, ID and Dunson, DB},
Title = {Bayesian watermark attacks},
Journal = {Proceedings of the 29th International Conference on Machine
Learning, ICML 2012},
Volume = {1},
Pages = {695-702},
Year = {2012},
Month = {October},
Abstract = {This paper presents an application of statistical machine
learning to the field of watermarking. We propose a new
attack model on additive spread-spectrum watermarking
systems. The proposed attack is based on Bayesian
statistics. We consider the scenario in which a watermark
signal is repeatedly embedded in specific segments (signals)
of the host data, possibly chosen based on a secret message
bitstream. The host signal can represent a patch of
pixels from an image or a video frame. We propose a
probabilistic model that infers the embedded message
bit-stream and watermark signal, directly from the
watermarked data, without access to the decoder. We develop
an efficient Markov chain Monte Carlo sampler for updating
the model parameters from their conjugate full conditional
posteriors. We also provide a variational Bayesian solution,
which further increases the convergence speed of the
algorithm. Experiments with synthetic and real image signals
demonstrate that the attack model is able to correctly infer
a large part of the message bitstream and obtain a very
accurate estimate of the watermark signal. Copyright 2012 by
the author(s)/owner(s).},
Key = {fds258026}
}
@article{fds258027,
Author = {Zhou, M and Li, L and Dunson, D and Carin, L},
Title = {Lognormal and gamma mixed negative binomial
regression},
Journal = {Proceedings of the 29th International Conference on Machine
Learning, ICML 2012},
Volume = {2},
Pages = {1343-1350},
Year = {2012},
Month = {October},
url = {http://hdl.handle.net/10161/8954},
Abstract = {In regression analysis of counts, a lack of simple and
efficient algorithms for posterior computation has made
Bayesian approaches appear unattractive and thus
underdeveloped. We propose a lognormal and gamma mixed
negative binomial (NB) regression model for counts, and
present efficient closed-form Bayesian inference; unlike
conventional Poisson models, the proposed approach has two
free parameters to include two different kinds of random
effects, and allows the incorporation of prior information,
such as sparsity in the regression coefficients. By placing
a gamma distribution prior on the NB dispersion parameter r,
and connecting a log-normal distribution prior with the
logit of the NB probability parameter p, efficient Gibbs
sampling and variational Bayes inference are both developed.
The closed-form updates are obtained by exploiting
conditional conjugacy via both a compound Poisson
representation and a Polya-Gamma distribution based data
augmentation approach. The proposed Bayesian inference can
be implemented routinely, while being easily generalizable
to more complex settings involving multivariate dependence
structures. The algorithms are illustrated using real
examples. Copyright 2012 by the author(s)/owner(s).},
Key = {fds258027}
}
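The generative side of the model in the abstract above is
simple to sketch: a gamma-distributed dispersion r and a
Gaussian (log-normal on the odds scale) random effect on the
logit of the NB probability. A forward simulation with
invented design, coefficients, and hyperparameters; the
paper's Polya-Gamma-based inference is not shown:

# Forward-simulate lognormal and gamma mixed NB regression:
# r ~ Gamma, logit(p_i) = x_i' beta + eps_i, y_i ~ NB(r, p_i).
import numpy as np

rng = np.random.default_rng(4)
n, d = 500, 3
X = rng.normal(size=(n, d))
beta = np.array([0.5, -1.0, 0.25])
r = rng.gamma(2.0, 1.0)                  # NB dispersion parameter
psi = X.dot(beta) + rng.normal(0.0, 0.3, n)
p = 1.0 / (1.0 + np.exp(-psi))           # logit(p_i) = psi_i
# NumPy counts failures with success probability 1 - p_i, which
# matches y_i ~ NB(r, p_i) with mean r * p_i / (1 - p_i).
y = rng.negative_binomial(r, 1.0 - p)
print(y.mean(), y.var())                 # overdispersed counts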
@article{fds257875,
Author = {Petralia, F and Rao, V and Dunson, DB},
Title = {Repulsive mixtures},
Journal = {Advances in Neural Information Processing
Systems},
Volume = {3},
Pages = {1889-1897},
Year = {2012},
Month = {December},
ISSN = {1049-5258},
Abstract = {Discrete mixtures are used routinely in broad sweeping
applications ranging from unsupervised settings to fully
supervised multi-task learning. Indeed, finite mixtures and
infinite mixtures, relying on Dirichlet processes and
modifications, have become a standard tool. One important
issue that arises in using discrete mixtures is low
separation in the components; in particular, different
components can be introduced that are very similar and hence
redundant. Such redundancy leads to too many clusters that
are too similar, degrading performance in unsupervised
learning and leading to computational problems and an
unnecessarily complex model in supervised settings.
Redundancy can arise in the absence of a penalty on
components placed close together even when a Bayesian
approach is used to learn the number of components. To solve
this problem, we propose a novel prior that generates
components from a repulsive process, automatically
penalizing redundant components. We characterize this
repulsive prior theoretically and propose a Markov chain
Monte Carlo sampling algorithm for posterior computation.
The methods are illustrated using synthetic examples and an
iris data set.},
Key = {fds257875}
}
@article{fds257876,
Author = {Fox, EB and Dunson, DB},
Title = {Multiresolution Gaussian processes},
Journal = {Advances in Neural Information Processing
Systems},
Volume = {1},
Pages = {737-745},
Year = {2012},
Month = {December},
ISSN = {1049-5258},
Abstract = {We propose a multiresolution Gaussian process to capture
long-range, non-Markovian dependencies while allowing for
abrupt changes and non-stationarity. The multiresolution GP
hierarchically couples a collection of smooth GPs, each
defined over an element of a random nested partition.
Long-range dependencies are captured by the top-level GP
while the partition points define the abrupt changes. Due to
the inherent conjugacy of the GPs, one can analytically
marginalize the GPs and compute the marginal likelihood of
the observations given the partition tree. This property
allows for efficient inference of the partition itself, for
which we employ graph-theoretic techniques. We apply the
multiresolution GP to the analysis of magnetoencephalography
(MEG) recordings of brain activity.},
Key = {fds257876}
}
@article{fds257878,
Author = {Ding, M and He, L and Dunson, D and Carin, L},
Title = {Nonparametric Bayesian Segmentation of a Multivariate
Inhomogeneous Space-Time Poisson Process.},
Journal = {Bayesian Analysis},
Volume = {7},
Number = {4},
Pages = {813-840},
Year = {2012},
Month = {December},
ISSN = {1931-6690},
url = {http://gateway.webofknowledge.com/gateway/Gateway.cgi?GWVersion=2&SrcApp=PARTNER_APP&SrcAuth=LinksAMR&KeyUT=WOS:000311975100005&DestLinkType=FullRecord&DestApp=ALL_WOS&UsrCustomerID=47d3190e77e5a3a53558812f597b0b92},
Abstract = {A nonparametric Bayesian model is proposed for segmenting
time-evolving multivariate spatial point process data. An
inhomogeneous Poisson process is assumed, with a logistic
stick-breaking process (LSBP) used to encourage
piecewise-constant spatial Poisson intensities. The LSBP
explicitly favors spatially contiguous segments, and infers
the number of segments based on the observed data. The
temporal dynamics of the segmentation and of the Poisson
intensities are modeled with exponential correlation in
time, implemented in the form of a first-order
autoregressive model for uniformly sampled discrete data,
and via a Gaussian process with an exponential kernel for
general temporal sampling. We consider and compare two
different inference techniques: a Markov chain Monte Carlo
sampler, which has relatively high computational complexity;
and an approximate and efficient variational Bayesian
analysis. The model is demonstrated with a simulated example
and a real example of space-time crime events in Cincinnati,
Ohio, USA.},
Doi = {10.1214/12-ba727},
Key = {fds257878}
}
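%% Note: a minimal Python sketch of the logistic stick-breaking
%% construction named above: segment probabilities are built by
%% passing real-valued scores through sigmoids and breaking off
%% pieces of the remaining probability mass. In the paper the scores
%% are spatial functions; here they are plain numbers, so this is
%% only a schematic illustration.

import numpy as np

def logistic_stick_breaking_weights(scores):
    """Turn K-1 real-valued stick scores into K probabilities:
    w_k = sigmoid(score_k) * prod_{l<k} (1 - sigmoid(score_l)),
    with the final segment absorbing the leftover mass."""
    sticks = 1.0 / (1.0 + np.exp(-np.asarray(scores, dtype=float)))
    weights, remaining = [], 1.0
    for v in sticks:
        weights.append(v * remaining)
        remaining *= 1.0 - v
    weights.append(remaining)
    return np.array(weights)

w = logistic_stick_breaking_weights([2.0, 0.0, -1.0])
print(w, w.sum())  # four weights summing to 1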
@article{fds257881,
Author = {Love, C and Sun, Z and Jima, D and Li, G and Zhang, J and Miles, R and Richards, KL and Dunphy, CH and Choi, WWL and Srivastava, G and Lugar,
PL and Rizzieri, DA and Lagoo, AS and Bernal-Mizrachi, L and Mann, KP and Flowers, CR and Naresh, KN and Evens, AM and Chadburn, A and Gordon, LI and Czader, MB and Gill, JI and Hsi, ED and Greenough, A and Moffitt, AB and McKinney, M and Banerjee, A and Grubor, V and Levy, S and Dunson, DB and Dave, SS},
Title = {The genetic landscape of mutations in Burkitt
lymphoma.},
Journal = {Nat Genet},
Volume = {44},
Number = {12},
Pages = {1321-1325},
Year = {2012},
Month = {December},
url = {http://www.ncbi.nlm.nih.gov/pubmed/23143597},
Abstract = {Burkitt lymphoma is characterized by deregulation of MYC,
but the contribution of other genetic mutations to the
disease is largely unknown. Here, we describe the first
completely sequenced genome from a Burkitt lymphoma tumor
and germline DNA from the same affected individual. We
further sequenced the exomes of 59 Burkitt lymphoma tumors
and compared them to sequenced exomes from 94 diffuse large
B-cell lymphoma (DLBCL) tumors. We identified 70 genes that
were recurrently mutated in Burkitt lymphomas, including
ID3, GNA13, RET, PIK3R1 and the SWI/SNF genes ARID1A and
SMARCA4. Our data implicate a number of genes in cancer for
the first time, including CCT6B, SALL3, FTCD and PC. ID3
mutations occurred in 34% of Burkitt lymphomas and not in
DLBCLs. We show experimentally that ID3 mutations promote
cell cycle progression and proliferation. Our work thus
elucidates commonly occurring gene-coding mutations in
Burkitt lymphoma and implicates ID3 as a new tumor
suppressor gene.},
Doi = {10.1038/ng.2468},
Key = {fds257881}
}
@article{fds257976,
Author = {Montagna, S and Tokdar, ST and Neelon, B and Dunson,
DB},
Title = {Bayesian latent factor regression for functional and
longitudinal data.},
Journal = {Biometrics},
Volume = {68},
Number = {4},
Pages = {1064-1073},
Year = {2012},
Month = {December},
url = {http://www.ncbi.nlm.nih.gov/pubmed/23005895},
Abstract = {In studies involving functional data, it is commonly of
interest to model the impact of predictors on the
distribution of the curves, allowing flexible effects on not
only the mean curve but also the distribution about the
mean. Characterizing the curve for each subject as a linear
combination of a high-dimensional set of potential basis
functions, we place a sparse latent factor regression model
on the basis coefficients. We induce basis selection by
choosing a shrinkage prior that allows many of the loadings
to be close to zero. The number of latent factors is treated
as unknown and inferred through a highly efficient adaptive
blocked Gibbs sampler. Predictors are included at the latent
variable level, while allowing different predictors to
impact different latent factors. This model induces a
framework for functional response regression in which the
distribution of the curves is allowed to change flexibly
with predictors. The performance is assessed through
simulation studies and the methods are applied to data on
blood pressure trajectories during pregnancy.},
Doi = {10.1111/j.1541-0420.2012.01788.x},
Key = {fds257976}
}
@article{fds258004,
Author = {Zhu, B and Dunson, DB and Ashley-Koch, AE},
Title = {Adverse subpopulation regression for multivariate outcomes
with high-dimensional predictors.},
Journal = {Stat Med},
Volume = {31},
Number = {29},
Pages = {4102-4113},
Year = {2012},
Month = {December},
url = {http://www.ncbi.nlm.nih.gov/pubmed/22825854},
Abstract = {Biomedical studies have a common interest in assessing
relationships between multiple related health outcomes and
high-dimensional predictors. For example, in reproductive
epidemiology, one may collect pregnancy outcomes such as
length of gestation and birth weight and predictors such as
single nucleotide polymorphisms in multiple candidate genes
and environmental exposures. In such settings, there is a
need for simple yet flexible methods for selecting true
predictors of adverse health responses from a
high-dimensional set of candidate predictors. To address
this problem, one may either consider linear regression
models for the continuous outcomes or convert these outcomes
into binary indicators of adverse responses using predefined
cutoffs. The former strategy has the disadvantage of often
leading to a poorly fitting model that does not predict risk
well, whereas the latter approach can be very sensitive to
the cutoff choice. As a simple yet flexible alternative, we
propose a method for adverse subpopulation regression, which
relies on a two-component latent class model, with the
dominant component corresponding to (presumed) healthy
individuals and the risk of falling in the minority
component characterized via a logistic regression. The
logistic regression model is designed to accommodate
high-dimensional predictors, as occur in studies with a
large number of gene by environment interactions, through
the use of a flexible nonparametric multiple shrinkage
approach. The Gibbs sampler is developed for posterior
computation. We evaluate the methods with the use of
simulation studies and apply these to a genetic epidemiology
study of pregnancy outcomes.},
Doi = {10.1002/sim.5520},
Key = {fds258004}
}
@article{fds257851,
Author = {Carlson, DE and Vogelstein, JT and Wu, Q and Lian, W and Zhou, M and Stoetzner, CR and Kipke, D and Weber, D and Dunson, DB and Carin,
L},
Title = {Multichannel electrophysiological spike sorting via joint
dictionary learning and mixture modeling},
Journal = {IEEE Transactions on Biomedical Engineering},
Volume = {61},
Number = {1},
Pages = {41-54},
Publisher = {IEEE},
Year = {2013},
ISSN = {0018-9294},
url = {http://dx.doi.org/10.1109/tbme.2013.2275751},
Abstract = {We propose a methodology for joint feature learning and
clustering of multichannel extracellular
electrophysiological data, across multiple recording periods
for action potential detection and classification (sorting).
Our methodology improves over the previous state of the art
principally in four ways. First, via sharing information
across channels, we can better distinguish between
single-unit spikes and artifacts. Second, our proposed
"focused mixture model" (FMM) deals with units appearing,
disappearing, or reappearing over multiple recording days,
an important consideration for any chronic experiment.
Third, by jointly learning features and clusters, we improve
performance over previous attempts that proceeded via a
two-stage learning process. Fourth, by directly modeling
spike rate, we improve the detection of sparsely firing
neurons. Moreover, our Bayesian methodology seamlessly
handles missing data. We demonstrate state-of-the-art
performance without requiring manual tuning of
hyperparameters, considering both a public dataset with
partial ground truth and a new experimental
dataset.},
Doi = {10.1109/tbme.2013.2275751},
Key = {fds257851}
}
@article{fds257854,
Author = {Xing, Z and Nicholson, B and Jimenez, M and Veldman, T and Hudson, L and Lucas, J and Dunson, D and Zaas, AK and Woods, CW and Ginsburg, GS and Carin, L},
Title = {Bayesian modeling of temporal properties of infectious
disease in a college student population},
Journal = {Journal of Applied Statistics},
Volume = {41},
Number = {6},
Pages = {1358-1382},
Publisher = {Informa UK Limited},
Year = {2013},
ISSN = {0266-4763},
url = {http://dx.doi.org/10.1080/02664763.2013.870138},
Doi = {10.1080/02664763.2013.870138},
Key = {fds257854}
}
@article{fds304008,
Author = {Banerjee, A and Dunson, DB and Tokdar, ST},
Title = {Efficient Gaussian process regression for large
datasets},
Journal = {Biometrika},
Volume = {100},
Number = {1},
Pages = {75-89},
Year = {2013},
url = {http://arxiv.org/abs/1106.5779v1},
Abstract = {Gaussian processes are widely used in nonparametric
regression, classification and spatiotemporal modelling,
facilitated in part by a rich literature on their
theoretical properties. However, one of their practical
limitations is expensive computation, typically on the order
of n^3, where n is the number of data points, in performing
the necessary matrix inversions. For large datasets, storage
and processing also lead to computational bottlenecks, and
numerical stability of the estimates and predicted values
degrades with increasing n. Various methods have been
proposed to address these problems, including predictive
processes in spatial data analysis and the
subset-of-regressors technique in machine learning. The idea
underlying these approaches is to use a subset of the data,
but this raises questions concerning sensitivity to the
choice of subset and limitations in estimating fine-scale
structure in regions that are not well covered by the
subset. Motivated by the literature on compressive sensing,
we propose an alternative approach that involves linear
projection of all the data points onto a lower-dimensional
subspace. We demonstrate the superiority of this approach
from a theoretical perspective and through simulated and
real data examples. © 2012 Biometrika Trust.},
Doi = {10.1093/biomet/ass068},
Key = {fds304008}
}
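%% Note: a minimal sketch of the projection idea described above,
%% assuming a squared-exponential kernel and a Gaussian random
%% projection matrix; this is not the paper's exact specification.
%% The point is that the n x n solve is replaced by an m x m solve
%% on the projected data.

import numpy as np

def se_kernel(a, b, ls=1.0):
    """Squared-exponential kernel matrix for 1-d inputs."""
    return np.exp(-0.5 * (a[:, None] - b[None, :]) ** 2 / ls ** 2)

def compressed_gp_mean(x, y, x_star, m=50, noise=0.1, seed=0):
    """GP predictive mean computed from an m-dimensional linear
    projection phi @ y of all n observations (m << n)."""
    n = x.size
    phi = np.random.default_rng(seed).normal(size=(m, n)) / np.sqrt(m)
    K = se_kernel(x, x)
    A = phi @ (K + noise ** 2 * np.eye(n)) @ phi.T  # m x m, cheap to solve
    return se_kernel(x_star, x) @ phi.T @ np.linalg.solve(A, phi @ y)

x = np.linspace(0.0, 10.0, 2000)
y = np.sin(x) + 0.1 * np.random.default_rng(1).normal(size=x.size)
print(compressed_gp_mean(x, y, np.array([2.5])))  # roughly sin(2.5) = 0.60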
@article{fds257847,
Author = {Petralia, F and Vogelstein, J and Dunson, DB},
Title = {Multiscale dictionary learning for estimating conditional
distributions},
Journal = {Advances in Neural Information Processing
Systems},
Year = {2013},
Month = {January},
ISSN = {1049-5258},
Abstract = {Nonparametric estimation of the conditional distribution of
a response given high-dimensional features is a challenging
problem. It is important to allow not only the mean but also
the variance and shape of the response density to change
flexibly with features, which are massive-dimensional. We
propose a multiscale dictionary learning model, which
expresses the conditional response density as a convex
combination of dictionary densities, with the densities used
and their weights dependent on the path through a tree
decomposition of the feature space. A fast graph
partitioning algorithm is applied to obtain the tree
decomposition, with Bayesian methods then used to adaptively
prune and average over different sub-trees in a soft
probabilistic manner. The algorithm scales efficiently to
approximately one million features. State of the art
predictive performance is demonstrated for toy examples and
two neuroscience applications including up to a million
features.},
Key = {fds257847}
}
@article{fds257848,
Author = {Durante, D and Scarpa, B and Dunson, DB},
Title = {Locally adaptive bayesian multivariate time
series},
Journal = {Advances in Neural Information Processing
Systems},
Year = {2013},
Month = {January},
ISSN = {1049-5258},
Abstract = {In modeling multivariate time series, it is important to
allow time-varying smoothness in the mean and covariance
process. In particular, there may be certain time intervals
exhibiting rapid changes and others in which changes are
slow. If such locally adaptive smoothness is not accounted
for, one can obtain misleading inferences and predictions,
with over-smoothing across erratic time intervals and
under-smoothing across times exhibiting slow variation. This
can lead to miscalibration of predictive intervals, which
can be substantially too narrow or wide depending on the
time. We propose a continuous multivariate stochastic
process for time series having locally varying smoothness in
both the mean and covariance matrix. This process is
constructed utilizing latent dictionary functions in time,
which are given nested Gaussian process priors and linearly
related to the observed data through a sparse mapping. Using
a differential equation representation, we bypass usual
computational bottlenecks in obtaining MCMC and online
algorithms for approximate Bayesian inference. The
performance is assessed in simulations and illustrated in a
financial application.},
Key = {fds257848}
}
@article{fds257855,
Author = {Zhu, B and Dunson, DB},
Title = {Locally Adaptive Bayes Nonparametric Regression via Nested
Gaussian Processes.},
Journal = {Journal of the American Statistical Association},
Volume = {108},
Number = {504},
Year = {2013},
Month = {January},
ISSN = {0162-1459},
url = {http://dx.doi.org/10.1080/01621459.2013.838568},
Abstract = {We propose a nested Gaussian process (nGP) as a locally
adaptive prior for Bayesian nonparametric regression.
Specified through a set of stochastic differential equations
(SDEs), the nGP imposes a Gaussian process prior for the
function's mth-order derivative. The nesting arises by
including a local instantaneous mean function, which is
drawn from another Gaussian process, inducing adaptivity
to locally-varying smoothness. We discuss the support of the
nGP prior in terms of the closure of a reproducing kernel
Hilbert space, and consider theoretical properties of the
posterior. The posterior mean under the nGP prior is shown
to be equivalent to the minimizer of a nested penalized
sum-of-squares involving penalties for both the global and
local roughness of the function. Using highly-efficient
Markov chain Monte Carlo for posterior inference, the
proposed method performs well in simulation studies compared
to several alternatives, and is scalable to massive data,
illustrated through a proteomics application.},
Doi = {10.1080/01621459.2013.838568},
Key = {fds257855}
}
@article{fds257856,
Author = {Kunihama, T and Dunson, DB},
Title = {Bayesian modeling of temporal dependence in large sparse
contingency tables.},
Journal = {Journal of the American Statistical Association},
Volume = {108},
Number = {504},
Pages = {1324-1338},
Year = {2013},
Month = {January},
ISSN = {0162-1459},
url = {http://dx.doi.org/10.1080/01621459.2013.823866},
Abstract = {In many applications, it is of interest to study trends over
time in relationships among categorical variables, such as
age group, ethnicity, religious affiliation, political party
and preference for particular policies. At each time point,
a sample of individuals provide responses to a set of
questions, with different individuals sampled at each time.
In such settings, there tends to be abundant missing data
and the variables being measured may change over time. At
each time point, one obtains a large sparse contingency
table, with the number of cells often much larger than the
number of individuals being surveyed. To borrow information
across time in modeling large sparse contingency tables, we
propose a Bayesian autoregressive tensor factorization
approach. The proposed model relies on a probabilistic
Parafac factorization of the joint pmf characterizing the
categorical data distribution at each time point, with
autocorrelation included across times. Efficient
computational methods are developed relying on MCMC. The
methods are evaluated through simulation examples and
applied to social survey data.},
Doi = {10.1080/01621459.2013.823866},
Key = {fds257856}
}
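%% Note: the Parafac factorization mentioned above expresses a joint
%% pmf of categorical variables as a mixture of products of
%% per-variable category probabilities. A small Python sketch (the
%% autoregressive dependence across time points is omitted):

import numpy as np

def parafac_joint_pmf(nu, lambdas, cell):
    """Joint pmf of p categorical variables under a Parafac mixture:
    P(y = cell) = sum_h nu[h] * prod_j lambdas[j][h, cell[j]].
    nu holds H mixture weights; lambdas[j] is (H, d_j) with rows of
    category probabilities for variable j."""
    prob = 0.0
    for h in range(len(nu)):
        term = nu[h]
        for j, lam in enumerate(lambdas):
            term *= lam[h, cell[j]]
        prob += term
    return prob

nu = np.array([0.6, 0.4])
lambdas = [np.array([[0.9, 0.1], [0.2, 0.8]]),
           np.array([[0.5, 0.3, 0.2], [0.1, 0.1, 0.8]])]
print(parafac_joint_pmf(nu, lambdas, [0, 2]))  # 0.108 + 0.064 = 0.172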
@article{fds257864,
Author = {Armagan, A and Dunson, DB and Lee, J},
Title = {GENERALIZED DOUBLE PARETO SHRINKAGE.},
Journal = {Statistica Sinica},
Volume = {23},
Number = {1},
Pages = {119-143},
Year = {2013},
Month = {January},
ISSN = {1017-0405},
url = {http://dx.doi.org/10.5705/ss.2011.048},
Abstract = {We propose a generalized double Pareto prior for Bayesian
shrinkage estimation and inferences in linear models. The
prior can be obtained via a scale mixture of Laplace or
normal distributions, forming a bridge between the Laplace
and Normal-Jeffreys' priors. While it has a spike at zero
like the Laplace density, it also has a Student's
t-like tail behavior. Bayesian computation is
straightforward via a simple Gibbs sampling algorithm. We
investigate the properties of the maximum a posteriori
estimator, as sparse estimation plays an important role in
many problems, reveal connections with some well-established
regularization procedures, and show some asymptotic results.
The performance of the prior is tested through simulations
and an application.},
Doi = {10.5705/ss.2011.048},
Key = {fds257864}
}
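%% Note: the scale-mixture representation mentioned in the abstract
%% suggests a direct way to draw from the prior. The sketch below
%% assumes the normal-exponential-gamma hierarchy (a normal mixed
%% over an exponential scale gives a Laplace; mixing the Laplace
%% rate over a gamma gives the generalized double Pareto); treat
%% the exact parameterization as an assumption to check against the
%% paper.

import numpy as np

def sample_gdp(alpha, eta, size, seed=0):
    """Draw generalized double Pareto variates via the hierarchy
    lambda ~ Gamma(alpha, rate=eta), tau ~ Exp(rate=lambda^2/2),
    beta | tau ~ N(0, tau)."""
    rng = np.random.default_rng(seed)
    lam = rng.gamma(shape=alpha, scale=1.0 / eta, size=size)
    tau = rng.exponential(scale=2.0 / lam ** 2)
    return rng.normal(scale=np.sqrt(tau))

draws = sample_gdp(alpha=1.0, eta=1.0, size=100000)
print(np.median(np.abs(draws)))  # spike at zero with heavy tails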
@article{fds257870,
Author = {Chen, B and Polatkan, G and Sapiro, G and Blei, D and Dunson, D and Carin,
L},
Title = {Deep Learning with Hierarchical Convolutional Factor
Analysis.},
Journal = {IEEE transactions on pattern analysis and machine
intelligence},
Year = {2013},
Month = {January},
url = {http://www.ncbi.nlm.nih.gov/pubmed/23319498},
Abstract = {Unsupervised multi-layered ("deep") models are considered
for general data, with a particular focus on imagery. The
model is represented using a hierarchical convolutional
factor-analysis construction, with sparse factor loadings
and scores. The computation of layer-dependent model
parameters is implemented within a Bayesian setting,
employing a Gibbs sampler and variational Bayesian (VB)
analysis, that explicitly exploit the convolutional nature
of the expansion. In order to address large-scale and
streaming data, an online version of VB is also developed.
The number of basis functions or dictionary elements at each
layer is inferred from the data, based on a beta-Bernoulli
implementation of the Indian buffet process. Example results
are presented for several image-processing applications,
with comparisons to related models in the
literature.},
Key = {fds257870}
}
@article{fds257872,
Author = {Yu, K and Chen, CWS and Reed, C and Dunson, DB},
Title = {Bayesian variable selection in quantile regression},
Journal = {Statistics and its Interface},
Volume = {6},
Number = {2},
Pages = {261-274},
Publisher = {International Press of Boston},
Year = {2013},
Month = {January},
ISSN = {1938-7989},
url = {http://gateway.webofknowledge.com/gateway/Gateway.cgi?GWVersion=2&SrcApp=PARTNER_APP&SrcAuth=LinksAMR&KeyUT=WOS:000319964700009&DestLinkType=FullRecord&DestApp=ALL_WOS&UsrCustomerID=47d3190e77e5a3a53558812f597b0b92},
Abstract = {In many applications, interest focuses on assessing
relationships between predictors and the quantiles of the
distribution of a continuous response. For example, in
epidemiology studies, cutoffs to define premature delivery
have been based on the 10th percentile of the distribution
for gestational age at delivery. Using quantile regression,
one can assess how this percentile varies with predictors
instead of using a pre-defined cutoff. However, there is
typically uncertainty in which of the many candidate
predictors should be included. In order to identify
important predictors and to build accurate predictive
models, Bayesian methods for variable selection and model
averaging are very useful. However, such methods are
currently not available for quantile regression. This
article develops Bayesian methods for variable selection,
with a simple and efficient stochastic search variable
selection (SSVS) algorithm proposed for posterior
computation. This approach can be used for moderately
high-dimensional variable selection and can accommodate
uncertainty in basis function selection in non-linear and
additive quantile regression models. The methods are
illustrated using simulated data and an application to the
Boston Housing data.},
Doi = {10.4310/sii.2013.v6.n2.a9},
Key = {fds257872}
}
@article{fds322560,
Author = {Johndrow, JE and Lum, K and Dunson, DB},
Title = {Diagonal orthant multinomial probit models},
Journal = {Journal of Machine Learning Research},
Volume = {31},
Pages = {29-38},
Year = {2013},
Month = {January},
Abstract = {Bayesian classification commonly relies on probit models,
with data augmentation algorithms used for posterior
computation. By imputing latent Gaussian variables, one can
often trivially adapt computational approaches used in
Gaussian models. However, MCMC for multinomial probit (MNP)
models can be inefficient in practice due to high posterior
dependence between latent variables and parameters, and to
difficulties in efficiently sampling latent variables when
there are more than two categories. To address these
problems, we propose a new class of diagonal orthant (DO)
multinomial models. The key characteristics of these models
include conditional independence of the latent variables
given model parameters, avoidance of arbitrary
identifiability restrictions, and simple expressions for
category probabilities. We show substantially improved
computational efficiency and comparable predictive
performance to MNP.},
Key = {fds322560}
}
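%% Note: the key property claimed above, conditional independence of
%% the latent variables given parameters, makes the augmentation step
%% embarrassingly simple. A Python sketch under the usual diagonal
%% orthant convention (the latent for the observed category truncated
%% positive, all others truncated negative); details of the paper's
%% exact construction may differ.

import numpy as np
from scipy.stats import truncnorm

def sample_do_latents(y, means, rng):
    """Draw the K latent Gaussians for one observation: each latent
    is an independent truncated normal, positive for the observed
    category y and negative otherwise."""
    z = np.empty(len(means))
    for j, m in enumerate(means):
        # truncnorm bounds are standardized: (bound - loc) / scale
        a, b = (-m, np.inf) if j == y else (-np.inf, -m)
        z[j] = truncnorm.rvs(a, b, loc=m, scale=1.0, random_state=rng)
    return z

rng = np.random.default_rng(0)
print(sample_do_latents(y=1, means=np.array([-0.3, 0.8, 0.1]), rng=rng))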
@article{fds322561,
Author = {Banerjee, A and Murray, J and Dunson, DB},
Title = {Bayesian learning of joint distributions of
objects},
Journal = {Journal of Machine Learning Research},
Volume = {31},
Pages = {1-9},
Year = {2013},
Month = {January},
Abstract = {There is increasing interest in broad application areas in
defining flexible joint models for data having a variety of
measurement scales, while also allowing data of complex
types, such as functions, images and documents. We consider
a general framework for nonparametric Bayes joint modeling
through mixture models that incorporate dependence across
data types through a joint mixing measure. The mixing
measure is assigned a novel infinite tensor factorization
(ITF) prior that allows flexible dependence in cluster
allocation across data types. The ITF prior is formulated as
a tensor product of stick-breaking processes. Focusing on a
convenient special case corresponding to a Parafac
factorization, we provide basic theory justifying the
flexibility of the proposed prior and resulting asymptotic
properties. Focusing on ITF mixtures of product kernels, we
develop a new Gibbs sampling algorithm for routine
implementation relying on slice sampling. The methods are
compared with alternative joint mixture models based on
Dirichlet processes and related approaches through
simulations and real data applications.},
Key = {fds322561}
}
@article{fds257880,
Author = {Zhang, J and Grubor, V and Love, CL and Banerjee, A and Richards, KL and Mieczkowski, PA and Dunphy, C and Choi, W and Au, WY and Srivastava, G and Lugar, PL and Rizzieri, DA and Lagoo, AS and Bernal-Mizrachi, L and Mann, KP and Flowers, C and Naresh, K and Evens, A and Gordon, LI and Czader, M and Gill, JI and Hsi, ED and Liu, Q and Fan, A and Walsh, K and Jima, D and Smith, LL and Johnson, AJ and Byrd, JC and Luftig, MA and Ni,
T and Zhu, J and Chadburn, A and Levy, S and Dunson, D and Dave,
SS},
Title = {Genetic heterogeneity of diffuse large B-cell
lymphoma.},
Journal = {Proc Natl Acad Sci U S A},
Volume = {110},
Number = {4},
Pages = {1398-1403},
Year = {2013},
Month = {January},
url = {http://www.ncbi.nlm.nih.gov/pubmed/23292937},
Abstract = {Diffuse large B-cell lymphoma (DLBCL) is the most common
form of lymphoma in adults. The disease exhibits a striking
heterogeneity in gene expression profiles and clinical
outcomes, but its genetic causes remain to be fully defined.
Through whole genome and exome sequencing, we characterized
the genetic diversity of DLBCL. In all, we sequenced 73
DLBCL primary tumors (34 with matched normal DNA).
Separately, we sequenced the exomes of 21 DLBCL cell lines.
We identified 322 DLBCL cancer genes that were recurrently
mutated in primary DLBCLs. We identified recurrent mutations
implicating a number of known and not previously identified
genes and pathways in DLBCL including those related to
chromatin modification (ARID1A and MEF2B), NF-κB (CARD11
and TNFAIP3), PI3 kinase (PIK3CD, PIK3R1, and MTOR), B-cell
lineage (IRF8, POU2F2, and GNA13), and WNT signaling (WIF1).
We also experimentally validated a mutation in PIK3CD, a
gene not previously implicated in lymphomas. The patterns of
mutation demonstrated a classic long tail distribution with
substantial variation of mutated genes from patient to
patient and also between published studies. Thus, our study
reveals the tremendous genetic heterogeneity that underlies
lymphomas and highlights the need for personalized medicine
approaches to treating these patients.},
Doi = {10.1073/pnas.1205299110},
Key = {fds257880}
}
@article{fds257879,
Author = {Wang, E and Salazar, E and Dunson, D and Carin, L},
Title = {Spatio-temporal modeling of legislation and
votes},
Journal = {Bayesian Analysis},
Volume = {8},
Number = {1},
Pages = {233-268},
Publisher = {Institute of Mathematical Statistics},
Year = {2013},
Month = {March},
ISSN = {1936-0975},
url = {http://dx.doi.org/10.1214/13-BA810},
Abstract = {A model is presented for analysis of multivariate binary
data with spatio-temporal dependencies, and applied to
congressional roll call data from the United States House of
Representatives and Senate. The model considers each
legislator's constituency (location), the congressional
session (time) of each vote, and the details (text) of each
piece of legislation. The model can predict votes of new
legislation from only text, while imposing smooth temporal
evolution of legislator latent features, and correlation of
legislators with adjacent constituencies. Additionally, the
model estimates the number of latent dimensions required to
represent the data. A Gibbs sampler is developed for
posterior inference. The model is demonstrated as an
exploratory tool for legislation, and it performs well in
quantitative comparisons to a traditional ideal-point model.
© 2013 International Society for Bayesian
Analysis.},
Doi = {10.1214/13-BA810},
Key = {fds257879}
}
@article{fds257877,
Author = {Pati, D and Dunson, DB and Tokdar, ST},
Title = {Posterior consistency in conditional distribution
estimation.},
Journal = {Journal of multivariate analysis},
Volume = {116},
Pages = {456-472},
Year = {2013},
Month = {April},
ISSN = {0047-259X},
url = {http://dx.doi.org/10.1016/j.jmva.2013.01.011},
Abstract = {A wide variety of priors have been proposed for
nonparametric Bayesian estimation of conditional
distributions, and there is a clear need for theorems
providing conditions on the prior for large support, as well
as posterior consistency. Estimation of an uncountable
collection of conditional distributions across different
regions of the predictor space is a challenging problem,
which differs in some important ways from density and mean
regression estimation problems. Defining various topologies
on the space of conditional distributions, we provide
sufficient conditions for posterior consistency focusing on
a broad class of priors formulated as predictor-dependent
mixtures of Gaussian kernels. This theory is illustrated by
showing that the conditions are satisfied for a class of
generalized stick-breaking process mixtures in which the
stick-breaking lengths are monotone, differentiable
functions of a continuous stochastic process. We also
provide a set of sufficient conditions for the case where
stick-breaking lengths are predictor independent, such as
those arising from a fixed Dirichlet process
prior.},
Doi = {10.1016/j.jmva.2013.01.011},
Key = {fds257877}
}
@article{fds257873,
Author = {Page, G and Bhattacharya, A and Dunson, D},
Title = {Classification via bayesian nonparametric learning of affine
subspaces},
Journal = {Journal of the American Statistical Association},
Volume = {108},
Number = {501},
Pages = {187-201},
Publisher = {Informa UK Limited},
Year = {2013},
Month = {May},
ISSN = {0162-1459},
url = {http://dx.doi.org/10.1080/01621459.2013.763566},
Abstract = {It has become common for datasets to contain large numbers
of variables in studies conducted in areas such as genetics,
machine vision, image analysis, and many others. When
analyzing such data, parametric models are often too
inflexible while nonparametric procedures tend to be
nonrobust because of insufficient data on these
high-dimensional spaces. This is particularly true when
interest lies in building efficient classifiers in the
presence of many predictor variables. When dealing with
these types of data, it is often the case that most of the
variability tends to lie along a few directions, or more
generally along a much smaller dimensional submanifold of
the data space. In this article, we propose a class of
models that flexibly learn about this submanifold while
simultaneously performing dimension reduction in
classification. This methodology allows the cell
probabilities to vary nonparametrically based on a few
coordinates expressed as linear combinations of the
predictors. Also, as opposed to many black-box methods for
dimensionality reduction, the proposed model is appealing in
having clearly interpretable and identifiable parameters
that provide insight into which predictors are important in
determining accurate classification boundaries. Gibbs
sampling methods are developed for posterior computation,
and the methods are illustrated using simulated and real
data applications. © 2013 American Statistical
Association.},
Doi = {10.1080/01621459.2013.763566},
Key = {fds257873}
}
@article{fds304007,
Author = {Murray, JS and Dunson, DB and Carin, L and Lucas,
JE},
Title = {Bayesian Gaussian Copula Factor Models for Mixed
Data.},
Journal = {Journal of the American Statistical Association},
Volume = {108},
Number = {502},
Pages = {656-665},
Year = {2013},
Month = {June},
url = {http://arxiv.org/abs/1111.0317v2},
Abstract = {Gaussian factor models have proven widely useful for
parsimoniously characterizing dependence in multivariate
data. There is a rich literature on their extension to mixed
categorical and continuous variables, using latent Gaussian
variables or through generalized latent trait models
accommodating measurements in the exponential family.
However, when generalizing to non-Gaussian measured
variables the latent variables typically influence both the
dependence structure and the form of the marginal
distributions, complicating interpretation and introducing
artifacts. To address this problem we propose a novel class
of Bayesian Gaussian copula factor models which decouple the
latent factors from the marginal distributions. A
semiparametric specification for the marginals based on the
extended rank likelihood yields straightforward
implementation and substantial computational gains. We
provide new theoretical and empirical justifications for
using this likelihood in Bayesian inference. We propose new
default priors for the factor loadings and develop efficient
parameter-expanded Gibbs sampling for posterior computation.
The methods are evaluated through simulations and applied to
a dataset in political science. The models in this paper are
implemented in the R package bfa.},
Doi = {10.1080/01621459.2012.762328},
Key = {fds304007}
}
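%% Note: a small simulation sketch of the decoupling described above:
%% a Gaussian factor model drives the dependence, while each margin
%% is produced by its own inverse cdf, so changing a margin leaves
%% the copula untouched. Parameter choices and margins below are
%% arbitrary illustrations (the paper's bfa package is the reference
%% implementation).

import numpy as np
from scipy.stats import norm, poisson

def simulate_copula_factor(n, Lambda, margins, seed=0):
    """Simulate n rows of mixed data: z = eta . Lambda' + noise has
    the factor-induced correlation; each column j is then mapped to
    its margin via margins[j](Phi(z_j / sd_j))."""
    rng = np.random.default_rng(seed)
    p, k = Lambda.shape
    z = rng.normal(size=(n, k)) @ Lambda.T + rng.normal(size=(n, p))
    sd = np.sqrt((Lambda ** 2).sum(axis=1) + 1.0)  # marginal sd of each z_j
    u = norm.cdf(z / sd)
    return np.column_stack([f(u[:, j]) for j, f in enumerate(margins)])

Lambda = np.array([[0.9], [0.8], [0.7]])
margins = [lambda u: u,                             # uniform margin
           lambda u: poisson.ppf(u, mu=3.0),        # count margin
           lambda u: norm.ppf(u, loc=10, scale=2)]  # Gaussian margin
print(simulate_copula_factor(500, Lambda, margins)[:3])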
@article{fds257871,
Author = {Zhu, B and Ashley-Koch, AE and Dunson, DB},
Title = {Generalized admixture mapping for complex
traits.},
Journal = {G3 (Bethesda)},
Volume = {3},
Number = {7},
Pages = {1165-1175},
Year = {2013},
Month = {July},
url = {http://www.ncbi.nlm.nih.gov/pubmed/23665878},
Abstract = {Admixture mapping is a popular tool to identify regions of
the genome associated with traits in a recently admixed
population. Existing methods have been developed primarily
for identification of a single locus influencing a
dichotomous trait within a case-control study design. We
propose a generalized admixture mapping (GLEAM) approach, a
flexible and powerful regression method for both
quantitative and qualitative traits, which is able to test
for association between the trait and local ancestries in
multiple loci simultaneously and adjust for covariates. The
new method is based on the generalized linear model and uses
a quadratic normal moment prior to incorporate admixture
prior information. Through simulation, we demonstrate that
GLEAM achieves lower type I error rate and higher power than
ANCESTRYMAP both for qualitative traits and more
significantly for quantitative traits. We applied GLEAM to
genome-wide SNP data from the Illumina African American
panel derived from a cohort of black women participating in
the Healthy Pregnancy, Healthy Baby study and identified a
locus on chromosome 2 associated with the averaged maternal
mean arterial pressure during 24 to 28 weeks of
pregnancy.},
Doi = {10.1534/g3.113.006478},
Key = {fds257871}
}
@article{fds257868,
Author = {Salazar, E and Dunson, DB and Carin, L},
Title = {Analysis of space-time relational data with application to
legislative voting},
Journal = {Computational Statistics and Data Analysis},
Volume = {68},
Pages = {141-154},
Publisher = {Elsevier BV},
Year = {2013},
Month = {July},
ISSN = {0167-9473},
url = {http://dx.doi.org/10.1016/j.csda.2013.06.018},
Abstract = {We consider modeling spatio-temporally indexed relational
data, motivated by analysis of voting data for the United
States House of Representatives over two decades. The data
are characterized by incomplete binary matrices,
representing votes of legislators on legislation over time.
The spatial covariates correspond to the location of a
legislator's district, and time corresponds to the year of a
vote. We seek to infer latent features associated with
legislators and legislation, incorporating spatio-temporal
structure. A model of such data must impose a flexible
representation of the space-time structure, since the
apportionment of House seats and the total number of
legislators change over time. There are 435 congressional
districts, with one legislator at a time for each district;
however, the total number of legislators typically changes
from year to year, for example due to deaths. A matrix
kernel stick-breaking process (MKSBP) is proposed, with the
model employed within a probit-regression construction.
Theoretical properties of the model are discussed and
posterior inference is developed using Markov chain Monte
Carlo methods. Advantages over benchmark models are shown in
terms of vote prediction and treatment of missing data.
Marked improvements in results are observed based on
leveraging spatial (geographical) information. © 2013
Elsevier B.V. All rights reserved.},
Doi = {10.1016/j.csda.2013.06.018},
Key = {fds257868}
}
@article{fds257869,
Author = {Lock, EF and Dunson, DB},
Title = {Bayesian consensus clustering.},
Journal = {Bioinformatics (Oxford, England)},
Volume = {29},
Number = {20},
Pages = {2610-2616},
Year = {2013},
Month = {October},
url = {http://www.ncbi.nlm.nih.gov/pubmed/23990412},
Abstract = {Motivation: In biomedical research, a growing number
of platforms and technologies are used to measure diverse
but related information, and the task of clustering a set of
objects based on multiple sources of data arises in several
applications. Most current approaches to multisource
clustering either independently determine a separate
clustering for each data source or determine a single
'joint' clustering for all data sources. There is a need for
more flexible approaches that simultaneously model the
dependence and the heterogeneity of the data
sources. Results: We propose an integrative
statistical model that permits a separate clustering of the
objects for each data source. These separate clusterings
adhere loosely to an overall consensus clustering, and hence
they are not independent. We describe a computationally
scalable Bayesian framework for simultaneous estimation of
both the consensus clustering and the source-specific
clusterings. We demonstrate that this flexible approach is
more robust than joint clustering of all data sources, and
is more powerful than clustering each data source
independently. We present an application to subtype
identification of breast cancer tumor samples using publicly
available data from The Cancer Genome Atlas. Availability: R
code with instructions and examples is available at
http://people.duke.edu/%7Eel113/software.html.},
Doi = {10.1093/bioinformatics/btt425},
Key = {fds257869}
}
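%% Note: one way to read "adhere loosely to an overall consensus
%% clustering" is through an adherence probability per data source.
%% The sketch below is a guess at a minimal generative step of that
%% kind (keep the consensus label with probability alpha, otherwise
%% move uniformly to another cluster) and is not taken from the
%% paper; the linked R code is the authoritative implementation.

import numpy as np

def sample_source_labels(consensus, alpha, K, rng):
    """Draw one source-specific clustering around a consensus: each
    object keeps its consensus label with probability alpha and
    otherwise switches to a uniformly chosen other cluster."""
    labels = consensus.copy()
    for i in np.where(rng.random(len(consensus)) > alpha)[0]:
        others = [k for k in range(K) if k != consensus[i]]
        labels[i] = rng.choice(others)
    return labels

rng = np.random.default_rng(0)
print(sample_source_labels(np.array([0, 0, 1, 1, 2, 2]), 0.8, 3, rng))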
@article{fds257859,
Author = {Hannah, LA and Dunson, DB},
Title = {Multivariate convex regression with adaptive
partitioning},
Journal = {Journal of Machine Learning Research},
Volume = {14},
Pages = {3153-3188},
Publisher = {MICROTOME PUBL},
Year = {2013},
Month = {November},
ISSN = {1532-4435},
Abstract = {We propose a new, nonparametric method for multivariate
regression subject to convexity or concavity constraints on
the response function. Convexity constraints are common in
economics, statistics, operations research, financial
engineering and optimization, but there is currently no
multivariate method that is stable and computationally
feasible for more than a few thousand observations. We
introduce convex adaptive partitioning (CAP), which creates
a globally convex regression model from locally linear
estimates fit on adaptively selected covariate partitions.
CAP is a computationally efficient, consistent method for
convex regression. We demonstrate empirical performance by
comparing the performance of CAP to other shape-constrained
and unconstrained regression methods for predicting weekly
wages and value function approximation for pricing American
basket options. © 2013 Lauren A. Hannah and David B.
Dunson.},
Key = {fds257859}
}
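%% Note: a convex adaptive partitioning fit can be stored as a set of
%% affine pieces; taking their pointwise maximum is what makes the
%% resulting surface globally convex. A minimal evaluation sketch
%% (the adaptive partition selection itself is the hard part and is
%% omitted here):

import numpy as np

def cap_predict(X, intercepts, slopes):
    """Evaluate max_k (a_k + b_k . x): the pointwise maximum of the
    locally fitted hyperplanes, convex in x by construction.
    X is (n, d), intercepts is (K,), slopes is (K, d)."""
    return (intercepts[None, :] + X @ slopes.T).max(axis=1)

# Two planes whose maximum reproduces f(x) = |x|
X = np.array([[-2.0], [0.0], [3.0]])
print(cap_predict(X, np.zeros(2), np.array([[-1.0], [1.0]])))  # [2. 0. 3.]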
@article{fds257867,
Author = {Li, D and Longnecker, MP and Dunson, DB},
Title = {Lipid adjustment for chemical exposures: accounting for
concomitant variables.},
Journal = {Epidemiology (Cambridge, Mass.)},
Volume = {24},
Number = {6},
Pages = {921-928},
Year = {2013},
Month = {November},
url = {http://www.ncbi.nlm.nih.gov/pubmed/24051893},
Abstract = {Background: Some environmental chemical exposures are
lipophilic and need to be adjusted by serum lipid levels
before data analyses. There are currently various strategies
that attempt to account for this problem, but all have their
drawbacks. To address such concerns, we propose a new method
that uses Box-Cox transformations and a simple Bayesian
hierarchical model to adjust for lipophilic chemical
exposures. Methods: We compared our Box-Cox method to
existing methods. We ran simulation studies in which
increasing levels of lipid-adjusted chemical exposure did
and did not increase the odds of having a disease, and we
looked at both single-exposure and multiple-exposure cases.
We also analyzed an epidemiology dataset that examined the
effects of various chemical exposure on the risk of birth
defects. Results: Compared with existing methods, our
Box-Cox method produced unbiased estimates, good coverage,
similar power, and lower type I error rates. This was the
case in both single- and multiple-exposure simulation
studies. Results from analysis of the birth-defect data
differed from results using existing methods. Conclusion: Our
Box-Cox method is a novel and intuitive way to account for
the lipophilic nature of certain chemical exposures. It
addresses some of the problems with existing methods, is
easily extendable to multiple exposures, and can be used in
any analysis that involves concomitant variables.},
Doi = {10.1097/ede.0b013e3182a671e4},
Key = {fds257867}
}
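%% Note: the Box-Cox transformation at the core of the method above
%% is standard; a short Python sketch for reference (the Bayesian
%% hierarchical model around it is the paper's contribution and is
%% not reproduced here):

import numpy as np

def box_cox(x, lam):
    """Box-Cox transform: (x^lam - 1) / lam for lam != 0, and the
    log at lam = 0 (its continuous limit). Requires x > 0."""
    x = np.asarray(x, dtype=float)
    if np.isclose(lam, 0.0):
        return np.log(x)
    return (x ** lam - 1.0) / lam

exposure = np.array([0.5, 1.2, 4.7, 9.3])
print(box_cox(exposure, 0.0))  # log scale
print(box_cox(exposure, 0.5))  # square-root-like scale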
@article{fds376605,
Author = {Hannah, LA and Dunson, DB},
Title = {Multivariate Convex Regression with Adaptive
Partitioning},
Journal = {Journal of Machine Learning Research},
Volume = {14},
Pages = {3261-3294},
Publisher = {MICROTOME PUBL},
Year = {2013},
Month = {November},
Key = {fds376605}
}
@article{fds257860,
Author = {Armagan, A and Dunson, DB and Lee, J and Bajwa, WU and Strawn,
N},
Title = {Posterior consistency in linear models under shrinkage
priors},
Journal = {Biometrika},
Volume = {100},
Number = {4},
Pages = {1011-1018},
Publisher = {Oxford University Press (OUP)},
Year = {2013},
Month = {December},
ISSN = {0006-3444},
url = {http://gateway.webofknowledge.com/gateway/Gateway.cgi?GWVersion=2&SrcApp=PARTNER_APP&SrcAuth=LinksAMR&KeyUT=WOS:000327714200017&DestLinkType=FullRecord&DestApp=ALL_WOS&UsrCustomerID=47d3190e77e5a3a53558812f597b0b92},
Abstract = {We investigate the asymptotic behaviour of posterior
distributions of regression coefficients in high-dimensional
linear models as the number of dimensions grows with the
number of observations. We show that the posterior
distribution concentrates in neighbourhoods of the true
parameter under simple sufficient conditions. These
conditions hold under popular shrinkage priors given some
sparsity assumptions. © 2013 Biometrika
Trust.},
Doi = {10.1093/biomet/ast028},
Key = {fds257860}
}
@article{fds257861,
Author = {Canale, A and Dunson, DB},
Title = {Nonparametric Bayes modelling of count processes},
Journal = {Biometrika},
Volume = {100},
Number = {4},
Pages = {801-816},
Publisher = {Oxford University Press (OUP)},
Year = {2013},
Month = {December},
ISSN = {0006-3444},
url = {http://gateway.webofknowledge.com/gateway/Gateway.cgi?GWVersion=2&SrcApp=PARTNER_APP&SrcAuth=LinksAMR&KeyUT=WOS:000327714200002&DestLinkType=FullRecord&DestApp=ALL_WOS&UsrCustomerID=47d3190e77e5a3a53558812f597b0b92},
Abstract = {Data on count processes arise in a variety of applications,
including longitudinal, spatial and imaging studies
measuring count responses. The literature on statistical
models for dependent count data is dominated by models built
from hierarchical Poisson components. The Poisson assumption
is not warranted in many applied contexts, and hierarchical
Poisson models make restrictive assumptions about
overdispersion in marginal distributions. In this article we
propose a class of nonparametric Bayes count process models,
constructed through rounding real-valued underlying
processes. The proposed class of models accommodates
situations in which separate count-valued functional data
are observed for each subject under study. Theoretical
results on large support and posterior consistency are
established, and computational algorithms are developed
based on Markov chain Monte Carlo simulation. The methods
are evaluated via simulation and illustrated by application
to longitudinal tumour counts and to asthma inhaler usage.
© 2013 Biometrika Trust.},
Doi = {10.1093/biomet/ast037},
Key = {fds257861}
}
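%% Note: the rounding construction above maps a real-valued latent
%% process to counts through thresholds. A Python sketch with an
%% illustrative threshold grid (the paper puts priors on the latent
%% process; the sine curve below just stands in for a draw):

import numpy as np

def round_to_counts(latent, thresholds):
    """Count j is recorded when the latent value falls in
    [thresholds[j], thresholds[j+1]); thresholds[0] = -inf sends all
    sufficiently small latent values to zero."""
    return np.searchsorted(thresholds, latent, side='right') - 1

t = np.linspace(0.0, 1.0, 10)
latent = 2.0 * np.sin(2.0 * np.pi * t) + 0.5        # stand-in latent path
thresholds = np.concatenate([[-np.inf], np.arange(0, 10)])
print(round_to_counts(latent, thresholds))          # nonnegative counts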
@article{fds304006,
Author = {Cornelis, B and Yang, Y and Vogelstein, JT and Dooms, A and Daubechies,
I and Dunson, D},
Title = {Bayesian crack detection in ultra high resolution multimodal
images of paintings},
Journal = {2013 18th International Conference on Digital Signal
Processing, DSP 2013},
Year = {2013},
Month = {December},
url = {http://arxiv.org/abs/1304.5894v2},
Abstract = {The preservation of our cultural heritage is of paramount
importance. Thanks to recent developments in digital
acquisition techniques, powerful image analysis algorithms
are developed which can be useful non-invasive tools to
assist in the restoration and preservation of art. In this
paper we propose a semi-supervised crack detection method
that can be used for high-dimensional acquisitions of
paintings coming from different modalities. Our dataset
consists of a recently acquired collection of images of the
Ghent Altarpiece (1432), one of Northern Europe's most
important art masterpieces. Our goal is to build a
classifier that is able to discern crack pixels from the
background consisting of non-crack pixels, making optimal
use of the information that is provided by each modality. To
accomplish this we employ a recently developed
non-parametric Bayesian classifier, that uses tensor
factorizations to characterize any conditional probability.
A prior is placed on the parameters of the factorization
such that every possible interaction between predictors is
allowed while still identifying a sparse subset among these
predictors. The proposed Bayesian classifier, which we will
refer to as conditional Bayesian tensor factorization or
CBTF, is assessed by visually comparing classification
results with the Random Forest (RF) algorithm. © 2013
IEEE.},
Doi = {10.1109/ICDSP.2013.6622710},
Key = {fds304006}
}
@article{fds257824,
Author = {Rai, P and Wang, Y and Guo, S and Chen, G and Dunson, D and Carin,
L},
Title = {Scalable bayesian low-rank decomposition of incomplete
multiway tensors},
Journal = {31st International Conference on Machine Learning, ICML
2014},
Volume = {5},
Pages = {3810-3820},
Year = {2014},
Month = {January},
ISBN = {9781634393973},
Abstract = {We present a scalable Bayesian framework for low-rank
decomposition of multiway tensor data with missing
observations. The key issue of pre-specifying the rank of
the decomposition is sidestepped in a principled manner
using a multiplicative gamma process prior. Both continuous
and binary data can be analyzed under the framework, in a
coherent way using fully conjugate Bayesian analysis. In
particular, the analysis in the non-conjugate binary case is
facilitated via the use of the Pólya-Gamma sampling
strategy which elicits closed-form Gibbs sampling updates.
The resulting samplers are efficient and enable us to apply
our framework to large-scale problems, with time-complexity
that is linear in the number of observed entries in the
tensor. This is especially attractive in analyzing very
large but sparsely observed tensors with very few known
entries. Moreover, our method admits easy extension to the
supervised setting where entities in one or more tensor
modes have labels. Our method outperforms several
state-of-the-art tensor decomposition methods on various
synthetic and benchmark real-world datasets.},
Key = {fds257824}
}
@article{fds257826,
Author = {Wang, X and Peng, P and Dunson, DB},
Title = {Median selection subset aggregation for parallel
inference},
Journal = {Advances in Neural Information Processing
Systems},
Volume = {3},
Number = {January},
Pages = {2195-2203},
Year = {2014},
Month = {January},
ISSN = {1049-5258},
Abstract = {For massive data sets, efficient computation commonly relies
on distributed algorithms that store and process subsets of
the data on different machines, minimizing communication
costs. Our focus is on regression and classification
problems involving many features. A variety of distributed
algorithms have been proposed in this context, but
challenges arise in defining an algorithm with low
communication, theoretical guarantees and excellent
practical performance in general settings. We propose a
MEdian Selection Subset AGgregation Estimator (message)
algorithm, which attempts to solve these problems. The
algorithm applies feature selection in parallel for each
subset using Lasso or another method, calculates the
'median' feature inclusion index, estimates coefficients for
the selected features in parallel for each subset, and then
averages these estimates. The algorithm is simple, involves
very minimal communication, scales efficiently in both
sample and feature size, and has theoretical guarantees. In
particular, we show model selection consistency and
coefficient estimation efficiency. Extensive experiments
show excellent performance in variable selection,
estimation, prediction, and computation time relative to
usual competitors.},
Key = {fds257826}
}
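%% Note: the abstract above lists the algorithm's steps explicitly,
%% so they translate almost directly into code. In this sketch the
%% selection step uses scikit-learn's cross-validated lasso as the
%% per-subset selector; subset counts, tolerances and the selector
%% itself are illustrative choices.

import numpy as np
from sklearn.linear_model import LassoCV, LinearRegression

def message(X, y, n_subsets=5, seed=0):
    """Median selection subset aggregation: select features on each
    subset, keep those whose inclusion indicator has median one,
    refit per subset on the kept features, and average the
    coefficient estimates."""
    idx = np.random.default_rng(seed).permutation(len(y))
    subsets = np.array_split(idx, n_subsets)
    included = np.array([np.abs(LassoCV(cv=3).fit(X[s], y[s]).coef_) > 1e-8
                         for s in subsets])
    selected = np.median(included, axis=0) >= 0.5   # median inclusion index
    coefs = np.array([LinearRegression().fit(X[s][:, selected], y[s]).coef_
                      for s in subsets])
    return selected, coefs.mean(axis=0)

rng = np.random.default_rng(1)
X = rng.normal(size=(600, 20))
y = 2.0 * X[:, 0] - X[:, 3] + rng.normal(scale=0.5, size=600)
selected, beta = message(X, y)
print(np.where(selected)[0], np.round(beta, 2))  # features 0 and 3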
@article{fds257831,
Author = {Minsker, S and Srivastava, S and Lin, L and Dunson,
DB},
Title = {Scalable and robust Bayesian inference via the median
posterior},
Journal = {31st International Conference on Machine Learning, ICML
2014},
Volume = {5},
Pages = {3629-3639},
Year = {2014},
Month = {January},
ISBN = {9781634393973},
Abstract = {Many Bayesian learning methods for massive data benefit from
working with small subsets of observations. In particular,
significant progress has been made in scalable Bayesian
learning via stochastic approximation. However, Bayesian
learning methods in distributed computing environments are
often problem- or distribution-specific and use ad hoc
techniques. We propose a novel general approach to Bayesian
inference that is scalable and robust to corruption in the
data. Our technique is based on the idea of splitting the
data into several non-overlapping subgroups, evaluating the
posterior distribution given each independent subgroup, and
then combining the results. Our main contribution is the
proposed aggregation step which is based on finding the
geometric median of subset posterior distributions.
Presented theoretical and numerical results confirm the
advantages of our approach.},
Key = {fds257831}
}
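%% Note: the aggregation step above is a geometric median; in the
%% paper it is taken over subset posterior distributions embedded in
%% a function space, while the sketch below applies the standard
%% Weiszfeld iteration to Euclidean point summaries, which is enough
%% to show the robustness to a corrupted subset.

import numpy as np

def geometric_median(points, n_iter=100, tol=1e-9):
    """Weiszfeld iteration: repeatedly re-weight points by inverse
    distance to the current estimate until convergence."""
    x = points.mean(axis=0)
    for _ in range(n_iter):
        d = np.maximum(np.linalg.norm(points - x, axis=1), tol)
        w = 1.0 / d
        x_new = (w[:, None] * points).sum(axis=0) / w.sum()
        if np.linalg.norm(x_new - x) < tol:
            break
        x = x_new
    return x

# One corrupted subset barely moves the median (unlike the mean)
estimates = np.array([[1.0, 1.0], [1.1, 0.9], [0.9, 1.1], [50.0, 50.0]])
print(geometric_median(estimates))  # stays near (1, 1)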
@article{fds257837,
Author = {Kundu, S and Dunson, DB},
Title = {Latent factor models for density estimation},
Journal = {Biometrika},
Volume = {101},
Number = {3},
Pages = {641-654},
Publisher = {Oxford University Press (OUP)},
Year = {2014},
Month = {January},
ISSN = {0006-3444},
url = {http://dx.doi.org/10.1093/biomet/asu019},
Abstract = {Although discrete mixture modelling has formed the backbone
of the literature on Bayesian density estimation, there are
some well-known disadvantages. As an alternative to discrete
mixtures, we propose a class of priors based on random
nonlinear functions of a uniform latent variable with an
additive residual. The induced prior for the density is
shown to have desirable properties, including ease of
centring on an initial guess, large support, posterior
consistency and straightforward computation via Gibbs
sampling. Some advantages over discrete mixtures, such as
Dirichlet process mixtures of Gaussian kernels, are
discussed and illustrated via simulations and an
application. © 2014 Biometrika Trust.},
Doi = {10.1093/biomet/asu019},
Key = {fds257837}
}
@article{fds257838,
Author = {Hannah, LA and Powell, WB and Dunson, DB},
Title = {Semiconvex regression for metamodeling-based
optimization},
Journal = {SIAM Journal on Optimization},
Volume = {24},
Number = {2},
Pages = {573-597},
Publisher = {Society for Industrial & Applied Mathematics
(SIAM)},
Year = {2014},
Month = {January},
ISSN = {1052-6234},
url = {http://dx.doi.org/10.1137/130907070},
Abstract = {Stochastic search involves finding a set of controllable
parameters that minimizes an unknown objective function
using a set of noisy observations. We consider the case when
the unknown function is convex and a metamodel is used as a
surrogate objective function. Often the data are non-i.i.d.
and include an observable state variable, such as applicant
information in a loan rate decision problem. State
information is difficult to incorporate into convex models.
We propose a new semiconvex regression method that is used
to produce a convex metamodel in the presence of a state
variable. We show consistency for this method. We
demonstrate its effectiveness for metamodeling on a set of
synthetic inventory management problems and a large
real-life auto loan dataset. © 2014 Society for Industrial
and Applied Mathematics.},
Doi = {10.1137/130907070},
Key = {fds257838}
}
@article{fds257840,
Author = {Pati, D and Bhattacharya, A and Pillai, NS and Dunson,
D},
Title = {Posterior contraction in sparse bayesian factor models for
massive covariance matrices},
Journal = {Annals of Statistics},
Volume = {42},
Number = {3},
Pages = {1102-1130},
Publisher = {Institute of Mathematical Statistics},
Year = {2014},
Month = {January},
ISSN = {0090-5364},
url = {http://dx.doi.org/10.1214/14-AOS1215},
Abstract = {Sparse Bayesian factor models are routinely implemented for
parsimonious dependence modeling and dimensionality
reduction in highdimensional applications. We provide
theoretical understanding of such Bayesian procedures in
terms of posterior convergence rates in inferring
high-dimensional covariance matrices where the dimension can
be larger than the sample size. Under relevant sparsity
assumptions on the true covariance matrix, we show that
commonly-used point mass mixture priors on the factor
loadings lead to consistent estimation in the operator norm
even when p ≫ n. One of our major contributions is to develop a
new class of continuous shrinkage priors and provide
insights into their concentration around sparse vectors.
Using such priors for the factor loadings, we obtain similar
rate of convergence as obtained with point mass mixture
priors. To obtain the convergence rates, we construct test
functions to separate points in the space of
high-dimensional covariance matrices using insights from
random matrix theory; the tools developed may be of
independent interest. We also derive minimax rates and show
that the Bayesian posterior rates of convergence coincide
with the minimax rates up to a √(log n) term.},
Doi = {10.1214/14-AOS1215},
Key = {fds257840}
}
@article{fds257841,
Author = {Durante, D and Dunson, DB},
Title = {Bayesian dynamic financial networks with time-varying
predictors},
Journal = {Statistics and Probability Letters},
Volume = {93},
Pages = {19-26},
Publisher = {Elsevier BV},
Year = {2014},
Month = {January},
ISSN = {0167-7152},
url = {http://dx.doi.org/10.1016/j.spl.2014.06.015},
Abstract = {We propose a targeted and robust modeling of dependence in
multivariate time series via dynamic networks, with
time-varying predictors included to improve interpretation
and prediction. The model is applied to financial markets,
estimating effects of verbal and material cooperations. ©
2014 Elsevier B.V.},
Doi = {10.1016/j.spl.2014.06.015},
Key = {fds257841}
}
@article{fds257842,
Author = {Durante, D and Scarpa, B and Dunson, DB},
Title = {Locally adaptive factor processes for multivariate time
series},
Journal = {Journal of Machine Learning Research},
Volume = {15},
Pages = {1493-1522},
Year = {2014},
Month = {January},
ISSN = {1532-4435},
Abstract = {In modeling multivariate time series, it is important to
allow time-varying smoothness in the mean and covariance
process. In particular, there may be certain time intervals
exhibiting rapid changes and others in which changes are
slow. If such time-varying smoothness is not accounted for,
one can obtain misleading inferences and predictions, with
over-smoothing across erratic time intervals and
under-smoothing across times exhibiting slow variation. This
can lead to mis-calibration of predictive intervals, which
can be substantially too narrow or wide depending on the
time. We propose a locally adaptive factor process for
characterizing multivariate mean-covariance changes in
continuous time, allowing locally varying smoothness in both
the mean and covariance matrix. This process is constructed
utilizing latent dictionary functions evolving in time
through nested Gaussian processes and linearly related to
the observed data with a sparse mapping. Using a differential
equation representation, we bypass usual computational
bottlenecks in obtaining MCMC and online algorithms for
approximate Bayesian inference. The performance is assessed
in simulations and illustrated in a financial application.
© 2014 Daniele Durante, Bruno Scarpa and David B.
Dunson.},
Key = {fds257842}
}
@article{fds257843,
Author = {Lin, L and Dunson, DB},
Title = {Bayesian monotone regression using Gaussian process
projection},
Journal = {Biometrika},
Volume = {101},
Number = {2},
Pages = {303-317},
Publisher = {Oxford University Press (OUP)},
Year = {2014},
Month = {January},
ISSN = {0006-3444},
url = {http://dx.doi.org/10.1093/biomet/ast063},
Abstract = {Shape-constrained regression analysis has applications in
dose-response modelling, environmental risk assessment,
disease screening and many other areas. Incorporating the
shape constraints can improve estimation efficiency and
avoid implausible results. We propose a novel method,
focusing on monotone curve and surface estimation, which
uses Gaussian process projections. Our inference is based on
projecting posterior samples from the Gaussian process. We
develop theory on continuity of the projection and rates of
contraction. Our approach leads to simple computation with
good performance in finite samples. The proposed projection
method can also be applied to other constrained-function
estimation problems, including those in multivariate
settings. © 2014 Biometrika Trust.},
Doi = {10.1093/biomet/ast063},
Key = {fds257843}
}
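%% Note: on a grid, the projection onto monotone functions named
%% above amounts to isotonic regression (the discrete L2 projection
%% onto the nondecreasing cone), so each unconstrained posterior draw
%% can be projected with a pool-adjacent-violators fit. A sketch
%% using scikit-learn; the grid, draw and usage here are
%% illustrative.

import numpy as np
from sklearn.isotonic import IsotonicRegression

def project_to_monotone(values):
    """Project a function sampled on an ordered grid onto the
    nondecreasing cone via isotonic regression; applying this to
    each posterior GP draw gives projected posterior samples."""
    grid = np.arange(len(values))
    return IsotonicRegression().fit_transform(grid, values)

draw = np.array([0.0, 0.4, 0.3, 0.9, 0.8, 1.5])  # non-monotone draw
print(project_to_monotone(draw))  # [0. 0.35 0.35 0.85 0.85 1.5]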
@article{fds257844,
Author = {Xing, Z and Nicholson, B and Jimenez, M and Veldman, T and Hudson, L and Lucas, J and Dunson, D and Zaas, AK and Woods, CW and Ginsburg, GS and Carin, L},
Title = {Bayesian modeling of temporal properties of infectious
disease in a college student population},
Journal = {Journal of Applied Statistics},
Volume = {41},
Number = {6},
Pages = {1358-1382},
Year = {2014},
Month = {January},
ISSN = {0266-4763},
url = {http://dx.doi.org/10.1080/02664763.2013.870138},
Abstract = {A Bayesian statistical model is developed for analysis of
the time-evolving properties of infectious disease, with a
particular focus on viruses. The model employs a latent
semi-Markovian state process, and the state-transition
statistics are driven by three terms: (i) a general
time-evolving trend of the overall population, (ii) a
semi-periodic term that accounts for effects caused by the
days of the week, and (iii) a regression term that relates
the probability of infection to covariates (here,
specifically, to the Google Flu Trends data). Computations
are performed using Markov Chain Monte Carlo sampling.
Results are presented using a novel data set: daily
self-reported symptom scores from hundreds of Duke
University undergraduate students, collected over three
academic years. The illnesses associated with these students
are (imperfectly) labeled using real-time (RT) polymerase
chain reaction (PCR) testing for several viruses, and
gene-expression data were also analyzed. The statistical
analysis is performed on the daily, self-reported symptom
scores, and the RT PCR and gene-expression data are employed
for analysis and interpretation of the model results. ©
2013 The Author(s). Published by Taylor &
Francis.},
Doi = {10.1080/02664763.2013.870138},
Key = {fds257844}
}
@article{fds257845,
Author = {Wade, S and Dunson, DB and Petrone, S and Trippa,
L},
Title = {Improving prediction from dirichlet process mixtures via
enrichment},
Journal = {Journal of Machine Learning Research},
Volume = {15},
Pages = {1041-1071},
Year = {2014},
Month = {January},
ISSN = {1532-4435},
Abstract = {Flexible covariate-dependent density estimation can be
achieved by modelling the joint density of the response and
covariates as a Dirichlet process mixture. An appealing
aspect of this approach is that computations are relatively
easy. In this paper, we examine the predictive performance
of these models with an increasing number of covariates.
Even for a moderate number of covariates, we find that the
likelihood for x tends to dominate the posterior of the
latent random partition, degrading the predictive
performance of the model. To overcome this, we suggest using
a different nonparametric prior, namely an enriched
Dirichlet process. Our proposal maintains a simple
allocation rule, so that computations remain relatively
simple. Advantages are shown through both predictive
equations and examples, including an application to
diagnosing Alzheimer's disease. © 2014 Sara Wade, David B.
Dunson, Sonia Petrone and Lorenzo Trippa.},
Key = {fds257845}
}
@article{fds257858,
Author = {Chen, CWS and Dunson, D and Frühwirth-Schnatter, S and Walker,
SG},
Title = {Special issue on Bayesian computing, methods and
applications},
Journal = {Computational Statistics and Data Analysis},
Volume = {71},
Pages = {273},
Publisher = {Elsevier BV},
Year = {2014},
Month = {January},
ISSN = {0167-9473},
url = {http://dx.doi.org/10.1016/j.csda.2013.10.011},
Doi = {10.1016/j.csda.2013.10.011},
Key = {fds257858}
}
@article{fds322557,
Author = {Bhattacharya, A and Pati, D and Dunson, D},
Title = {Anisotropic function estimation using multi-bandwidth
Gaussian processes},
Journal = {Annals of Statistics},
Volume = {42},
Number = {1},
Pages = {352-381},
Publisher = {Institute of Mathematical Statistics},
Year = {2014},
Month = {January},
url = {http://dx.doi.org/10.1214/13-AOS1192},
Abstract = {In nonparametric regression problems involving multiple
predictors, there is typically interest in estimating an
anisotropic multivariate regression surface in the important
predictors while discarding the unimportant ones. Our focus
is on defining a Bayesian procedure that leads to the
minimax optimal rate of posterior contraction (up to a log
factor) adapting to the unknown dimension and anisotropic
smoothness of the true surface. We propose such an approach
based on a Gaussian process prior with dimension-specific
scalings, which are assigned carefully-chosen hyperpriors.
We additionally show that using a homogeneous Gaussian
process with a single bandwidth leads to a sub-optimal rate
in anisotropic cases.},
Doi = {10.1214/13-AOS1192},
Key = {fds322557}
}
@article{fds322558,
Author = {Durante, D and Dunson, DB},
Title = {Bayesian logistic Gaussian process models for dynamic
networks},
Journal = {Journal of Machine Learning Research},
Volume = {33},
Pages = {194-201},
Year = {2014},
Month = {January},
Abstract = {Time-varying adjacency matrices encoding the presence or
absence of a relation among entities are available in many
research fields. Motivated by an application to studying
dynamic networks among sports teams, we propose a Bayesian
nonparametric model. The proposed approach uses a logistic
mapping from the probability matrix, encoding link
probabilities between each pair of teams, to an embedded latent
relational space. Within this latent space, we incorporate a
dictionary of Gaussian process (GP) latent trajectories
characterizing changes over time in each team, while
allowing learning of the number of latent dimensions through
a specially tailored prior for the GP covariance. The model
is provably flexible and borrows strength across the network
and over time. We provide simulation experiments and an
application to the Italian soccer Championship.},
Key = {fds322558}
}
@article{fds322559,
Author = {Scarpa, B and Dunson, DB},
Title = {Enriched Stick Breaking Processes for Functional
Data.},
Journal = {Journal of the American Statistical Association},
Volume = {109},
Number = {506},
Pages = {647-660},
Year = {2014},
Month = {January},
url = {http://dx.doi.org/10.1080/01621459.2013.866564},
Abstract = {In many applications involving functional data, prior
information is available about the proportion of curves
having different attributes. It is not straightforward to
include such information in existing procedures for
functional data analysis. Generalizing the functional
Dirichlet process (FDP), we propose a class of
stick-breaking priors for distributions of functions. These
priors incorporate functional atoms drawn from constrained
stochastic processes. The stick-breaking weights are
specified to allow user-specified prior probabilities for
curve attributes, with hyperpriors accommodating
uncertainty. Compared with the FDP, the random distribution
is enriched for curves having attributes known to be common.
Theoretical properties are considered, methods are developed
for posterior computation, and the approach is illustrated
using data on temperature curves in menstrual
cycles.},
Doi = {10.1080/01621459.2013.866564},
Key = {fds322559}
}
@article{fds257823,
Author = {Yin, R and Dunson, D and Cornelis, B and Brown, B and Ocon, N and Daubechies, I},
Title = {Digital cradle removal in X-ray images of art
paintings},
Journal = {2014 IEEE International Conference on Image Processing, ICIP
2014},
Pages = {4299-4303},
Publisher = {IEEE},
Year = {2014},
Month = {January},
ISBN = {9781479957514},
url = {http://dx.doi.org/10.1109/ICIP.2014.7025873},
Abstract = {We introduce an algorithm that removes the deleterious
effect of cradling on X-ray images of paintings on wooden
panels. The algorithm consists of a three-stage procedure.
Firstly, the cradled regions are located automatically. The
second step consists of separating the X-ray image into a
textural and image component. In the last step the algorithm
learns to distinguish between the texture caused by the
wooden cradle and the texture belonging to the original
painted wooden panel. The results obtained with our method
are compared with those obtained manually by best current
practice.},
Doi = {10.1109/ICIP.2014.7025873},
Key = {fds257823}
}
@article{fds257850,
Author = {Cui, K and Dunson, DB},
Title = {Generalized Dynamic Factor Models for Mixed-Measurement Time
Series.},
Journal = {Journal of computational and graphical statistics : a joint
publication of American Statistical Association, Institute
of Mathematical Statistics, Interface Foundation of North
America},
Volume = {23},
Number = {1},
Pages = {169-191},
Year = {2014},
Month = {February},
ISSN = {1061-8600},
url = {http://dx.doi.org/10.1080/10618600.2012.729986},
Abstract = {In this article, we propose generalized Bayesian dynamic
factor models for jointly modeling mixed-measurement time
series. The framework allows mixed-scale measurements
associated with each time series, with different
measurements having different distributions in the
exponential family conditionally on time-varying latent
factor(s). Efficient Bayesian computational algorithms are
developed for posterior inference on both the latent factors
and model parameters, based on a Metropolis-Hastings
algorithm with adaptive proposals. The algorithm relies on a
Greedy Density Kernel Approximation (GDKA) and parameter
expansion with latent factor normalization. We tested the
framework and algorithms in simulation studies and applied
them to the analysis of intertwined credit and recovery risk
for Moody's rated firms from 1982-2008, illustrating the
importance of jointly modeling mixed-measurement time
series. The article has supplemental materials available
online.},
Doi = {10.1080/10618600.2012.729986},
Key = {fds257850}
}
@article{fds257874,
Author = {Pati, D and Dunson, DB},
Title = {Bayesian nonparametric regression with varying residual
density.},
Journal = {Annals of the Institute of Statistical Mathematics},
Volume = {66},
Number = {1},
Pages = {1-31},
Year = {2014},
Month = {February},
ISSN = {0020-3157},
url = {http://dx.doi.org/10.1007/s10463-013-0415-z},
Abstract = {We consider the problem of robust Bayesian inference on the
mean regression function allowing the residual density to
change flexibly with predictors. The proposed class of
models is based on a Gaussian process prior for the mean
regression function and mixtures of Gaussians for the
collection of residual densities indexed by predictors.
Initially considering the homoscedastic case, we propose
priors for the residual density based on probit
stick-breaking (PSB) scale mixtures and symmetrized PSB
(sPSB) location-scale mixtures. Both priors restrict the
residual density to be symmetric about zero, with the sPSB
prior more flexible in allowing multimodal densities. We
provide sufficient conditions to ensure strong posterior
consistency in estimating the regression function under the
sPSB prior, generalizing existing theory focused on
parametric residual distributions. The PSB and sPSB priors
are generalized to allow residual densities to change
nonparametrically with predictors through incorporating
Gaussian processes in the stick-breaking components. This
leads to a robust Bayesian regression procedure that
automatically down-weights outliers and influential
observations in a locally-adaptive manner. Posterior
computation relies on an efficient data augmentation exact
block Gibbs sampler. The methods are illustrated using
simulated and real data applications.},
Doi = {10.1007/s10463-013-0415-z},
Key = {fds257874}
}
@article{fds257846,
Author = {Kundu, S and Dunson, DB},
Title = {Bayes variable selection in semiparametric linear
models.},
Journal = {Journal of the American Statistical Association},
Volume = {109},
Number = {505},
Pages = {437-447},
Year = {2014},
Month = {March},
ISSN = {0162-1459},
url = {http://dx.doi.org/10.1080/01621459.2014.881153},
Abstract = {There is a rich literature on Bayesian variable selection
for parametric models. Our focus is on generalizing methods
and asymptotic theory established for mixtures of
$g$-priors to semiparametric linear regression models
having unknown residual densities. Using a Dirichlet process
location mixture for the residual density, we propose a
semiparametric $g$-prior which incorporates an unknown
matrix of cluster allocation indicators. For this class of
priors, posterior computation can proceed via a
straightforward stochastic search variable selection
algorithm. In addition, Bayes factor and variable selection
consistency is shown to result under a class of proper
priors on $g$ even when the number of candidate
predictors $p$ is allowed to increase much faster than
sample size $n$, while making sparsity assumptions on
the true model size.},
Doi = {10.1080/01621459.2014.881153},
Key = {fds257846}
}
@article{fds257839,
Author = {Zhang, J and Jima, D and Moffitt, AB and Liu, Q and Czader, M and Hsi, ED and Fedoriw, Y and Dunphy, CH and Richards, KL and Gill, JI and Sun, Z and Love, C and Scotland, P and Lock, E and Levy, S and Hsu, DS and Dunson, D and Dave, SS},
Title = {The genomic landscape of mantle cell lymphoma is related to
the epigenetically determined chromatin state of normal B
cells.},
Journal = {Blood},
Volume = {123},
Number = {19},
Pages = {2988-2996},
Year = {2014},
Month = {May},
ISSN = {0006-4971},
url = {http://dx.doi.org/10.1182/blood-2013-07-517177},
Abstract = {In this study, we define the genetic landscape of mantle
cell lymphoma (MCL) through exome sequencing of 56 cases of
MCL. We identified recurrent mutations in ATM, CCND1, MLL2,
and TP53. We further identified a number of novel genes
recurrently mutated in patients with MCL including RB1,
WHSC1, POT1, and SMARCA4. We noted that MCLs have a distinct
mutational profile compared with lymphomas from other B-cell
stages. The ENCODE project has defined the chromatin
structure of many cell types. However, a similar
characterization of primary human mature B cells has been
lacking. We defined, for the first time, the chromatin
structure of primary human naïve, germinal center, and
memory B cells through chromatin immunoprecipitation and
sequencing for H3K4me1, H3K4me3, H3Ac, H3K36me3, H3K27me3,
and PolII. We found that somatic mutations that occur more
frequently in either MCLs or Burkitt lymphomas were
associated with open chromatin in their respective B cells
of origin, naïve B cells, and germinal center B cells. Our
work thus elucidates the landscape of gene-coding mutations
in MCL and the critical interplay between epigenetic
alterations associated with B-cell differentiation and the
acquisition of somatic mutations in cancer.},
Doi = {10.1182/blood-2013-07-517177},
Key = {fds257839}
}
@article{fds257852,
Author = {Kessler, DC and Taylor, JA and Dunson, DB},
Title = {Learning phenotype densities conditional on many interacting
predictors.},
Journal = {Bioinformatics (Oxford, England)},
Volume = {30},
Number = {11},
Pages = {1562-1568},
Year = {2014},
Month = {June},
ISSN = {1367-4803},
url = {http://dx.doi.org/10.1093/bioinformatics/btu040},
Abstract = {Motivation: Estimating a phenotype distribution
conditional on a set of discrete-valued predictors is a
commonly encountered task. For example, interest may be in
how the density of a quantitative trait varies with single
nucleotide polymorphisms and patient characteristics. The
subset of important predictors is not usually known in
advance. This becomes more challenging with a
high-dimensional predictor set when there is the possibility
of interaction. Results: We demonstrate a novel
non-parametric Bayes method based on a tensor factorization
of predictor-dependent weights for Gaussian kernels. The
method uses multistage predictor selection for dimension
reduction, providing succinct models for the phenotype
distribution. The resulting conditional density morphs
flexibly with the selected predictors. In a simulation study
and an application to molecular epidemiology data, we
demonstrate advantages over commonly used
methods.},
Doi = {10.1093/bioinformatics/btu040},
Key = {fds257852}
}
@article{fds346413,
Author = {Strawn, N and Armagan, A and Saab, R and Carin, L and Dunson,
D},
Title = {Finite sample posterior concentration in high-dimensional
regression},
Journal = {Information and Inference},
Volume = {3},
Number = {2},
Pages = {103-133},
Year = {2014},
Month = {June},
url = {http://dx.doi.org/10.1093/imaiai/iau003},
Abstract = {We study the behavior of the posterior distribution in
high-dimensional Bayesian Gaussian linear regression models
having p ≫ n, where p is the number of predictors and n is
the sample size. Our focus is on obtaining quantitative
finite sample bounds ensuring sufficient posterior
probability assigned in neighborhoods of the true regression
coefficient vector (β0) with high probability. We assume
that β0 is approximately S-sparse and also obtain universal
bounds, which provide insight into the role of the prior in
controlling concentration of the posterior. Based on these
finite sample bounds, we examine the implied asymptotic
contraction rates for several examples, showing that
sparsely structured and heavy-tailed shrinkage priors exhibit
rapid contraction rates. We also demonstrate that a stronger
result holds for the sparsity(S)-Gaussian prior. These
types of finite sample bounds provide guidelines for
designing and evaluating priors for high-dimensional
problems.},
Doi = {10.1093/imaiai/iau003},
Key = {fds346413}
}
@article{fds257836,
Author = {Wheeler, MW and Dunson, DB and Pandalai, SP and Baker, BA and Herring,
AH},
Title = {Mechanistic Hierarchical Gaussian Processes.},
Journal = {Journal of the American Statistical Association},
Volume = {109},
Number = {507},
Pages = {894-904},
Year = {2014},
Month = {July},
ISSN = {0162-1459},
url = {http://dx.doi.org/10.1080/01621459.2014.899234},
Abstract = {The statistics literature on functional data analysis
focuses primarily on flexible black-box approaches, which
are designed to allow individual curves to have essentially
any shape while characterizing variability. Such methods
typically cannot incorporate mechanistic information, which
is commonly expressed in terms of differential equations.
Motivated by studies of muscle activation, we propose a
nonparametric Bayesian approach that takes into account
mechanistic understanding of muscle physiology. A novel
class of hierarchical Gaussian processes is defined that
favors curves consistent with differential equations defined
on motor, damper, spring systems. A Gibbs sampler is
proposed to sample from the posterior distribution and
applied to a study of rats exposed to non-injurious muscle
activation protocols. Although motivated by muscle force
data, a parallel approach can be used to include mechanistic
information in broad functional data analysis
applications.},
Doi = {10.1080/01621459.2014.899234},
Key = {fds257836}
}
@article{fds257835,
Author = {Dunson, DB},
Title = {Comment},
Journal = {Journal of the American Statistical Association},
Volume = {109},
Number = {507},
Pages = {890-891},
Publisher = {Informa UK Limited},
Year = {2014},
Month = {July},
ISSN = {0162-1459},
url = {http://dx.doi.org/10.1080/01621459.2014.955988},
Doi = {10.1080/01621459.2014.955988},
Key = {fds257835}
}
@article{fds257834,
Author = {Rodriguez, A and Dunson, DB},
Title = {Functional clustering in nested designs: Modeling
variability in reproductive epidemiology
studies},
Journal = {Annals of Applied Statistics},
Volume = {8},
Number = {3},
Pages = {1416-1442},
Publisher = {Institute of Mathematical Statistics},
Year = {2014},
Month = {September},
ISSN = {1932-6157},
url = {http://dx.doi.org/10.1214/14-AOAS751},
Abstract = {We discuss functional clustering procedures for nested
designs, where multiple curves are collected for each
subject in the study. We start by considering the
application of standard functional clustering tools to this
problem, which leads to groupings based on the average
profile for each subject. After discussing some of the
shortcomings of this approach, we present a mixture model
based on a generalization of the nested Dirichlet process
that clusters subjects based on the distribution of their
curves. By using mixtures of generalized Dirichlet
processes, the model induces a much more flexible prior on
the partition structure than other popular model-based
clustering methods, allowing for different rates of
introduction of new clusters as the number of observations
increases. The methods are illustrated using hormone
profiles from multiple menstrual cycles collected for women
in the Early Pregnancy Study.},
Doi = {10.1214/14-AOAS751},
Key = {fds257834}
}
@article{fds257833,
Author = {Gu, K and Pati, D and Dunson, DB},
Title = {Bayesian Multiscale Modeling of Closed Curves in Point
Clouds.},
Journal = {Journal of the American Statistical Association},
Volume = {109},
Number = {508},
Pages = {1481-1494},
Year = {2014},
Month = {October},
ISSN = {0162-1459},
url = {http://dx.doi.org/10.1080/01621459.2014.934825},
Abstract = {Modeling object boundaries based on image or point cloud
data is frequently necessary in medical and scientific
applications ranging from detecting tumor contours for
targeted radiation therapy, to the classification of
organisms based on their structural information. In
low-contrast images or sparse and noisy point clouds, there
is often insufficient data to recover local segments of the
boundary in isolation. Thus, it becomes critical to model
the entire boundary in the form of a closed curve. To
achieve this, we develop a Bayesian hierarchical model that
expresses highly diverse 2D objects in the form of closed
curves. The model is based on a novel multiscale deformation
process. By relating multiple objects through a hierarchical
formulation, we can successfully recover missing boundaries
by borrowing structural information from similar objects at
the appropriate scale. Furthermore, the model's latent
parameters help interpret the population, indicating
dimensions of significant structural variability and also
specifying a 'central curve' that summarizes the collection.
Theoretical properties of our prior are studied in specific
cases and efficient Markov chain Monte Carlo methods are
developed, evaluated through simulation examples and applied
to panorex teeth images for modeling teeth contours and also
to a brain tumor contour detection problem.},
Doi = {10.1080/01621459.2014.934825},
Key = {fds257833}
}
@article{fds257865,
Author = {Yang, H and Liu, F and Ji, C and Dunson, D},
Title = {Adaptive sampling for Bayesian geospatial
models},
Journal = {Statistics and Computing},
Volume = {24},
Number = {6},
Pages = {1101-1110},
Publisher = {Springer Nature},
Year = {2014},
Month = {November},
ISSN = {0960-3174},
url = {http://dx.doi.org/10.1007/s11222-013-9422-4},
Abstract = {Bayesian hierarchical modeling with Gaussian process random
effects provides a popular approach for analyzing
point-referenced spatial data. For large spatial data sets,
however, generic posterior sampling is infeasible due to the
extremely high computational burden in decomposing the
spatial correlation matrix. In this paper, we propose an
efficient algorithm—the adaptive griddy Gibbs (AGG)
algorithm—to address the computational issues with large
spatial data sets. The proposed algorithm dramatically
reduces the computational complexity. We show theoretically
that the proposed method can approximate the real posterior
distribution accurately. The sufficient number of grid
points for a required accuracy has also been derived. We
compare the performance of AGG with that of the
state-of-the-art methods in simulation studies. Finally, we
apply AGG to spatially indexed data concerning building
energy consumption.},
Doi = {10.1007/s11222-013-9422-4},
Key = {fds257865}
}
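%% Editor's note: a minimal sketch of a single griddy Gibbs update, the
%% building block the AGG algorithm above adapts (grid, target and names are
%% illustrative assumptions, not the paper's implementation): evaluate the
%% unnormalized full conditional on a grid and sample by inverse-CDF with
%% interpolation.
import numpy as np

def griddy_gibbs_step(log_density, grid, rng=np.random.default_rng()):
    """Draw approximately from the density proportional to exp(log_density) on `grid`."""
    logp = log_density(grid)
    p = np.exp(logp - logp.max())                # stabilize before normalizing
    cdf = np.cumsum(p)
    cdf /= cdf[-1]
    return np.interp(rng.uniform(), cdf, grid)   # inverse-CDF draw

# Hypothetical usage: update a spatial range parameter given all other unknowns.
draw = griddy_gibbs_step(lambda g: -0.5 * (g - 1.0) ** 2 / 0.2, np.linspace(0.01, 3, 300))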
@article{fds322556,
Author = {Durante, D and Dunson, DB},
Title = {Nonparametric Bayes dynamic modelling of relational
data},
Journal = {Biometrika},
Volume = {101},
Number = {4},
Pages = {883-898},
Publisher = {Oxford University Press (OUP)},
Year = {2014},
Month = {December},
url = {http://dx.doi.org/10.1093/biomet/asu040},
Abstract = {Symmetric binary matrices representing relations are
collected in many areas. Our focus is on dynamically
evolving binary relational matrices, with interest being on
inference on the relationship structure and prediction. We
propose a nonparametric Bayesian dynamic model, which
reduces dimensionality in characterizing the binary matrix
through a lower-dimensional latent space representation,
with the latent coordinates evolving in continuous time via
Gaussian processes. By using a logistic mapping function
from the link probability matrix space to the latent
relational space, we obtain a flexible and computationally
tractable formulation. Employing Pólya-gamma data
augmentation, an efficient Gibbs sampler is developed for
posterior computation, with the dimension of the latent
space automatically inferred. We provide theoretical results
on flexibility of the model, and illustrate its performance
via simulation experiments. We also consider an application
to co-movements in world financial markets.},
Doi = {10.1093/biomet/asu040},
Key = {fds322556}
}
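%% Editor's note: a minimal generative sketch of the latent space construction
%% described above (notation and smoothing choices are assumptions): each node
%% i carries a smooth latent trajectory x_i(t), edge log-odds are
%% mu + x_i(t)'x_j(t), and the symmetric adjacency matrix at each time is
%% Bernoulli given those probabilities.
import numpy as np

def simulate_dynamic_network(n_nodes=10, n_times=20, dim=2, mu=-1.0, seed=0):
    rng = np.random.default_rng(seed)
    t = np.linspace(0, 1, n_times)
    # smooth latent trajectories (random sinusoids standing in for GP draws)
    x = np.array([[a * np.sin(2 * np.pi * (t + ph))
                   for a, ph in zip(rng.normal(0, 1, dim), rng.uniform(0, 1, dim))]
                  for _ in range(n_nodes)])                  # (nodes, dim, times)
    A = np.zeros((n_times, n_nodes, n_nodes), dtype=int)
    for k in range(n_times):
        eta = mu + x[:, :, k] @ x[:, :, k].T                 # log-odds matrix
        pi = 1.0 / (1.0 + np.exp(-eta))                      # logistic mapping
        U = np.triu(rng.random((n_nodes, n_nodes)) < pi, 1).astype(int)
        A[k] = U + U.T                                       # symmetric, no self-loops
    return A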
@article{fds257827,
Author = {Chabout, J and Sarkar, A and Dunson, DB and Jarvis,
ED},
Title = {Male mice song syntax depends on social contexts and
influences female preferences.},
Journal = {Front Behav Neurosci},
Volume = {9},
Pages = {76},
Publisher = {FRONTIERS MEDIA SA},
Year = {2015},
url = {http://hdl.handle.net/10161/9544},
Abstract = {In 2005, Holy and Guo advanced the idea that male mice
produce ultrasonic vocalizations (USV) with some features
similar to courtship songs of songbirds. Since then, studies
showed that male mice emit USV songs in different contexts
(sexual and other) and possess a multisyllabic repertoire.
Debate still exists for and against plasticity in their
vocalizations. But the use of a multisyllabic repertoire can
increase potential flexibility and information, in how
elements are organized and recombined, namely syntax. In
many bird species, modulating song syntax has ethological
relevance for sexual behavior and mate preferences. In this
study we exposed adult male mice to different social
contexts and developed a new approach of analyzing their
USVs based on songbird syntax analysis. We found that male
mice modify their syntax, including specific sequences,
length of sequence, repertoire composition, and spectral
features, according to stimulus and social context. Males
emit longer and simpler syllables and sequences when singing
to females, but more complex syllables and sequences in
response to fresh female urine. Playback experiments show
that the females prefer the complex songs over the simpler
ones. We propose the complex songs are to lure females in,
whereas the directed simpler sequences are used for direct
courtship. These results suggest that although mice have a
much more limited ability to modify song, they could still
be used as animal models for understanding some vocal
communication features for which songbirds are
used.},
Doi = {10.3389/fnbeh.2015.00076},
Key = {fds257827}
}
@article{fds257828,
Author = {Canale, A and Dunson, DB},
Title = {Bayesian multivariate mixed-scale density
estimation},
Journal = {Statistics and its Interface},
Volume = {8},
Number = {2},
Pages = {195-201},
Publisher = {International Press of Boston},
Year = {2015},
Month = {January},
ISSN = {1938-7989},
url = {http://dx.doi.org/10.4310/SII.2015.v8.n2.a7},
Abstract = {Although continuous density estimation has received abundant
attention in the Bayesian nonparametrics literature, there
is limited theory on multivariate mixed scale density
estimation. In this note, we consider a general framework to
jointly model continuous, count and categorical variables
under a nonparametric prior, which is induced through
rounding latent variables having an unknown density with
respect to Lebesgue measure. For the proposed class of
priors, we provide sufficient conditions for large support,
strong consistency and rates of posterior contraction. These
conditions allow one to convert sufficient conditions
obtained in the setting of multivariate continuous density
estimation to the mixed scale case. To illustrate the
procedure, a rounded multivariate nonparametric mixture of
Gaussians is introduced and applied to a crime and
communities dataset.},
Doi = {10.4310/SII.2015.v8.n2.a7},
Key = {fds257828}
}
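%% Editor's note: a minimal sketch of the rounding construction above
%% (component means and thresholds are illustrative assumptions): draw latent
%% vectors from a Gaussian mixture and round coordinates to induce continuous,
%% count and binary margins from a single latent density.
import numpy as np

def rounded_mixture_sample(n, seed=1):
    rng = np.random.default_rng(seed)
    means = np.array([[0.0, 1.0, -0.5], [2.0, 3.0, 0.5]])    # two mixture components
    z = means[rng.integers(0, 2, size=n)] + rng.normal(size=(n, 3))
    y_cont = z[:, 0]                                         # identity "rounding"
    y_count = np.maximum(np.floor(z[:, 1]), 0).astype(int)   # thresholds at the integers
    y_bin = (z[:, 2] > 0).astype(int)                        # threshold at zero
    return y_cont, y_count, y_bin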
@article{fds257849,
Author = {Kessler, DC and Hoff, PD and Dunson, DB},
Title = {Marginally specified priors for non-parametric Bayesian
estimation.},
Journal = {Journal of the Royal Statistical Society. Series B,
Statistical methodology},
Volume = {77},
Number = {1},
Pages = {35-58},
Year = {2015},
Month = {January},
ISSN = {1369-7412},
url = {http://dx.doi.org/10.1111/rssb.12059},
Abstract = {Prior specification for non-parametric Bayesian inference
involves the difficult task of quantifying prior knowledge
about a parameter of high, often infinite, dimension. A
statistician is unlikely to have informed opinions about all
aspects of such a parameter but will have real information
about functionals of the parameter, such as the population
mean or variance. The paper proposes a new framework for
non-parametric Bayes inference in which the prior
distribution for a possibly infinite dimensional parameter
is decomposed into two parts: an informative prior on a
finite set of functionals, and a non-parametric conditional
prior for the parameter given the functionals. Such priors
can be easily constructed from standard non-parametric prior
distributions in common use and inherit the large support of
the standard priors on which they are based. Additionally,
posterior approximations under these informative priors can
generally be made via minor adjustments to existing Markov
chain approximation algorithms for standard non-parametric
prior distributions. We illustrate the use of such priors in
the context of multivariate density estimation using
Dirichlet process mixture models, and in the modelling of
high dimensional sparse contingency tables.},
Doi = {10.1111/rssb.12059},
Key = {fds257849}
}
@article{fds322544,
Author = {Van Den Boom, W and Dunson, D and Reeves, G},
Title = {Quantifying uncertainty in variable selection with arbitrary
matrices},
Journal = {2015 IEEE 6th International Workshop on Computational
Advances in Multi-Sensor Adaptive Processing, CAMSAP
2015},
Pages = {385-388},
Year = {2015},
Month = {January},
ISBN = {9781479919635},
url = {http://dx.doi.org/10.1109/CAMSAP.2015.7383817},
Abstract = {Probabilistically quantifying uncertainty in parameters,
predictions and decisions is a crucial component of broad
scientific and engineering applications. This is however
difficult if the number of parameters far exceeds the sample
size. Although there are currently many methods which have
guarantees for problems characterized by large random
matrices, there is often a gap between theory and practice
when it comes to measures of statistical significance for
matrices encountered in real-world applications. This paper
proposes a scalable framework that utilizes state-of-the-art
methods to provide approximations to the marginal posterior
distributions. This framework is used to approximate
marginal posterior inclusion probabilities for Bayesian
variable selection.},
Doi = {10.1109/CAMSAP.2015.7383817},
Key = {fds322544}
}
@article{fds322551,
Author = {Zhou, J and Bhattacharya, A and Herring, A and Dunson,
D},
Title = {Bayesian factorizations of big sparse tensors.},
Journal = {Journal of the American Statistical Association},
Volume = {110},
Number = {512},
Pages = {1562-1576},
Publisher = {Informa UK Limited},
Year = {2015},
Month = {January},
url = {http://dx.doi.org/10.1080/01621459.2014.983233},
Abstract = {It has become routine to collect data that are structured as
multiway arrays (tensors). There is an enormous literature
on low rank and sparse matrix factorizations, but limited
consideration of extensions to the tensor case in
statistics. The most common low rank tensor factorization
relies on parallel factor analysis (PARAFAC), which
expresses a rank k tensor as a sum of rank one tensors. When
observations are only available for a tiny subset of the
cells of a big tensor, the low rank assumption is not
sufficient and PARAFAC has poor performance. We induce an
additional layer of dimension reduction by allowing the
effective rank to vary across dimensions of the table. For
concreteness, we focus on a contingency table application.
Taking a Bayesian approach, we place priors on terms in the
factorization and develop an efficient Gibbs sampler for
posterior computation. Theory is provided showing posterior
concentration rates in high-dimensional settings, and the
methods are shown to have excellent performance in
simulations and several real data applications.},
Doi = {10.1080/01621459.2014.983233},
Key = {fds322551}
}
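%% Editor's note: a small worked example of the PARAFAC form referenced above:
%% a rank-k tensor is a sum of k rank-one tensors,
%% T = sum_r lam_r a_r (outer) b_r (outer) c_r (dimensions here are arbitrary
%% assumptions for illustration).
import numpy as np

def parafac_tensor(lam, A, B, C):
    """Assemble a three-way tensor from weights lam (k,) and factors A (I,k), B (J,k), C (L,k)."""
    return np.einsum('r,ir,jr,lr->ijl', lam, A, B, C)

rng = np.random.default_rng(0)
k, I, J, L = 3, 4, 5, 6
T = parafac_tensor(rng.random(k), rng.random((I, k)), rng.random((J, k)), rng.random((L, k)))
assert T.shape == (I, J, L)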
@article{fds322553,
Author = {Srivastava, S and Cevher, V and Tran-Dinh, Q and Dunson,
DB},
Title = {WASP: Scalable Bayes via barycenters of subset
posteriors},
Journal = {Journal of Machine Learning Research},
Volume = {38},
Pages = {912-920},
Year = {2015},
Month = {January},
Abstract = {The promise of Bayesian methods for big data sets has not
fully been realized due to the lack of scalable
computational algorithms. For massive data, it is necessary
to store and process subsets on different machines in a
distributed manner. We propose a simple, general, and highly
efficient approach, which first runs a posterior sampling
algorithm in parallel on different machines for subsets of a
large data set. To combine these subset posteriors, we
calculate the Wasserstein barycenter via a highly efficient
linear program. The resulting estimate for the Wasserstein
posterior (WASP) has an atomic form, facilitating
straightforward estimation of posterior summaries of
functionals of interest. The WASP approach allows posterior
sampling algorithms for smaller data sets to be trivially
scaled to huge data. We provide theoretical justification in
terms of posterior consistency and algorithm efficiency.
Examples are provided in complex settings including Gaussian
process regression and nonparametric Bayes mixture
models.},
Key = {fds322553}
}
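%% Editor's note: a minimal sketch of the WASP combining step for a scalar
%% functional (an illustration; the paper computes barycenters via a linear
%% program). In one dimension the Wasserstein-2 barycenter is obtained by
%% averaging the subset posteriors' quantile functions.
import numpy as np

def wasp_1d(subset_draws, n_grid=1000):
    """subset_draws: list of 1-d arrays of MCMC draws, one array per data subset."""
    probs = (np.arange(n_grid) + 0.5) / n_grid
    q = np.array([np.quantile(d, probs) for d in subset_draws])
    return q.mean(axis=0)                      # atoms representing the barycenter

# Hypothetical usage: three subset posteriors for one functional of interest.
rng = np.random.default_rng(2)
wasp_draws = wasp_1d([rng.normal(m, 1.0, 5000) for m in (0.9, 1.0, 1.1)])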
@article{fds322554,
Author = {Wang, X and Leng, C and Dunson, DB},
Title = {On the consistency theory of high dimensional variable
screening},
Journal = {Advances in Neural Information Processing
Systems},
Volume = {2015-January},
Pages = {2431-2439},
Year = {2015},
Month = {January},
Abstract = {Variable screening is a fast dimension reduction technique
for assisting high dimensional feature selection. As a
preselection method, it selects a moderate size subset of
candidate variables for further refining via feature
selection to produce the final model. The performance of
variable screening depends on both computational efficiency
and the ability to dramatically reduce the number of
variables without discarding the important ones. When the
data dimension p is substantially larger than the sample
size n, variable screening becomes crucial as 1) Faster
feature selection algorithms are needed; 2) Conditions
guaranteeing selection consistency might fail to hold. This
article studies a class of linear screening methods and
establishes consistency theory for this special class. In
particular, we prove the restricted diagonally dominant
(RDD) condition is a necessary and sufficient condition for
strong screening consistency. As concrete examples, we show
two screening methods SIS and HOLP are both strong screening
consistent (subject to additional constraints) with large
probability if $n > O((\sigma/\tau)^{2}\log p)$ under random designs.
In addition, we relate the RDD condition to the
irrepresentable condition, and highlight limitations of
SIS.},
Key = {fds322554}
}
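%% Editor's note: a minimal sketch of the two screening rules studied above
%% (standardization and the size-d cutoff are assumptions): SIS ranks
%% predictors by marginal correlation |X_j'y|, HOLP by the projection
%% X'(XX')^{-1}y; both keep the d top-ranked coordinates for downstream
%% feature selection.
import numpy as np

def sis_screen(X, y, d):
    scores = np.abs(((X - X.mean(0)) / X.std(0)).T @ (y - y.mean()))
    return np.argsort(scores)[::-1][:d]

def holp_screen(X, y, d):
    beta = X.T @ np.linalg.solve(X @ X.T, y)   # only an n x n solve, cheap when p >> n
    return np.argsort(np.abs(beta))[::-1][:d]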
@article{fds322029,
Author = {Wang, X and Guo, F and Heller, KA and Dunson, DB},
Title = {Parallelizing MCMC with random partition
trees},
Journal = {Advances in Neural Information Processing
Systems},
Volume = {2015-January},
Pages = {451-459},
Year = {2015},
Month = {January},
Abstract = {The modern scale of data has brought new challenges to
Bayesian inference. In particular, conventional MCMC
algorithms are computationally very expensive for large data
sets. A promising approach to solve this problem is
embarrassingly parallel MCMC (EP-MCMC), which first
partitions the data into multiple subsets and runs
independent sampling algorithms on each subset. The subset
posterior draws are then aggregated via some combining rules
to obtain the final approximation. Existing EP-MCMC
algorithms are limited by approximation accuracy and
difficulty in resampling. In this article, we propose a new
EP-MCMC algorithm PART that solves these problems. The new
algorithm applies random partition trees to combine the
subset posterior draws, which is distribution-free, easy to
resample from and can adapt to multiple scales. We provide
theoretical justification and extensive experiments
illustrating empirical performance.},
Key = {fds322029}
}
@article{fds322555,
Author = {Wang, Y and Dunson, D},
Title = {Probabilistic curve learning: Coulomb repulsion and the
electrostatic Gaussian process},
Journal = {Advances in Neural Information Processing
Systems},
Volume = {2015-January},
Pages = {1738-1746},
Year = {2015},
Month = {January},
Abstract = {Learning of low dimensional structure in multidimensional
data is a canonical problem in machine learning. One common
approach is to suppose that the observed data are close to a
lower-dimensional smooth manifold. There are a rich variety
of manifold learning methods available, which allow mapping
of data points to the manifold. However, there is a clear
lack of probabilistic methods that allow learning of the
manifold along with the generative distribution of the
observed data. The best attempt is the Gaussian process
latent variable model (GP-LVM), but identifiability issues
lead to poor performance. We solve these issues by proposing
a novel Coulomb repulsive process (Corp) for locations of
points on the manifold, inspired by physical models of
electrostatic interactions among particles. Combining this
process with a GP prior for the mapping function yields a
novel electrostatic GP (electroGP) process. Focusing on the
simple case of a one-dimensional manifold, we develop
efficient inference algorithms, and illustrate substantially
improved performance in a variety of experiments including
filling in missing frames in video.},
Key = {fds322555}
}
@article{fds322543,
Author = {Kunihama, T and Dunson, DB},
Title = {Nonparametric Bayes inference on conditional
independence},
Journal = {Biometrika},
Volume = {103},
Number = {1},
Pages = {35-47},
Publisher = {Oxford University Press (OUP)},
Year = {2015},
Month = {January},
url = {http://dx.doi.org/10.1093/biomet/asv060},
Abstract = {In many application areas, a primary focus is on assessing
evidence in the data refuting the assumption of independence
of Y and X conditionally on Z, with Y response variables, X
predictors of interest, and Z covariates. Ideally, one would
have methods available that avoid parametric assumptions,
allow Y, X, Z to be random variables on arbitrary spaces
with arbitrary dimension, and accommodate rapid
consideration of different candidate predictors. As a formal
decision-theoretic approach has clear disadvantages in this
context, we instead rely on an encompassing nonparametric
Bayes model for the joint distribution of Y, X and Z, with
conditional mutual information used as a summary of the
strength of conditional dependence. We construct a
functional of the encompassing model and empirical measure
for estimation of conditional mutual information. The
implementation relies on a single Markov chain Monte Carlo
run under the encompassing model, with conditional mutual
information for candidate models calculated as a byproduct.
We provide an asymptotic theory supporting the approach, and
apply the method to variable selection. The methods are
illustrated through simulations and criminology
applications.},
Doi = {10.1093/biomet/asv060},
Key = {fds322543}
}
@article{fds257829,
Author = {Lock, EF and Soldano, KL and Garrett, ME and Cope, H and Markunas, CA and Fuchs, H and Grant, G and Dunson, DB and Gregory, SG and Ashley-Koch,
AE},
Title = {Joint eQTL assessment of whole blood and dura mater tissue
from individuals with Chiari type I malformation.},
Journal = {BMC Genomics},
Volume = {16},
Number = {1},
Pages = {11},
Year = {2015},
Month = {January},
url = {http://dx.doi.org/10.1186/s12864-014-1211-8},
Abstract = {BACKGROUND: Expression quantitative trait loci (eQTL) play
an important role in the regulation of gene expression. Gene
expression levels and eQTLs are expected to vary from tissue
to tissue, and therefore multi-tissue analyses are necessary
to fully understand complex genetic conditions in humans.
Dura mater tissue likely interacts with cranial bone growth
and thus may play a role in the etiology of Chiari Type I
Malformation (CMI) and related conditions, but it is often
inaccessible and its gene expression has not been well
studied. A genetic basis to CMI has been established;
however, the specific genetic risk factors are not well
characterized. RESULTS: We present an assessment of eQTLs
for whole blood and dura mater tissue from individuals with
CMI. A joint-tissue analysis identified 239 eQTLs in either
dura or blood, with 79% of these eQTLs shared by both
tissues. Several identified eQTLs were novel and these
implicate genes involved in bone development (IPO8, XYLT1,
and PRKAR1A), and ribosomal pathways related to marrow and
bone dysfunction, as potential candidates in the development
of CMI. CONCLUSIONS: Despite strong overall heterogeneity in
expression levels between blood and dura, the majority of
cis-eQTLs are shared by both tissues. The power to detect
shared eQTLs was improved by using an integrative
statistical approach. The identified tissue-specific and
shared eQTLs provide new insight into the genetic basis for
CMI and related conditions.},
Doi = {10.1186/s12864-014-1211-8},
Key = {fds257829}
}
@article{fds346412,
Author = {Strawn, N and Armagan, A and Saab, R and Carin, L and Dunson,
D},
Title = {Erratum: Finite sample posterior concentration in
high-dimensional regression (Information and Inference
(2015) 3 (103-133) DOI: 10.1093/imaiai/iau003)},
Journal = {Information and Inference},
Volume = {4},
Number = {1},
Pages = {77},
Year = {2015},
Month = {March},
url = {http://dx.doi.org/10.1093/imaiai/iau008},
Abstract = {Artin Armagan's and Rayan Saab's affiliations were switched
in the published version of this article. Artin Armagan's
affiliation should be: SAS Institute, Inc., Raleigh, NC,
USA; Rayan Saab's affiliation should be: Department of
Mathematics, University of California, San Diego, CA, USA.
The Publisher apologizes for this error.},
Doi = {10.1093/imaiai/iau008},
Key = {fds346412}
}
@article{fds257830,
Author = {Li, D and Wilcox, AJ and Dunson, DB},
Title = {Benchmark pregnancy rates and the assessment of post-coital
contraceptives: an update.},
Journal = {Contraception},
Volume = {91},
Number = {4},
Pages = {344-349},
Year = {2015},
Month = {April},
ISSN = {0010-7824},
url = {http://dx.doi.org/10.1016/j.contraception.2015.01.002},
Abstract = {Objective: In 2001, we provided benchmark estimates
of probability of pregnancy given a single act of
intercourse. Those calculations assumed that intercourse and
ovulation are independent. Subsequent research has shown
that this assumption is not valid. We provide here an update
of previous benchmark estimates. Study design: We
reanalyze earlier data from two North Carolina studies that
collected daily urine samples and recorded daily intercourse
for multiple menstrual cycles. One study comprised 68
sexually active women with either an intrauterine device or
tubal ligation. The second was of 221 women who planned to
become pregnant and had discontinued use of any birth
control at enrollment. Participants had no known fertility
problems. New statistical analyses were based on Monte Carlo
simulations and Bayesian methods. Results: The
probability that a single act of intercourse occurs within a
woman's fertile window is 25%, compared with 20% in previous
calculations. The probability of pregnancy with intercourse
on a given menstrual cycle day is correspondingly higher
than previously estimated, with the largest increases
occurring on menstrual days 12-22. These increases are,
however, fairly small (for example, the peak chance of
conception on menstrual day 13 increased from 8.6% to
9.7%). Conclusions: Previous benchmark rates of
pregnancy with one act of intercourse were moderately
underestimated due to a mistaken assumption about the
independence of intercourse and ovulation. Implications
statement: The chance of pregnancy with a single act of
unprotected intercourse is greater than previously
estimated. Previous benchmarks may underestimate the
efficacy of post-coital contraception.},
Doi = {10.1016/j.contraception.2015.01.002},
Key = {fds257830}
}
@article{fds257863,
Author = {Hua, Z and Zhu, H and Dunson, DB},
Title = {Semiparametric Bayes local additive models for longitudinal
data.},
Journal = {Statistics in biosciences},
Volume = {7},
Number = {1},
Pages = {90-107},
Year = {2015},
Month = {May},
ISSN = {1867-1764},
url = {http://dx.doi.org/10.1007/s12561-013-9104-y},
Abstract = {In longitudinal data analysis, there is great interest in
assessing the impact of predictors on the time-varying
trajectory in a response variable. In such settings, an
important issue is to account for heterogeneity in the shape
of the trajectory among subjects, while allowing the impact
of the predictors to vary across subjects. We propose a
flexible semiparametric Bayes approach for addressing this
issue relying on a local partition process prior, which
allows flexible local borrowing of information across
subjects. Local hypothesis testing and credible bands are
developed for the identification of time windows across
which a predictor has a significant impact, while adjusting
for multiple comparisons. Posterior computation proceeds via
an efficient MCMC algorithm using the exact block Gibbs
sampler. The methods are assessed using simulation studies
and applied to a yeast cell-cycle gene expression data
set.},
Doi = {10.1007/s12561-013-9104-y},
Key = {fds257863}
}
@article{fds331654,
Author = {Johndrow, JE and Mattingly, JC and Mukherjee, S and Dunson,
D},
Title = {Optimal approximating Markov chains for Bayesian
inference},
Year = {2015},
Month = {August},
Abstract = {The Markov Chain Monte Carlo method is the dominant paradigm
for posterior computation in Bayesian analysis. It is common
to control computation time by making approximations to the
Markov transition kernel. Comparatively little attention has
been paid to computational optimality in these approximating
Markov Chains, or when such approximations are justified
relative to obtaining shorter paths from the exact kernel.
We give simple, sharp bounds for uniform approximations of
uniformly mixing Markov chains. We then suggest a notion of
optimality that incorporates computation time and
approximation error, and use our bounds to make
generalizations about properties of good approximations in
the uniformly mixing setting. The relevance of these
properties is demonstrated in applications to a
minibatching-based approximate MCMC algorithm for large $n$
logistic regression and low-rank approximations for Gaussian
processes.},
Key = {fds331654}
}
@article{fds322552,
Author = {Guo, F and Dunson, DB},
Title = {Uncovering systematic bias in ratings across categories: A
Bayesian approach},
Journal = {RecSys 2015 - Proceedings of the 9th ACM Conference on
Recommender Systems},
Pages = {317-320},
Year = {2015},
Month = {September},
ISBN = {9781450336925},
url = {http://dx.doi.org/10.1145/2792838.2799683},
Abstract = {Recommender systems are routinely equipped with standardized
taxonomy that associates each item with one or more
categories or genres. Although such information does not
directly imply the quality of an item, the distribution of
ratings varies greatly across categories, e.g. animation
movies may generally receive higher ratings than action
movies. While it is a natural outcome given the diversity
and heterogeneity of both users and items, it makes directly
aggregated ratings, which are commonly used to guide users'
choice by reflecting the overall quality of an item,
incomparable across categories and hence prone to fairness
and diversity issues. This paper aims to uncover and
calibrate systematic category-wise biases for
discrete-valued ratings. We propose a novel Bayesian
multiplicative probit model that treats the inflation or
deflation of mean rating for a combination of categories as
multiplicatively contributed from category-specific
parameters. The posterior distribution of those parameters,
as inferred from data, can capture the bias for all possible
combinations of categories, thus enabling statistically
efficient estimation and principled rating
calibration.},
Doi = {10.1145/2792838.2799683},
Key = {fds322552}
}
@article{fds322550,
Author = {Guhaniyogi, R and Dunson, DB},
Title = {Bayesian Compressed Regression},
Journal = {Journal of the American Statistical Association},
Volume = {110},
Number = {512},
Pages = {1500-1514},
Publisher = {Informa UK Limited},
Year = {2015},
Month = {October},
url = {http://dx.doi.org/10.1080/01621459.2014.969425},
Abstract = {As an alternative to variable selection or shrinkage in
high-dimensional regression, we propose to randomly compress
the predictors prior to analysis. This dramatically reduces
storage and computational bottlenecks, performing well when
the predictors can be projected to a low-dimensional linear
subspace with minimal loss of information about the
response. As opposed to existing Bayesian dimensionality
reduction approaches, the exact posterior distribution
conditional on the compressed data is available
analytically, speeding up computation by many orders of
magnitude while also bypassing robustness issues due to
convergence and mixing problems with MCMC. Model averaging
is used to reduce sensitivity to the random projection
matrix, while accommodating uncertainty in the subspace
dimension. Strong theoretical support is provided for the
approach by showing near parametric convergence rates for
the predictive density in the large p small n asymptotic
paradigm. Practical performance relative to competitors is
illustrated in simulations and real data
applications.},
Doi = {10.1080/01621459.2014.969425},
Key = {fds322550}
}
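%% Editor's note: a minimal sketch of the compression idea above (prior
%% scales, the Gaussian projection law and the flat averaging are
%% assumptions): randomly project the predictors to m << p dimensions, exploit
%% the closed-form conjugate posterior in the compressed space, and average
%% predictions over several projections.
import numpy as np

def compressed_fit_predict(X, y, Xnew, m, n_proj=10, tau2=1.0, sig2=1.0, seed=0):
    rng = np.random.default_rng(seed)
    preds = []
    for _ in range(n_proj):
        Phi = rng.normal(0.0, 1.0 / np.sqrt(m), size=(m, X.shape[1]))  # random projection
        Z, Znew = X @ Phi.T, Xnew @ Phi.T
        V = np.linalg.inv(Z.T @ Z / sig2 + np.eye(m) / tau2)           # posterior covariance
        preds.append(Znew @ (V @ Z.T @ y / sig2))                      # posterior-mean prediction
    return np.mean(preds, axis=0)          # flat average standing in for model averaging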
@article{fds322546,
Author = {Fox, EB and Dunson, DB and Airoldi, EM},
Title = {Bayesian nonparametric covariance regression},
Journal = {Journal of Machine Learning Research},
Volume = {16},
Pages = {2501-2542},
Year = {2015},
Month = {December},
Abstract = {Capturing predictor-dependent correlations amongst the
elements of a multivariate response vector is fundamental to
numerous applied domains, including neuroscience,
epidemiology, and finance. Although there is a rich
literature on methods for allowing the variance in a
univariate regression model to vary with predictors,
relatively little has been done in the multivariate case. As
a motivating example, we consider the Google Flu Trends data
set, which provides indirect measurements of influenza
incidence at a large set of locations over time (our
predictor). To accurately characterize temporally evolving
influenza incidence across regions, it is important to
develop statistical methods for a time-varying covariance
matrix. Importantly, the locations provide a redundant set
of measurements and yield neither a sparse nor a static spatial
dependence structure. We propose to reduce dimensionality
and induce a flexible Bayesian nonparametric covariance
regression model by relating these location-specific
trajectories to a lower-dimensional subspace through a
latent factor model with predictor-dependent factor
loadings. These loadings are in terms of a collection of
basis functions that vary nonparametrically over the
predictor space. Such low-rank approximations are in
contrast to sparse precision assumptions, and are
appropriate in a wide range of applications. Our formulation
aims to address three challenges: scaling to large p
domains, coping with missing values, and allowing an
irregular grid of observations. The model is shown to be
highly flexible, while leading to a computationally feasible
implementation via Gibbs sampling. The ability to scale to
large p domains and cope with missing values is fundamental
in analyzing the Google Flu Trends data.},
Key = {fds322546}
}
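%% Editor's note: a minimal sketch of the predictor-dependent loading form
%% described above (the Gaussian basis family and all dimensions are
%% assumptions): cov(y | x) = Lambda(x) Lambda(x)' + sigma^2 I, with loadings
%% built from basis functions varying over the predictor space.
import numpy as np

def covariance_at(x, W, centers, sig2=0.1, ls=0.5):
    """W (p, k, B): weights; Lambda(x) = sum_b W[:, :, b] * psi_b(x) for Gaussian bases psi_b."""
    psi = np.exp(-0.5 * (x - centers) ** 2 / ls ** 2)       # basis evaluations, (B,)
    Lam = np.tensordot(W, psi, axes=([2], [0]))             # loadings at x, (p, k)
    return Lam @ Lam.T + sig2 * np.eye(W.shape[0])

W = np.random.default_rng(5).normal(0, 0.3, size=(5, 2, 8))
Sigma = covariance_at(0.4, W, centers=np.linspace(0, 1, 8))  # 5 x 5 covariance at x = 0.4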
@article{fds322547,
Author = {Yazdani, A and Dunson, DB},
Title = {A hybrid bayesian approach for genome-wide association
studies on related individuals.},
Journal = {Bioinformatics (Oxford, England)},
Volume = {31},
Number = {24},
Pages = {3890-3896},
Year = {2015},
Month = {December},
url = {http://dx.doi.org/10.1093/bioinformatics/btv496},
Abstract = {Motivation: Both single marker and simultaneous
analysis face challenges in GWAS due to the large number of
markers genotyped for a small number of subjects. This large
p small n problem is particularly challenging when the trait
under investigation has low heritability. Method: In
this article, we propose a two-stage approach that is a
hybrid method of single and simultaneous analysis designed
to improve genomic prediction of complex traits. In the
first stage, we use a Bayesian independent screening method
to select the most promising SNPs. In the second stage, we
rely on a hierarchical model to analyze the joint impact of
the selected markers. The model is designed to take into
account familial dependence in the different subjects, while
using local-global shrinkage priors on the marker
effects. Results: We evaluate the performance in
simulation studies, and consider an application to animal
breeding data. The illustrative data analysis reveals an
encouraging result in terms of prediction performance and
computational cost.},
Doi = {10.1093/bioinformatics/btv496},
Key = {fds322547}
}
@article{fds322548,
Author = {Lock, EF and Dunson, DB},
Title = {Shared kernel Bayesian screening.},
Journal = {Biometrika},
Volume = {102},
Number = {4},
Pages = {829-842},
Year = {2015},
Month = {December},
url = {http://dx.doi.org/10.1093/biomet/asv032},
Abstract = {This article concerns testing for equality of distribution
between groups. We focus on screening variables with shared
distributional features such as common support, modes and
patterns of skewness. We propose a Bayesian testing method
using kernel mixtures, which improves performance by
borrowing information across the different variables and
groups through shared kernels and a common probability of
group differences. The inclusion of shared kernels in a
finite mixture, with Dirichlet priors on the weights, leads
to a simple framework for testing that scales well for
high-dimensional data. We provide closed asymptotic forms
for the posterior probability of equivalence in two groups
and prove consistency under model misspecification. The
method is applied to DNA methylation array data from a
breast cancer study, and compares favourably to competitors
when Type I error is estimated via permutation.},
Doi = {10.1093/biomet/asv032},
Key = {fds322548}
}
@article{fds322549,
Author = {Bhattacharya, A and Pati, D and Pillai, NS and Dunson,
DB},
Title = {Dirichlet-Laplace priors for optimal shrinkage.},
Journal = {Journal of the American Statistical Association},
Volume = {110},
Number = {512},
Pages = {1479-1490},
Year = {2015},
Month = {December},
url = {http://dx.doi.org/10.1080/01621459.2014.960967},
Abstract = {Penalized regression methods, such as $L_1$
regularization, are routinely used in high-dimensional
applications, and there is a rich literature on optimality
properties under sparsity assumptions. In the Bayesian
paradigm, sparsity is routinely induced through
two-component mixture priors having a probability mass at
zero, but such priors encounter daunting computational
problems in high dimensions. This has motivated continuous
shrinkage priors, which can be expressed as global-local
scale mixtures of Gaussians, facilitating computation. In
contrast to the frequentist literature, little is known
about the properties of such priors and the convergence and
concentration of the corresponding posterior distribution.
In this article, we propose a new class of Dirichlet-Laplace
priors, which possess optimal posterior concentration and
lead to efficient posterior computation. Finite sample
performance of Dirichlet-Laplace priors relative to
alternatives is assessed in simulated and real data
examples.},
Doi = {10.1080/01621459.2014.960967},
Key = {fds322549}
}
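%% Editor's note: a minimal sketch of sampling a coefficient vector from a
%% Dirichlet-Laplace prior DL_a, following the published construction (the
%% gamma rate 1/2 and the value of a are assumptions for illustration):
%% theta_j | phi, tau ~ Laplace(phi_j * tau), phi ~ Dir(a, ..., a),
%% tau ~ Gamma(n a, rate 1/2).
import numpy as np

def dirichlet_laplace_draw(n, a=0.1, rng=np.random.default_rng(3)):
    phi = rng.dirichlet(np.full(n, a))             # global allocation of scale
    tau = rng.gamma(shape=n * a, scale=2.0)        # rate 1/2 corresponds to scale 2
    return rng.laplace(loc=0.0, scale=phi * tau)   # near-sparse: few large entries

theta = dirichlet_laplace_draw(1000)               # most entries near zero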
@article{fds328949,
Author = {Chabout, J and Sarkar, A and Patel, SR and Radden, T and Dunson, DB and Fisher, SE and Jarvis, ED},
Title = {A Foxp2 Mutation Implicated in Human Speech Deficits Alters
Sequencing of Ultrasonic Vocalizations in Adult Male
Mice.},
Journal = {Front Behav Neurosci},
Volume = {10},
Pages = {197},
Year = {2016},
url = {http://dx.doi.org/10.3389/fnbeh.2016.00197},
Abstract = {Development of proficient spoken language skills is
disrupted by mutations of the FOXP2 transcription factor. A
heterozygous missense mutation in the KE family causes
speech apraxia, involving difficulty producing words with
complex learned sequences of syllables. Manipulations in
songbirds have helped to elucidate the role of this gene in
vocal learning, but findings in non-human mammals have been
limited or inconclusive. Here, we performed a systematic
study of ultrasonic vocalizations (USVs) of adult male mice
carrying the KE family mutation. Using novel statistical
tools, we found that Foxp2 heterozygous mice did not have
detectable changes in USV syllable acoustic structure, but
produced shorter sequences and did not shift to more complex
syntax in social contexts where wildtype animals did.
Heterozygous mice also displayed a shift in the position of
their rudimentary laryngeal motor cortex (LMC) layer-5
neurons. Our findings indicate that although mouse USVs are
mostly innate, the underlying contributions of FoxP2 to
sequencing of vocalizations are conserved with
humans.},
Doi = {10.3389/fnbeh.2016.00197},
Key = {fds328949}
}
@article{fds322545,
Author = {Wang, X and Dunson, D and Leng, C},
Title = {No penalty no tears: Least squares in high-dimensional
linear models},
Journal = {33rd International Conference on Machine Learning, ICML
2016},
Volume = {4},
Pages = {2685-2706},
Year = {2016},
Month = {January},
ISBN = {9781510829008},
Abstract = {Ordinary least squares (OLS) is the default method for
fitting linear models, but is not applicable for problems
with dimensionality larger than the sample size. For these
problems, we advocate the use of a generalized version of
OLS motivated by ridge regression, and propose two novel
three-step algorithms involving least squares fitting and
hard thresholding. The algorithms are methodologically
simple to understand intuitively, computationally easy to
implement efficiently, and theoretically appealing for
choosing models consistently. Numerical exercises comparing
our methods with penalization-based approaches in
simulations and data analyses illustrate the great potential
of the proposed algorithms.},
Key = {fds322545}
}
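%% A hedged sketch of the three-step flavour described in the abstract
%% (ridge-type least squares, hard thresholding, OLS refit); the paper's
%% actual algorithms, tuning of the ridge parameter, and choice of k differ.
import numpy as np

def ls_threshold_refit(X, y, k, ridge=1.0):
    p = X.shape[1]
    # Step 1: ridge-type generalized least squares, defined even when p > n.
    beta = np.linalg.solve(X.T @ X + ridge * np.eye(p), X.T @ y)
    # Step 2: hard thresholding keeps the k largest coefficients in magnitude.
    keep = np.sort(np.argsort(np.abs(beta))[-k:])
    # Step 3: refit ordinary least squares on the retained variables only.
    sol, *_ = np.linalg.lstsq(X[:, keep], y, rcond=None)
    beta_refit = np.zeros(p)
    beta_refit[keep] = sol
    return beta_refit

rng = np.random.default_rng(1)
X = rng.normal(size=(100, 500))
beta_true = np.zeros(500)
beta_true[:5] = 3.0
y = X @ beta_true + rng.normal(size=100)
print(np.flatnonzero(ls_threshold_refit(X, y, k=5)))  # ideally [0 1 2 3 4]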
@article{fds329117,
Author = {Yang, Y and Dunson, DB},
Title = {Bayesian Conditional Tensor Factorizations for
High-Dimensional Classification.},
Journal = {Journal of the American Statistical Association},
Volume = {111},
Number = {514},
Pages = {656-669},
Publisher = {Informa UK Limited},
Year = {2016},
Month = {January},
url = {http://dx.doi.org/10.1080/01621459.2015.1029129},
Abstract = {In many application areas, data are collected on a
categorical response and high-dimensional categorical
predictors, with the goals being to build a parsimonious
model for classification while doing inferences on the
important predictors. In settings such as genomics, there
can be complex interactions among the predictors. By using a
carefully-structured Tucker factorization, we define a model
that can characterize any conditional probability, while
facilitating variable selection and modeling of higher-order
interactions. Following a Bayesian approach, we propose a
Markov chain Monte Carlo algorithm for posterior computation
accommodating uncertainty in the predictors to be included.
Under near low rank assumptions, the posterior distribution
for the conditional probability is shown to achieve close to
the parametric rate of contraction even in ultra
high-dimensional settings. The methods are illustrated using
simulation examples and biomedical applications.},
Doi = {10.1080/01621459.2015.1029129},
Key = {fds329117}
}
@article{fds327031,
Author = {Wang, X and Dunson, D and Leng, C},
Title = {DECOrrelated feature space partitioning for distributed
sparse regression},
Journal = {Advances in Neural Information Processing
Systems},
Pages = {802-810},
Year = {2016},
Month = {January},
Abstract = {Fitting statistical models is computationally challenging
when the sample size or the dimension of the dataset is
huge. An attractive approach for down-scaling the problem
size is to first partition the dataset into subsets and then
fit using distributed algorithms. The dataset can be
partitioned either horizontally (in the sample space) or
vertically (in the feature space). While the majority of the
literature focuses on sample space partitioning, feature
space partitioning is more effective when p > n. Existing
methods for partitioning features, however, are either
vulnerable to high correlations or inefficient in reducing
the model dimension. In this paper, we solve these problems
through a new embarrassingly parallel framework named DECO
for distributed variable selection and parameter estimation.
In DECO, variables are first partitioned and allocated to m
distributed workers. The decorrelated subset data within
each worker are then fitted via any algorithm designed for
high-dimensional problems. We show that by incorporating the
decorrelation step, DECO can achieve consistent variable
selection and parameter estimation on each subset with
(almost) no assumptions. In addition, the convergence rate
is nearly minimax optimal for both sparse and weakly sparse
models and does NOT depend on the partition number m.
Extensive numerical experiments are provided to illustrate
the performance of the new framework.},
Key = {fds327031}
}
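%% Sketch of DECO's decorrelation step under assumed scaling constants:
%% premultiply (X, y) by an inverse square root of the row Gram matrix, after
%% which feature blocks can be shipped to separate workers and fitted almost
%% independently.
import numpy as np

def deco_decorrelate(X, y, r=1.0):
    n, p = X.shape
    G = X @ X.T / p + r * np.eye(n)    # ridge-regularized row Gram matrix
    w, V = np.linalg.eigh(G)           # symmetric positive definite
    F = V @ np.diag(w ** -0.5) @ V.T   # matrix inverse square root
    return F @ X, F @ y

rng = np.random.default_rng(2)
L = np.linalg.cholesky(0.5 * np.eye(200) + 0.5)  # equicorrelated design
X = rng.normal(size=(50, 200)) @ L.T
y = X[:, 0] + rng.normal(size=50)
Xd, yd = deco_decorrelate(X, y)
off = lambda C: np.abs(C - np.diag(np.diag(C))).mean()
print(off(np.corrcoef(X.T)), off(np.corrcoef(Xd.T)))  # average |corr| drops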
@article{fds344776,
Author = {Wang, Y and Canale, A and Dunson, D},
Title = {Scalable geometric density estimation},
Journal = {Proceedings of the 19th International Conference on
Artificial Intelligence and Statistics, AISTATS
2016},
Pages = {857-865},
Year = {2016},
Month = {January},
Abstract = {It is standard to assume a low-dimensional structure in
estimating a high-dimensional density. However, popular
methods, such as probabilistic principal component analysis,
scale poorly computationally. We introduce a novel empirical
Bayes method that we term geometric density estimation
(GEODE) and show that, with mild conditions and among all
d-dimensional linear subspaces, the span of the d leading
principal axes of the data maximizes the model posterior.
With these axes pre-computed using fast singular value
decomposition, GEODE easily scales to high dimensional
problems while providing uncertainty characterization. The
model is also capable of imputing missing data and
dynamically deleting redundant dimensions. Finally, we
generalize GEODE by mixing it across a dyadic clustering
tree. Both simulation studies and real world data
applications show superior performance of GEODE in terms of
robustness and computational efficiency.},
Key = {fds344776}
}
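%% Sketch of the pre-computation that GEODE relies on: the span of the d
%% leading principal axes obtained via (truncated) SVD. The empirical Bayes
%% model and dyadic-tree extension are not shown; the dimensions are invented.
import numpy as np

def leading_axes(Y, d):
    Yc = Y - Y.mean(axis=0)                       # centre the data
    _, _, Vt = np.linalg.svd(Yc, full_matrices=False)
    return Vt[:d].T                               # D x d subspace basis

rng = np.random.default_rng(3)
latent = rng.normal(size=(500, 3))                # d = 3 true factors
Y = latent @ rng.normal(size=(3, 50)) + 0.1 * rng.normal(size=(500, 50))
W = leading_axes(Y, d=3)
print(W.shape)                                    # (50, 3)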
@article{fds344777,
Author = {Han, S and Liao, X and Dunson, DB and Carin, L},
Title = {Variational Gaussian copula inference},
Journal = {Proceedings of the 19th International Conference on
Artificial Intelligence and Statistics, AISTATS
2016},
Pages = {829-838},
Year = {2016},
Month = {January},
Abstract = {We utilize copulas to constitute a unified framework for
constructing and optimizing variational proposals in
hierarchical Bayesian models. For models with continuous and
non-Gaussian hidden variables, we propose a semiparametric
and automated variational Gaussian copula approach, in which
the parametric Gaussian copula family is able to preserve
multivariate posterior dependence, and the nonparametric
transformations based on Bernstein polynomials provide ample
flexibility in characterizing the univariate marginal
posteriors.},
Key = {fds344777}
}
@article{fds322541,
Author = {Zhou, J and Herring, AH and Bhattacharya, A and Olshan, AF and Dunson,
DB and National Birth Defects Prevention Study},
Title = {Nonparametric Bayes modeling for case control studies with
many predictors.},
Journal = {Biometrics},
Volume = {72},
Number = {1},
Pages = {184-192},
Year = {2016},
Month = {March},
url = {http://dx.doi.org/10.1111/biom.12411},
Abstract = {It is common in biomedical research to run case-control
studies involving high-dimensional predictors, with the main
goal being detection of the sparse subset of predictors
having a significant association with disease. Usual
analyses rely on independent screening, considering each
predictor one at a time, or in some cases on logistic
regression assuming no interactions. We propose a
fundamentally different approach based on a nonparametric
Bayesian low rank tensor factorization model for the
retrospective likelihood. Our model allows a very flexible
structure in characterizing the distribution of multivariate
variables as unknown and without any linear assumptions as
in logistic regression. Predictors are excluded only if they
have no impact on disease risk, either directly or through
interactions with other predictors. Hence, we obtain an
omnibus approach for screening for important predictors.
Computation relies on an efficient Gibbs sampler. The
methods are shown to have high power and low false discovery
rates in simulation studies, and we consider an application
to an epidemiology study of birth defects.},
Doi = {10.1111/biom.12411},
Key = {fds322541}
}
@article{fds322542,
Author = {Tang, K and Dunson, DB and Su, Z and Liu, R and Zhang, J and Dong,
J},
Title = {Subspace segmentation by dense block and sparse
representation.},
Journal = {Neural networks : the official journal of the International
Neural Network Society},
Volume = {75},
Pages = {66-76},
Year = {2016},
Month = {March},
url = {http://dx.doi.org/10.1016/j.neunet.2015.11.011},
Abstract = {Subspace segmentation is a fundamental topic in computer
vision and machine learning. However, the success of many
popular methods is about independent subspace segmentation
instead of the more flexible and realistic disjoint subspace
segmentation. Focusing on the disjoint subspaces, we provide
theoretical and empirical evidence of inferior performance
for popular algorithms such as LRR. To solve these problems,
we propose a novel dense block and sparse representation
(DBSR) for subspace segmentation and provide related
theoretical results. DBSR minimizes a combination of the
ℓ1,1-norm and maximum singular value of the representation
matrix, leading to a combination of dense block and
sparsity. We provide experimental results for synthetic and
benchmark data showing that our method can outperform the
state-of-the-art.},
Doi = {10.1016/j.neunet.2015.11.011},
Key = {fds322542}
}
@article{fds322540,
Author = {Yang, Y and Dunson, DB},
Title = {Bayesian manifold regression},
Journal = {Annals of Statistics},
Volume = {44},
Number = {2},
Pages = {876-905},
Publisher = {Institute of Mathematical Statistics},
Year = {2016},
Month = {April},
url = {http://dx.doi.org/10.1214/15-AOS1390},
Abstract = {There is increasing interest in the problem of nonparametric
regression with high-dimensional predictors. When the number
of predictors D is large, one encounters a daunting problem
in attempting to estimate a D-dimensional surface based on
limited data. Fortunately, in many applications, the support
of the data is concentrated on a d-dimensional subspace with
d ≤ D. Manifold learning attempts to estimate this
subspace. Our focus is on developing computationally
tractable and theoretically supported Bayesian nonparametric
regression methods in this context. When the subspace
corresponds to a locally-Euclidean compact Riemannian
manifold, we show that a Gaussian process regression
approach can be applied that leads to the minimax optimal
adaptive rate in estimating the regression function under
some conditions. The proposed model bypasses the need to
estimate the manifold, and can be implemented using standard
algorithms for posterior computation in Gaussian processes.
Finite sample performance is illustrated in a data analysis
example.},
Doi = {10.1214/15-AOS1390},
Key = {fds322540}
}
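%% A minimal Gaussian process regression sketch, reflecting the point above
%% that standard GP machinery applied to the ambient D-dimensional predictors
%% can adapt to an unknown low-dimensional manifold. The squared-exponential
%% kernel, length-scale, and noise level are illustrative, not the paper's.
import numpy as np

def gp_posterior_mean(X, y, Xstar, ls=0.5, noise=0.1):
    def k(A, B):  # squared-exponential kernel
        d2 = ((A[:, None, :] - B[None, :, :]) ** 2).sum(-1)
        return np.exp(-0.5 * d2 / ls ** 2)
    K = k(X, X) + noise * np.eye(len(X))
    return k(Xstar, X) @ np.linalg.solve(K, y)

rng = np.random.default_rng(4)
theta = rng.uniform(0, 2 * np.pi, 200)     # data on a circle in R^10 (d = 1)
X = np.zeros((200, 10))
X[:, 0], X[:, 1] = np.cos(theta), np.sin(theta)
y = np.sin(2 * theta) + 0.05 * rng.normal(size=200)
print(gp_posterior_mean(X, y, X[:3]).round(2), np.sin(2 * theta[:3]).round(2))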
@article{fds322539,
Author = {Kabisa, S and Dunson, DB and Morris, JS},
Title = {Online Variational Bayes Inference for High-Dimensional
Correlated Data},
Journal = {Journal of Computational and Graphical Statistics},
Volume = {25},
Number = {2},
Pages = {426-444},
Publisher = {Informa UK Limited},
Year = {2016},
Month = {April},
url = {http://dx.doi.org/10.1080/10618600.2014.998336},
Abstract = {High-dimensional data with hundreds of thousands of
observations are becoming commonplace in many disciplines.
The analysis of such data poses many computational
challenges, especially when the observations are correlated
over time and/or across space. In this article, we propose
flexible hierarchical regression models for analyzing such
data that accommodate serial and/or spatial correlation. We
address the computational challenges involved in fitting
these models by adopting an approximate inference framework.
We develop an online variational Bayes algorithm that works
by incrementally reading the data into memory one portion at
a time. The performance of the method is assessed through
simulation studies. The methodology is applied to analyze
signal intensity in MRI images of subjects with knee
osteoarthritis, using data from the Osteoarthritis
Initiative. Supplementary materials for this article are
available online.},
Doi = {10.1080/10618600.2014.998336},
Key = {fds322539}
}
@article{fds329994,
Author = {Ovaskainen, O and Abrego, N and Halme, P and Dunson,
D},
Title = {Using latent variable models to identify large networks of
species-to-species associations at different spatial
scales},
Journal = {Methods in Ecology and Evolution},
Volume = {7},
Number = {5},
Pages = {549-555},
Publisher = {WILEY},
Editor = {Warton, D},
Year = {2016},
Month = {May},
url = {http://dx.doi.org/10.1111/2041-210X.12501},
Abstract = {We present a hierarchical latent variable model that
partitions variation in species occurrences and
co-occurrences simultaneously at multiple spatial scales. We
illustrate how the parameterized model can be used to
predict the occurrences of a species by using as predictors
not only the environmental covariates, but also the
occurrences of all other species, at all spatial scales. We
leverage recent progress in Bayesian latent variable models
to implement a computationally effective algorithm that
enables one to consider large communities and extensive
sampling schemes. We exemplify the framework with a
community of 98 fungal species sampled in c. 22 500 dead
wood units in 230 plots in 29 beech forests. The networks
identified by correlations and partial correlations were
consistent, as were networks for natural and managed
forests, but networks at different spatial scales were
dissimilar. Accounting for the occurrences of the other
species roughly doubled the predictive powers of the models
compared to accounting for environmental covariates
only.},
Doi = {10.1111/2041-210X.12501},
Key = {fds329994}
}
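%% Sketch of the low-rank coefficient structure that the multiway shrinkage
%% prior is placed on: a rank-R PARAFAC tensor coefficient B, giving scalar
%% response y = <X, B> + noise. Dimensions and rank below are invented.
import numpy as np

rng = np.random.default_rng(9)
R, dims = 2, (10, 10, 10)
factors = [rng.normal(size=(d, R)) for d in dims]   # PARAFAC margin factors
B = sum(np.einsum('i,j,k->ijk', *(f[:, r] for f in factors))
        for r in range(R))                          # sum of R outer products
Xt = rng.normal(size=dims)                          # tensor-valued covariate
y = (Xt * B).sum() + rng.normal()                   # scalar response
print(B.shape, round(float(y), 2))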
@article{fds322538,
Author = {Guhaniyogi, R and Dunson, DB},
Title = {Compressed Gaussian process for manifold
regression},
Journal = {Journal of Machine Learning Research},
Volume = {17},
Year = {2016},
Month = {May},
Abstract = {Nonparametric regression for large numbers of features (p)
is an increasingly important problem. If the sample size n
is massive, a common strategy is to partition the feature
space, and then separately apply simple models to each
partition set. This is not ideal when n is modest relative
to p, and we propose an alternative approach relying on
random compression of the feature vector combined with
Gaussian process regression. The proposed approach is
particularly motivated by the setting in which the response
is conditionally independent of the features given the
projection to a low dimensional manifold. Conditionally on
the random compression matrix and a smoothness parameter,
the posterior distribution for the regression surface and
posterior predictive distributions are available
analytically. Running the analysis in parallel for many
random compression matrices and smoothness parameters, model
averaging is used to combine the results. The algorithm can
be implemented rapidly even in very large p and moderately
large n nonparametric regression, has strong theoretical
justification, and is found to yield state of the art
predictive performance.},
Key = {fds322538}
}
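%% Sketch of the random compression step (dimensions hypothetical): features
%% are projected from p to m << p dimensions before GP regression; the paper
%% then model-averages over many such random matrices drawn in parallel.
import numpy as np

rng = np.random.default_rng(5)
p, m = 10_000, 20
Phi = rng.normal(size=(m, p))
Phi /= np.linalg.norm(Phi, axis=1, keepdims=True)  # normalize the rows
X = rng.normal(size=(100, p))                      # toy design, massive p
X_compressed = X @ Phi.T                           # n x m inputs for the GP
print(X_compressed.shape)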
@article{fds322536,
Author = {Kunihama, T and Herring, AH and Halpern, CT and Dunson,
DB},
Title = {Nonparametric Bayes modeling with sample survey
weights.},
Journal = {Statistics & probability letters},
Volume = {113},
Pages = {41-48},
Publisher = {Elsevier BV},
Year = {2016},
Month = {June},
url = {http://dx.doi.org/10.1016/j.spl.2016.02.009},
Abstract = {In population studies, it is standard to sample data via
designs in which the population is divided into strata, with
the different strata assigned different probabilities of
inclusion. Although there have been some proposals for
including sample survey weights into Bayesian analyses,
existing methods require complex models or ignore the
stratified design underlying the survey weights. We propose
a simple approach based on modeling the distribution of the
selected sample as a mixture, with the mixture weights
appropriately adjusted, while accounting for uncertainty in
the adjustment. We focus for simplicity on Dirichlet process
mixtures but the proposed approach can be applied more
broadly. We sketch a simple Markov chain Monte Carlo
algorithm for computation, and assess the approach via
simulations and an application.},
Doi = {10.1016/j.spl.2016.02.009},
Key = {fds322536}
}
@article{fds322537,
Author = {Rao, V and Lin, L and Dunson, DB},
Title = {Data augmentation for models based on rejection
sampling.},
Journal = {Biometrika},
Volume = {103},
Number = {2},
Pages = {319-335},
Year = {2016},
Month = {June},
url = {http://dx.doi.org/10.1093/biomet/asw005},
Abstract = {We present a data augmentation scheme to perform Markov
chain Monte Carlo inference for models where data generation
involves a rejection sampling algorithm. Our idea is a
simple scheme to instantiate the rejected proposals
preceding each data point. The resulting joint probability
over observed and rejected variables can be much simpler
than the marginal distribution over the observed variables,
which often involves intractable integrals. We consider
three problems: modelling flow-cytometry measurements
subject to truncation; the Bayesian analysis of the matrix
Langevin distribution on the Stiefel manifold; and Bayesian
inference for a nonparametric Gaussian process density
model. The latter two are instances of doubly-intractable
Markov chain Monte Carlo problems, where evaluating the
likelihood is intractable. Our experiments demonstrate
superior performance over state-of-the-art sampling
algorithms for such problems.},
Doi = {10.1093/biomet/asw005},
Key = {fds322537}
}
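%% A toy sketch of the augmentation idea above: instantiate the rejected
%% proposals preceding each accepted draw, so the joint law over (rejections,
%% acceptance) stays simple even when the marginal is intractable. The
%% truncated-normal example is illustrative, not the paper's applications.
import numpy as np

def rejection_with_rejects(sample_proposal, accept_prob, rng):
    rejected = []
    while True:
        x = sample_proposal(rng)
        if rng.uniform() < accept_prob(x):
            return x, rejected          # keep the rejected trajectory too
        rejected.append(x)

rng = np.random.default_rng(6)
# Naive rejection sampler for a standard normal truncated to [1, inf).
draw, rejects = rejection_with_rejects(
    lambda r: r.normal(), lambda x: float(x >= 1.0), rng)
print(draw, len(rejects))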
@article{fds329118,
Author = {Canale, A and Dunson, DB},
Title = {Multiscale Bernstein polynomials for densities},
Journal = {Statistica Sinica},
Volume = {26},
Number = {3},
Pages = {1175-1195},
Publisher = {Institute of Statistical Science},
Year = {2016},
Month = {July},
url = {http://dx.doi.org/10.5705/ss.202015.0163},
Abstract = {Our focus is on constructing a multiscale nonparametric
prior for densities. The Bayes density estimation literature
is dominated by single scale methods, with the exception of
Polya trees, which favor overly-spiky densities even when
the truth is smooth. We propose a multiscale Bernstein
polynomial family of priors, which produce smooth
realizations that do not rely on hard partitioning of the
support. At each level in an infinitely-deep binary tree, we
place a beta dictionary density; within a scale the
densities are equivalent to Bernstein polynomials. Using a
stick-breaking characterization, stochastically decreasing
weights are allocated to the finer scale dictionary
elements. A slice sampler is used for posterior computation,
and properties are described. The method characterizes
densities with locally-varying smoothness, and can produce a
sequence of coarse to fine density estimates. An extension
for Bayesian testing of group differences is introduced and
applied to DNA methylation array data.},
Doi = {10.5705/ss.202015.0163},
Key = {fds329118}
}
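%% Sketch of the stick-breaking allocation over scales described above, with
%% stochastically decreasing weights at finer scales; the paper breaks the
%% stick along an infinitely deep binary tree rather than a finite sequence.
import numpy as np

def scale_weights(n_scales, alpha, rng):
    # pi_s = V_s * prod_{r<s}(1 - V_r), V_s ~ Beta(1, alpha); the final
    # break is set to one so the truncated weights sum to exactly one.
    V = rng.beta(1.0, alpha, size=n_scales)
    V[-1] = 1.0
    return V * np.concatenate(([1.0], np.cumprod(1.0 - V[:-1])))

rng = np.random.default_rng(7)
w = scale_weights(n_scales=6, alpha=2.0, rng=rng)
print(np.round(w, 3), w.sum())   # coarse-to-fine weights; sums to 1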
@article{fds329993,
Author = {Hultman, R and Mague, SD and Li, Q and Katz, BM and Michel, N and Lin, L and Wang, J and David, LK and Blount, C and Chandy, R and Carlson, D and Ulrich, K and Carin, L and Dunson, D and Kumar, S and Deisseroth, K and Moore, SD and Dzirasa, K},
Title = {Dysregulation of Prefrontal Cortex-Mediated Slow-Evolving
Limbic Dynamics Drives Stress-Induced Emotional
Pathology.},
Journal = {Neuron},
Volume = {91},
Number = {2},
Pages = {439-452},
Year = {2016},
Month = {July},
url = {http://dx.doi.org/10.1016/j.neuron.2016.05.038},
Abstract = {Circuits distributed across cortico-limbic brain regions
compose the networks that mediate emotional behavior. The
prefrontal cortex (PFC) regulates ultraslow (<1 Hz)
dynamics across these networks, and PFC dysfunction is
implicated in stress-related illnesses including major
depressive disorder (MDD). To uncover the mechanism whereby
stress-induced changes in PFC circuitry alter emotional
networks to yield pathology, we used a multi-disciplinary
approach including in vivo recordings in mice and chronic
social defeat stress. Our network model, inferred using
machine learning, linked stress-induced behavioral pathology
to the capacity of PFC to synchronize amygdala and VTA
activity. Direct stimulation of PFC-amygdala circuitry with
DREADDs normalized PFC-dependent limbic synchrony in
stress-susceptible animals and restored normal behavior. In
addition to providing insights into MDD mechanisms, our
findings demonstrate an interdisciplinary approach that can
be used to identify the large-scale network changes that
underlie complex emotional pathologies and the specific
network nodes that can be used to develop targeted
interventions.},
Doi = {10.1016/j.neuron.2016.05.038},
Key = {fds329993}
}
@article{fds329116,
Author = {Li, D and Heyer, L and Jennings, VH and Smith, CA and Dunson,
DB},
Title = {Personalised estimation of a woman's most fertile
days.},
Journal = {The European journal of contraception & reproductive health
care : the official journal of the European Society of
Contraception},
Volume = {21},
Number = {4},
Pages = {323-328},
Year = {2016},
Month = {August},
url = {http://dx.doi.org/10.1080/13625187.2016.1196485},
Abstract = {OBJECTIVES: We propose a new, personalised approach
of estimating a woman's most fertile days that only requires
recording the first day of menses and can use a smartphone
to convey this information to the user so that she can plan
or prevent pregnancy. METHODS: We performed a
retrospective analysis of two cohort studies (a North
Carolina-based study and the Early Pregnancy Study [EPS])
and a prospective multicentre trial (World Health
Organization [WHO] study). The North Carolina study
consisted of 68 sexually active women with either an
intrauterine device or tubal ligation. The EPS comprised 221
women who planned to become pregnant and had no known
fertility problems. The WHO study consisted of 706 women
from five geographically and culturally diverse settings.
Bayesian statistical methods were used to design our
proposed method, Dynamic Optimal Timing (DOT). Simulation
studies were used to estimate the cumulative pregnancy
risk. RESULTS: For the proposed method, simulation
analyses indicated a 4.4% cumulative probability of
pregnancy over 13 cycles with correct use. After a
calibration window, this method flagged between 11 and 13
days when unprotected intercourse should be avoided per
cycle. Eligible women should have cycle lengths between 20
and 40 days with a variability range less than or equal to 9
days. CONCLUSIONS: DOT can easily be implemented by
computer or smartphone applications, allowing for women to
make more informed decisions about their fertility. This
approach is already incorporated into a patent-pending
system and is available for free download on iPhones and
Androids.},
Doi = {10.1080/13625187.2016.1196485},
Key = {fds329116}
}
@article{fds321837,
Author = {Yin, R and Cornelis, B and Fodor, G and Ocon, N and Dunson, D and Daubechies, I},
Title = {Removing cradle artifacts in X-ray images of
paintings},
Journal = {SIAM Journal on Imaging Sciences},
Volume = {9},
Number = {3},
Pages = {1247-1272},
Publisher = {Society for Industrial & Applied Mathematics
(SIAM)},
Year = {2016},
Month = {August},
url = {http://dx.doi.org/10.1137/15M1053554},
Abstract = {We propose an algorithm that removes the visually unpleasant
effects of cradling in X-ray images of panel paintings, with
the goal of improving the X-ray image readability by art
experts. The algorithm consists of three stages. In the
first stage the location of the cradle is detected
automatically and the grayscale inconsistency, caused by the
thickness of the cradle, is corrected. In a second stage we
use a method called morphological component analysis to
separate the X-ray image into a so-called cartoon part and a
texture part, where the latter contains mostly the wood
grain from both the panel and the cradling. The algorithm
next learns a Bayesian factor model that distinguishes
between the texture patterns that originate from the cradle
and those from other components such as the panel and/or the
painting on the panel surface, and finally uses this to
remove the textures associated with the cradle. We apply the
algorithm to a number of historically important paintings on
panel. We also show how it can be used to digitally remove
stretcher artifacts from X-rays of paintings on canvas. We
compare our results with those obtained manually by best
current practices in art conservation as well as on a ground
truth dataset, consisting of X-ray images of a painting
before and after removal of the physically attached
cradle.},
Doi = {10.1137/15M1053554},
Key = {fds321837}
}
@article{fds329114,
Author = {Zhu, H and Strawn, N and Dunson, DB},
Title = {Bayesian graphical models for multivariate functional
data},
Journal = {Journal of Machine Learning Research},
Volume = {17},
Pages = {1-27},
Year = {2016},
Month = {October},
Abstract = {Graphical models express conditional independence
relationships among variables. Although methods for
vector-valued data are well established, functional data
graphical models remain underdeveloped. By functional data,
we refer to data that are realizations of random functions
varying over a continuum (e.g., images, signals). We
introduce a notion of conditional independence between
random functions, and construct a framework for Bayesian
inference of undirected, decomposable graphs in the
multivariate functional data context. This framework is
based on extending Markov distributions and hyper Markov
laws from random variables to random processes, providing a
principled alternative to naive application of multivariate
methods to discretized functional data. Markov properties
facilitate the composition of likelihoods and priors
according to the decomposition of a graph. Our focus is on
Gaussian process graphical models using orthogonal basis
expansions. We propose a hyper-inverse-Wishart-process prior
for the covariance kernels of the infinite coefficient
sequences of the basis expansion, and establish its
existence and uniqueness. We also prove the strong hyper
Markov property and the conjugacy of this prior under a
finite rank condition of the prior kernel parameter.
Stochastic search Markov chain Monte Carlo algorithms are
developed for posterior inference, assessed through
simulations, and applied to a study of brain activity and
alcoholism.},
Key = {fds329114}
}
@article{fds329115,
Author = {Sarkar, A and Dunson, DB},
Title = {Bayesian Nonparametric Modeling of Higher Order Markov
Chains},
Journal = {Journal of the American Statistical Association},
Volume = {111},
Number = {516},
Pages = {1791-1803},
Publisher = {Informa UK Limited},
Year = {2016},
Month = {October},
url = {http://dx.doi.org/10.1080/01621459.2015.1115763},
Abstract = {We consider the problem of flexible modeling of higher order
Markov chains when an upper bound on the order of the chain
is known but the true order and nature of the serial
dependence are unknown. We propose Bayesian nonparametric
methodology based on conditional tensor factorizations,
which can characterize any transition probability with a
specified maximal order. The methodology selects the
important lags and captures higher order interactions among
the lags, while also facilitating calculation of Bayes
factors for a variety of hypotheses of interest. We design
efficient Markov chain Monte Carlo algorithms for posterior
computation, allowing for uncertainty in the set of
important lags to be included and in the nature and order of
the serial dependence. The methods are illustrated using
simulation experiments and real world applications.
Supplementary materials for this article are available
online.},
Doi = {10.1080/01621459.2015.1115763},
Key = {fds329115}
}
@article{fds329112,
Author = {Bhattacharya, A and Dunson, DB and Pati, D and Pillai,
NS},
Title = {Sub-optimality of some continuous shrinkage
priors},
Journal = {Stochastic Processes and their Applications},
Volume = {126},
Number = {12},
Pages = {3828-3842},
Publisher = {Elsevier BV},
Year = {2016},
Month = {December},
url = {http://dx.doi.org/10.1016/j.spa.2016.08.007},
Abstract = {Two-component mixture priors provide a traditional way to
induce sparsity in high-dimensional Bayes models. However,
several aspects of such a prior, including computational
complexities in high-dimensions, interpretation of exact
zeros and non-sparse posterior summaries under standard loss
functions, have motivated an amazing variety of continuous
shrinkage priors, which can be expressed as global–local
scale mixtures of Gaussians. Interestingly, we demonstrate
that many commonly used shrinkage priors, including the
Bayesian Lasso, do not have adequate posterior concentration
in high-dimensional settings.},
Doi = {10.1016/j.spa.2016.08.007},
Key = {fds329112}
}
@article{fds329113,
Author = {Durante, D and Dunson, DB},
Title = {Locally adaptive dynamic networks},
Journal = {Annals of Applied Statistics},
Volume = {10},
Number = {4},
Pages = {2203-2232},
Publisher = {Institute of Mathematical Statistics},
Year = {2016},
Month = {December},
url = {http://dx.doi.org/10.1214/16-AOAS971},
Abstract = {Our focus is on realistically modeling and forecasting
dynamic networks of face-to-face contacts among individuals.
Important aspects of such data that lead to problems with
current methods include the tendency of the contacts to move
between periods of slow and rapid changes, and the dynamic
heterogeneity in the actors’ connectivity behaviors.
Motivated by this application, we develop a novel method for
Locally Adaptive DYnamic (LADY) network inference. The
proposed model relies on a dynamic latent space
representation in which each actor’s position evolves in
time via stochastic differential equations. Using a
state-space representation for these stochastic processes
and Pólya-gamma data augmentation, we develop an efficient
MCMC algorithm for posterior inference along with tractable
procedures for online updating and forecasting of future
networks. We evaluate performance in simulation studies, and
consider an application to face-to-face contacts among
individuals in a primary school.},
Doi = {10.1214/16-AOAS971},
Key = {fds329113}
}
@article{fds327030,
Author = {Datta, J and Dunson, DB},
Title = {Bayesian inference on quasi-sparse count
data.},
Journal = {Biometrika},
Volume = {103},
Number = {4},
Pages = {971-983},
Year = {2016},
Month = {December},
url = {http://dx.doi.org/10.1093/biomet/asw053},
Abstract = {There is growing interest in analysing high-dimensional
count data, which often exhibit quasi-sparsity corresponding
to an overabundance of zeros and small nonzero counts.
Existing methods for analysing multivariate count data via
Poisson or negative binomial log-linear hierarchical models
with zero-inflation cannot flexibly adapt to quasi-sparse
settings. We develop a new class of continuous local-global
shrinkage priors tailored to quasi-sparse counts.
Theoretical properties are assessed, including flexible
posterior concentration and stronger control of false
discoveries in multiple testing. Simulation studies
demonstrate excellent small-sample properties relative to
competing methods. We use the method to detect rare
mutational hotspots in exome sequencing data and to identify
North American cities most impacted by terrorism.},
Doi = {10.1093/biomet/asw053},
Key = {fds327030}
}
@article{fds325339,
Author = {Johndrow, JE and Bhattacharya, A and Dunson, DB},
Title = {Tensor decompositions and sparse log-linear
models.},
Journal = {Annals of Statistics},
Volume = {45},
Number = {1},
Pages = {1-38},
Year = {2017},
Month = {January},
url = {http://dx.doi.org/10.1214/15-aos1414},
Abstract = {Contingency table analysis routinely relies on log-linear
models, with latent structure analysis providing a common
alternative. Latent structure models lead to a reduced rank
tensor factorization of the probability mass function for
multivariate categorical data, while log-linear models
achieve dimensionality reduction through sparsity. Little is
known about the relationship between these notions of
dimensionality reduction in the two paradigms. We derive
several results relating the support of a log-linear model
to nonnegative ranks of the associated probability tensor.
Motivated by these findings, we propose a new collapsed
Tucker class of tensor decompositions, which bridge existing
PARAFAC and Tucker decompositions, providing a more flexible
framework for parsimoniously characterizing multivariate
categorical data. Taking a Bayesian approach to inference,
we illustrate empirical advantages of the new
decompositions.},
Doi = {10.1214/15-aos1414},
Key = {fds325339}
}
@article{fds326570,
Author = {Lin, L and St Thomas, B and Zhu, H and Dunson, DB},
Title = {Extrinsic local regression on manifold-valued
data.},
Journal = {Journal of the American Statistical Association},
Volume = {112},
Number = {519},
Pages = {1261-1273},
Year = {2017},
Month = {January},
url = {http://dx.doi.org/10.1080/01621459.2016.1208615},
Abstract = {We propose an extrinsic regression framework for modeling
data with manifold valued responses and Euclidean
predictors. Regression with manifold responses has wide
applications in shape analysis, neuroscience, medical
imaging and many other areas. Our approach embeds the
manifold where the responses lie onto a higher dimensional
Euclidean space, obtains a local regression estimate in that
space, and then projects this estimate back onto the image
of the manifold. Outside the regression setting both
intrinsic and extrinsic approaches have been proposed for
modeling i.i.d manifold-valued data. However, to our
knowledge our work is the first to take an extrinsic
approach to the regression problem. The proposed extrinsic
regression framework is general, computationally efficient
and theoretically appealing. Asymptotic distributions and
convergence rates of the extrinsic regression estimates are
derived and a large class of examples are considered
indicating the wide applicability of our
approach.},
Doi = {10.1080/01621459.2016.1208615},
Key = {fds326570}
}
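%% A toy instance of the extrinsic recipe for responses on the unit sphere in
%% R^3: kernel-weighted averaging of the embedded responses followed by
%% projection back onto the manifold by normalization. The paper handles
%% general manifolds and derives asymptotics; bandwidth and data are invented.
import numpy as np

def extrinsic_sphere_regression(X, Ysphere, xstar, h=0.3):
    w = np.exp(-0.5 * ((X - xstar) ** 2).sum(-1) / h ** 2)  # kernel weights
    m = (w[:, None] * Ysphere).sum(0) / w.sum()  # local average in ambient R^3
    return m / np.linalg.norm(m)                 # project back to the sphere

rng = np.random.default_rng(8)
X = rng.uniform(-1, 1, size=(300, 1))            # Euclidean predictor
angle = np.pi * X[:, 0]
Y = np.c_[np.cos(angle), np.sin(angle), np.zeros_like(angle)]
Y += 0.05 * rng.normal(size=Y.shape)
Y /= np.linalg.norm(Y, axis=1, keepdims=True)    # responses on the sphere
print(extrinsic_sphere_regression(X, Y, np.array([0.5])).round(3))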
@article{fds341600,
Author = {Dunson, D and Fryzlewicz, P},
Title = {Report of the editors-2016},
Journal = {Journal of the Royal Statistical Society. Series B:
Statistical Methodology},
Volume = {79},
Number = {1},
Pages = {3-4},
Year = {2017},
Month = {January},
url = {http://dx.doi.org/10.1111/rssb.12220},
Doi = {10.1111/rssb.12220},
Key = {fds341600}
}
@article{fds326219,
Author = {Dunson, DB},
Title = {Toward automated prior choice},
Journal = {Statistical Science},
Volume = {32},
Number = {1},
Pages = {41-43},
Publisher = {Institute of Mathematical Statistics},
Year = {2017},
Month = {February},
url = {http://dx.doi.org/10.1214/16-STS607},
Doi = {10.1214/16-STS607},
Key = {fds326219}
}
@article{fds333226,
Author = {Abrego, N and Dunson, D and Halme, P and Salcedo, I and Ovaskainen,
O},
Title = {Wood-inhabiting fungi with tight associations with other
species have declined as a response to forest
management},
Journal = {Oikos},
Volume = {126},
Number = {2},
Publisher = {WILEY},
Year = {2017},
Month = {February},
url = {http://dx.doi.org/10.1111/oik.03674},
Abstract = {Research on mutualistic and antagonistic networks, such as
plant–pollinator and host–parasite networks, has shown
that species interactions can influence and be influenced by
the responses of species to environmental perturbations.
Here we examine whether results obtained for directly
observable networks generalize to more complex networks in
which species interactions cannot be observed directly. As a
case study, we consider data on the occurrences of 98
wood-inhabiting fungal species in managed and natural
forests. We specifically ask if and how much the positions
of wood-inhabiting fungal species within the interaction
networks influence their responses to forest management. For
this, we utilize a joint species distribution model that
partitions variation in species occurrences among
environmental (i.e. resource availability) and biotic (i.e.
species-to-species associations) predictors. Our results
indicate that in addition to the direct loss of
resource-specialised species, forest management has indirect
effects mediated through interactive associations. In
particular, species with strong associative links to other
species are especially sensitive to forest
management.},
Doi = {10.1111/oik.03674},
Key = {fds333226}
}
@article{fds329111,
Author = {Durante, D and Paganin, S and Scarpa, B and Dunson,
DB},
Title = {Bayesian modelling of networks in complex business
intelligence problems},
Journal = {Journal of the Royal Statistical Society. Series C: Applied
Statistics},
Volume = {66},
Number = {3},
Pages = {555-580},
Publisher = {WILEY},
Year = {2017},
Month = {April},
url = {http://dx.doi.org/10.1111/rssc.12168},
Abstract = {Complex network data problems are increasingly common in
many fields of application. Our motivation is drawn from
strategic marketing studies monitoring customer choices of
specific products, along with co-subscription networks
encoding multiple-purchasing behaviour. Data are available
for several agencies within the same insurance company, and
our goal is to exploit co-subscription networks efficiently
to inform targeted advertising of cross-sell strategies to
currently monoproduct customers. We address this goal by
developing a Bayesian hierarchical model, which clusters
agencies according to common monoproduct customer choices
and co-subscription networks. Within each cluster, we
efficiently model customer behaviour via a cluster-dependent
mixture of latent eigenmodels. This formulation provides key
information on monoproduct customer choices and
multiple-purchasing behaviour within each cluster, informing
targeted cross-sell strategies. We develop simple algorithms
for tractable inference and assess performance in
simulations and an application to business
intelligence.},
Doi = {10.1111/rssc.12168},
Key = {fds329111}
}
@article{fds326037,
Author = {McKinney, M and Moffitt, AB and Gaulard, P and Travert, M and De Leval,
L and Nicolae, A and Raffeld, M and Jaffe, ES and Pittaluga, S and Xi, L and Heavican, T and Iqbal, J and Belhadj, K and Delfau-Larue, MH and Fataccioli, V and Czader, MB and Lossos, IS and Chapman-Fredricks,
JR and Richards, KL and Fedoriw, Y and Ondrejka, SL and Hsi, ED and Low, L and Weisenburger, D and Chan, WC and Mehta-Shah, N and Horwitz, S and Bernal-Mizrachi, L and Flowers, CR and Beaven, AW and Parihar, M and Baseggio, L and Parrens, M and Moreau, A and Sujobert, P and Pilichowska, M and Evens, AM and Chadburn, A and Au-Yeung, RKH and Srivastava, G and Choi, WWL and Goodlad, JR and Aurer, I and Basic-Kinda, S and Gascoyne, RD and Davis, NS and Li, G and Zhang, J and Rajagopalan, D and Reddy, A and Love, C and Levy, S and Zhuang, Y and Datta, J and Dunson, DB and Davé, SS},
Title = {The Genetic Basis of Hepatosplenic T-cell
Lymphoma.},
Journal = {Cancer Discov},
Volume = {7},
Number = {4},
Pages = {369-379},
Year = {2017},
Month = {April},
url = {http://dx.doi.org/10.1158/2159-8290.CD-16-0330},
Abstract = {Hepatosplenic T-cell lymphoma (HSTL) is a rare and lethal
lymphoma; the genetic drivers of this disease are unknown.
Through whole-exome sequencing of 68 HSTLs, we define
recurrently mutated driver genes and copy-number alterations
in the disease. Chromatin-modifying genes, including SETD2,
INO80, and ARID1B, were commonly mutated in HSTL, affecting
62% of cases. HSTLs manifest frequent mutations in STAT5B
(31%), STAT3 (9%), and PIK3CD (9%), for which there
currently exist potential targeted therapies. In addition,
we noted less frequent events in EZH2, KRAS, and TP53. SETD2
was the most frequently silenced gene in HSTL. We
experimentally demonstrated that SETD2 acts as a tumor
suppressor gene. In addition, we found that mutations in
STAT5B and PIK3CD activate critical signaling pathways
important to cell survival in HSTL. Our work thus defines
the genetic landscape of HSTL and implicates gene mutations
linked to HSTL pathogenesis and potential treatment
targets. Significance: We report the first systematic
application of whole-exome sequencing to define the genetic
basis of HSTL, a rare but lethal disease. Our work defines
SETD2 as a tumor suppressor gene in HSTL and implicates
genes including INO80 and PIK3CD in the
disease.},
Doi = {10.1158/2159-8290.CD-16-0330},
Key = {fds326037}
}
@article{fds329992,
Author = {Tikhonov, G and Abrego, N and Dunson, D and Ovaskainen,
O},
Title = {Using joint species distribution models for evaluating how
species-to-species associations depend on the environmental
context},
Journal = {Methods in Ecology and Evolution},
Volume = {8},
Number = {4},
Pages = {443-452},
Publisher = {WILEY},
Editor = {Warton, D},
Year = {2017},
Month = {April},
url = {http://dx.doi.org/10.1111/2041-210X.12723},
Abstract = {Joint species distribution models (JSDM) are increasingly
used to analyse community ecology data. Recent progress with
JSDMs has provided ecologists with new tools for estimating
species associations (residual co-occurrence patterns after
accounting for environmental niches) from large data sets,
as well as for increasing the predictive power of species
distribution models (SDMs) by accounting for such
associations. Yet, one critical limitation of JSDMs
developed thus far is that they assume constant species
associations. However, in real ecological communities, the
direction and strength of interspecific interactions are
likely to be different under different environmental
conditions. In this paper, we overcome the shortcoming of
present JSDMs by allowing species associations to covary with
measured environmental covariates. To estimate
environmental-dependent species associations, we utilize a
latent variable structure, where the factor loadings are
modelled as a linear regression to environmental covariates.
We illustrate the performance of the statistical framework
with both simulated and real data. Our results show that
JSDMs perform substantially better in inferring
environmental-dependent species associations than single
SDMs, especially with sparse data. Furthermore, JSDMs
consistently overperform SDMs in terms of predictive power
for generating predictions that account for
environment-dependent biotic associations. We implemented
the statistical framework as a MATLAB package, which
includes tools both for model parameterization as well as
for post-processing of results, particularly for addressing
whether and how species associations depend on the
environmental conditions. Our statistical framework provides
a new tool for ecologists who wish to investigate from
non-manipulative observational community data the dependency
of interspecific interactions on environmental context. Our
method can be applied to answer the fundamental questions in
community ecology about how species’ interactions shift in
changing environmental conditions, as well as to predict
future changes of species’ interactions in response to
global change.},
Doi = {10.1111/2041-210X.12723},
Key = {fds329992}
}
@article{fds325977,
Author = {Lin, L and Rao, V and Dunson, D},
Title = {Bayesian nonparametric inference on the stiefel
manifold},
Journal = {Statistica Sinica},
Volume = {27},
Number = {2},
Pages = {535-553},
Publisher = {Institute of Statistical Science},
Year = {2017},
Month = {April},
url = {http://dx.doi.org/10.5705/ss.202016.0017},
Abstract = {The Stiefel manifold V_{p,d} is the space of all d × p
orthonormal matrices, with the d-1 hypersphere and the space
of all orthogonal matrices constituting special cases. In
modeling data lying on the Stiefel manifold, parametric
distributions such as the matrix Langevin distribution are
often used; however, model misspecification is a concern and
it is desirable to have nonparametric alternatives. Current
nonparametric methods are mainly Fréchet-mean based. We
take a fully generative nonparametric approach, which relies
on mixing parametric kernels such as the matrix Langevin.
The proposed kernel mixtures can approximate a large class
of distributions on the Stiefel manifold, and we develop
theory showing posterior consistency. While there exists
work developing general posterior consistency results,
extending these results to this particular manifold requires
substantial new theory. Posterior inference is illustrated
on a dataset of near-Earth objects.},
Doi = {10.5705/ss.202016.0017},
Key = {fds325977}
}
@article{fds329991,
Author = {Ovaskainen, O and Tikhonov, G and Norberg, A and Guillaume Blanchet,
F and Duan, L and Dunson, D and Roslin, T and Abrego,
N},
Title = {How to make more out of community data? A conceptual
framework and its implementation as models and
software.},
Journal = {Ecology letters},
Volume = {20},
Number = {5},
Pages = {561-576},
Year = {2017},
Month = {May},
url = {http://dx.doi.org/10.1111/ele.12757},
Abstract = {Community ecology aims to understand what factors determine
the assembly and dynamics of species assemblages at
different spatiotemporal scales. To facilitate the
integration between conceptual and statistical approaches in
community ecology, we propose Hierarchical Modelling of
Species Communities (HMSC) as a general, flexible framework
for modern analysis of community data. While
non-manipulative data allow for only correlative and not
causal inference, this framework facilitates the formulation
of data-driven hypotheses regarding the processes that
structure communities. We model environmental filtering by
variation and covariation in the responses of individual
species to the characteristics of their environment, with
potential contingencies on species traits and phylogenetic
relationships. We capture biotic assembly rules by
species-to-species association matrices, which may be
estimated at multiple spatial or temporal scales. We
operationalise the HMSC framework as a hierarchical Bayesian
joint species distribution model, and implement it as R- and
Matlab-packages which enable computationally efficient
analyses of large data sets. Armed with this tool, community
ecologists can make sense of many types of data, including
spatially explicit data and time-series data. We illustrate
the use of this framework through a series of diverse
ecological examples.},
Doi = {10.1111/ele.12757},
Key = {fds329991}
}
@article{fds329990,
Author = {Ovaskainen, O and Tikhonov, G and Dunson, D and Grøtan, V and Engen, S and Sæther, B-E and Abrego, N},
Title = {How are species interactions structured in species-rich
communities? A new method for analysing time-series
data.},
Journal = {Proceedings. Biological sciences},
Volume = {284},
Number = {1855},
Pages = {20170768},
Year = {2017},
Month = {May},
url = {http://dx.doi.org/10.1098/rspb.2017.0768},
Abstract = {Estimation of intra- and interspecific interactions from
time-series on species-rich communities is challenging due
to the high number of potentially interacting species pairs.
The previously proposed sparse interactions model overcomes
this challenge by assuming that most species pairs do not
interact. We propose an alternative model that does not
assume that any of the interactions are necessarily zero,
but summarizes the influences of individual species by a
small number of community-level drivers. The community-level
drivers are defined as linear combinations of species
abundances, and they may thus represent e.g. the total
abundance of all species or the relative proportions of
different functional groups. We show with simulated and real
data how our approach can be used to compare different
hypotheses on community structure. In an empirical example
using aquatic microorganisms, the community-level drivers
model clearly outperformed the sparse interactions model in
predicting independent validation data.},
Doi = {10.1098/rspb.2017.0768},
Key = {fds329990}
}
@article{fds327282,
Author = {Moffitt, AB and Ondrejka, SL and McKinney, M and Rempel, RE and Goodlad,
JR and Teh, CH and Leppa, S and Mannisto, S and Kovanen, PE and Tse, E and Au-Yeung, RKH and Kwong, Y-L and Srivastava, G and Iqbal, J and Yu, J and Naresh, K and Villa, D and Gascoyne, RD and Said, J and Czader, MB and Chadburn, A and Richards, KL and Rajagopalan, D and Davis, NS and Smith,
EC and Palus, BC and Tzeng, TJ and Healy, JA and Lugar, PL and Datta, J and Love, C and Levy, S and Dunson, DB and Zhuang, Y and Hsi, ED and Dave,
SS},
Title = {Enteropathy-associated T cell lymphoma subtypes are
characterized by loss of function of SETD2.},
Journal = {J Exp Med},
Volume = {214},
Number = {5},
Pages = {1371-1386},
Year = {2017},
Month = {May},
url = {http://dx.doi.org/10.1084/jem.20160894},
Abstract = {Enteropathy-associated T cell lymphoma (EATL) is a lethal,
and the most common, neoplastic complication of celiac
disease. Here, we defined the genetic landscape of EATL
through whole-exome sequencing of 69 EATL tumors. SETD2 was
the most frequently silenced gene in EATL (32% of cases).
The JAK-STAT pathway was the most frequently mutated
pathway, with frequent mutations in STAT5B as well as JAK1,
JAK3, STAT3, and SOCS1. We also identified mutations in KRAS,
TP53, and TERT. Type I EATL and type II EATL (monomorphic
epitheliotropic intestinal T cell lymphoma) had highly
overlapping genetic alterations indicating shared mechanisms
underlying their pathogenesis. We modeled the effects of
SETD2 loss in vivo by developing a T cell-specific knockout
mouse. These mice manifested an expansion of γδ T cells,
indicating novel roles for SETD2 in T cell development and
lymphomagenesis. Our data render the most comprehensive
genetic portrait yet of this uncommon but lethal disease and
may inform future classification schemes.},
Doi = {10.1084/jem.20160894},
Key = {fds327282}
}
@article{fds343492,
Author = {Rao, V and Adams, RP and Dunson, DB},
Title = {Bayesian inference for Matérn repulsive
processes},
Journal = {Journal of the Royal Statistical Society. Series B:
Statistical Methodology},
Volume = {79},
Number = {3},
Pages = {877-897},
Year = {2017},
Month = {June},
url = {http://dx.doi.org/10.1111/rssb.12198},
Abstract = {In many applications involving point pattern data, the
Poisson process assumption is unrealistic, with the data
exhibiting a more regular spread. Such repulsion between
events is exhibited by trees for example, because of
competition for light and nutrients. Other examples include
the locations of biological cells and cities, and the times
of neuronal spikes. Given the many applications of repulsive
point processes, there is a surprisingly limited literature
developing flexible, realistic and interpretable models, as
well as efficient inferential methods. We address this gap
by developing a modelling framework around the Matérn type
III repulsive process. We consider some extensions of the
original Matérn type III process for both the homogeneous
and the inhomogeneous cases. We also derive the probability
density of this generalized Matérn process, allowing us to
characterize the conditional distribution of the various
latent variables, and leading to a novel and efficient
Markov chain Monte Carlo algorithm. We apply our ideas to
data sets of spatial locations of trees, nerve fibre cells
and Greyhound bus stations.},
Doi = {10.1111/rssb.12198},
Key = {fds343492}
}
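%% Forward-simulation sketch of a homogeneous Matérn type III process on the
%% unit square (illustrative only; the paper's contribution is the density and
%% MCMC inference, not this simulation): primary Poisson points arrive in a
%% random birth order, and a point survives only if no older surviving point
%% lies within the repulsion radius.
import numpy as np

def matern_iii(rate, radius, rng, box=1.0):
    n = rng.poisson(rate * box * box)          # primary Poisson points
    pts = rng.uniform(0.0, box, size=(n, 2))
    kept = []
    for i in rng.permutation(n):               # random birth order
        if all(np.hypot(*(pts[i] - pts[j])) > radius for j in kept):
            kept.append(i)
    return pts[kept]

rng = np.random.default_rng(10)
sample = matern_iii(rate=200, radius=0.05, rng=rng)
print(len(sample))   # fewer than the ~200 primary points, due to repulsion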
@article{fds326919,
Author = {Schaich Borg, J and Srivastava, S and Lin, L and Heffner, J and Dunson,
D and Dzirasa, K and de Lecea, L},
Title = {Rat intersubjective decisions are encoded by
frequency-specific oscillatory contexts.},
Journal = {Brain Behav},
Volume = {7},
Number = {6},
Pages = {e00710},
Year = {2017},
Month = {June},
url = {http://dx.doi.org/10.1002/brb3.710},
Abstract = {INTRODUCTION: It is unknown how the brain coordinates
decisions to withstand personal costs in order to prevent
other individuals' distress. Here we test whether local
field potential (LFP) oscillations between brain regions
create "neural contexts" that select specific brain
functions and encode the outcomes of these types of
intersubjective decisions. METHODS: Rats participated in an
"Intersubjective Avoidance Test" (IAT) that tested rats'
willingness to enter an innately aversive chamber to prevent
another rat from getting shocked. c-Fos immunoreactivity was
used to screen for brain regions involved in IAT
performance. Multi-site local field potential (LFP)
recordings were collected simultaneously and bilaterally
from five brain regions implicated in the c-Fos studies
while rats made decisions in the IAT. Local field potential
recordings were analyzed using an elastic net penalized
regression framework. RESULTS: Rats voluntarily entered an
innately aversive chamber to prevent another rat from
getting shocked, and c-Fos immunoreactivity in brain regions
known to be involved in human empathy-including the anterior
cingulate, insula, orbital frontal cortex, and
amygdala-correlated with the magnitude of "intersubjective
avoidance" each rat displayed. Local field potential
recordings revealed that optimal accounts of rats'
performance in the task require specific frequencies of LFP
oscillations between brain regions in addition to specific
frequencies of LFP oscillations within brain regions. Alpha
and low gamma coherence between spatially distributed brain
regions predicts more intersubjective avoidance, while theta
and high gamma coherence between a separate subset of brain
regions predicts less intersubjective avoidance. Phase
relationship analyses indicated that choice-relevant
coherence in the alpha range reflects information passed
from the amygdala to cortical structures, while coherence in
the theta range reflects information passed in the reverse
direction. CONCLUSION: These results indicate that the
frequency-specific "neural context" surrounding brain
regions involved in social cognition encodes outcomes of
decisions that affect others, above and beyond signals from
any set of brain regions in isolation.},
Doi = {10.1002/brb3.710},
Key = {fds326919}
}
@article{fds327028,
Author = {Zhu, B and Dunson, DB},
Title = {Bayesian functional data modeling for heterogeneous
volatility},
Journal = {Bayesian Analysis},
Volume = {12},
Number = {2},
Pages = {335-350},
Publisher = {Institute of Mathematical Statistics},
Year = {2017},
Month = {June},
url = {http://dx.doi.org/10.1214/16-BA1004},
Abstract = {Although there are many methods for functional data
analysis, less emphasis is put on characterizing variability
among volatilities of individual functions. In particular,
certain individuals exhibit erratic swings in their
trajectory while other individuals have more stable
trajectories. There is evidence of such volatility
heterogeneity in blood pressure trajectories during
pregnancy, for example, and reason to suspect that
volatility is a biologically important feature. Most
functional data analysis models implicitly assume similar or
identical smoothness of the individual functions, and hence
can lead to misleading inferences on volatility and an
inadequate representation of the functions. We propose a
novel class of functional data analysis models characterized
using hierarchical stochastic differential equations. We
model the derivatives of a mean function and deviation
functions using Gaussian processes, while also allowing
covariate dependence including on the volatilities of the
deviation functions. Following a Bayesian approach to
inference, a Markov chain Monte Carlo algorithm is used for
posterior computation. The methods are tested on simulated
data and applied to blood pressure trajectories during
pregnancy.},
Doi = {10.1214/16-BA1004},
Key = {fds327028}
}
@article{fds327029,
Author = {Wang, L and Durante, D and Jung, RE and Dunson, DB},
Title = {Bayesian network-response regression.},
Journal = {Bioinformatics (Oxford, England)},
Volume = {33},
Number = {12},
Pages = {1859-1866},
Year = {2017},
Month = {June},
url = {http://dx.doi.org/10.1093/bioinformatics/btx050},
Abstract = {MOTIVATION: There is increasing interest in learning
how human brain networks vary as a function of a continuous
trait, but flexible and efficient procedures to accomplish
this goal are limited. We develop a Bayesian semiparametric
model, which combines low-rank factorizations and flexible
Gaussian process priors to learn changes in the conditional
expectation of a network-valued random variable across the
values of a continuous predictor, while including
subject-specific random effects.<h4>Results</h4>The
formulation leads to a general framework for inference on
changes in brain network structures across human traits,
facilitating borrowing of information and coherently
characterizing uncertainty. We provide an efficient Gibbs
sampler for posterior computation along with simple
procedures for inference, prediction and goodness-of-fit
assessments. The model is applied to learn how human brain
networks vary across individuals with different intelligence
scores. Results provide interesting insights on the
association between intelligence and brain connectivity,
while demonstrating good predictive performance.<h4>Availability
and implementation</h4>Source code implemented in R and data
are available at https://github.com/wangronglu/BNRR.<h4>Contact</h4>rl.wang@duke.edu.<h4>Supplementary
information</h4>Supplementary data are available at
Bioinformatics online.},
Doi = {10.1093/bioinformatics/btx050},
Key = {fds327029}
}
@article{fds329353,
Author = {Guhaniyogi, R and Qamar, S and Dunson, DB},
Title = {Bayesian tensor regression},
Journal = {Journal of Machine Learning Research},
Volume = {18},
Pages = {1-31},
Year = {2017},
Month = {August},
Abstract = {We propose a Bayesian approach to regression with a scalar
response on vector and tensor covariates. Vectorization of
the tensor prior to analysis fails to exploit the structure,
often leading to poor estimation and predictive performance.
We introduce a novel class of multiway shrinkage priors for
tensor coefficients in the regression setting and present
posterior consistency results under mild conditions. A
computationally efficient Markov chain Monte Carlo algorithm
is developed for posterior computation. Simulation studies
illustrate substantial gains over existing tensor regression
methods in terms of estimation and parameter inference. Our
approach is further illustrated in a neuroimaging
application.},
Key = {fds329353}
}
@article{fds329109,
Author = {Li, C and Srivastava, S and Dunson, DB},
Title = {Simple, scalable and accurate posterior interval
estimation},
Journal = {Biometrika},
Volume = {104},
Number = {3},
Pages = {665-680},
Publisher = {Oxford University Press (OUP)},
Year = {2017},
Month = {September},
url = {http://dx.doi.org/10.1093/biomet/asx033},
Abstract = {Standard posterior sampling algorithms, such as Markov chain
Monte Carlo procedures, face major challenges in scaling up
to massive datasets. We propose a simple and general
posterior interval estimation algorithm to rapidly and
accurately estimate quantiles of the posterior distributions
for one-dimensional functionals. Our algorithm runs Markov
chain Monte Carlo in parallel for subsets of the data, and
then averages quantiles estimated from each subset. We
provide strong theoretical guarantees and show that the
credible intervals from our algorithm asymptotically
approximate those from the full posterior in the leading
parametric order. Our algorithm has a better balance of
accuracy and efficiency than its competitors across a
variety of simulations and a real-data example.},
Doi = {10.1093/biomet/asx033},
Key = {fds329109}
}
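%% The quantile-averaging algorithm described above is compact enough to
%% sketch. A minimal Python illustration for a toy normal-mean model follows;
%% the conjugate subset sampler and all names are illustrative assumptions,
%% not the paper's code:
%%
%% import numpy as np
%%
%% rng = np.random.default_rng(0)
%% y = rng.normal(1.0, 1.0, size=100_000)         # full data, unit variance
%% k = 20
%% subsets = np.array_split(y, k)
%%
%% def subset_posterior_draws(ys, n_full, m=2000):
%%     # With a flat prior and known unit variance, raising the subset
%%     # likelihood to the power k (the usual stochastic approximation)
%%     # gives a N(mean(ys), 1/n_full) subset posterior for the mean.
%%     return rng.normal(ys.mean(), np.sqrt(1.0 / n_full), size=m)
%%
%% # Average the 2.5% and 97.5% quantiles across subsets to approximate
%% # the full-data 95% credible interval.
%% qs = np.array([np.quantile(subset_posterior_draws(s, y.size), [0.025, 0.975])
%%                for s in subsets])
%% print("approximate 95% interval:", qs.mean(axis=0))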
@article{fds323700,
Author = {Lock, EF and Dunson, DB},
Title = {Bayesian genome- and epigenome-wide association studies with
gene level dependence.},
Journal = {Biometrics},
Volume = {73},
Number = {3},
Pages = {1018-1028},
Year = {2017},
Month = {September},
url = {http://dx.doi.org/10.1111/biom.12649},
Abstract = {High-throughput genetic and epigenetic data are often
screened for associations with an observed phenotype. For
example, one may wish to test hundreds of thousands of
genetic variants, or DNA methylation sites, for an
association with disease status. These genomic variables can
naturally be grouped by the gene they encode, among other
criteria. However, standard practice in such applications is
independent screening with a universal correction for
multiplicity. We propose a Bayesian approach in which the
prior probability of an association for a given genomic
variable depends on its gene, and the gene-specific
probabilities are modeled nonparametrically. This
hierarchical model allows for appropriate gene and
genome-wide multiplicity adjustments, and can be
incorporated into a variety of Bayesian association
screening methodologies with negligible increase in
computational complexity. We describe an application to
screening for differences in DNA methylation between lower
grade glioma and glioblastoma multiforme tumor samples from
The Cancer Genome Atlas. Software is available via the
package BayesianScreening for R: github.com/lockEF/BayesianScreening.},
Doi = {10.1111/biom.12649},
Key = {fds323700}
}
@article{fds329110,
Author = {Srivastava, S and Engelhardt, BE and Dunson, DB},
Title = {Expandable factor analysis.},
Journal = {Biometrika},
Volume = {104},
Number = {3},
Pages = {649-663},
Year = {2017},
Month = {September},
url = {http://dx.doi.org/10.1093/biomet/asx030},
Abstract = {Bayesian sparse factor models have proven useful for
characterizing dependence in multivariate data, but scaling
computation to large numbers of samples and dimensions is
problematic. We propose expandable factor analysis for
scalable inference in factor models when the number of
factors is unknown. The method relies on a continuous
shrinkage prior for efficient maximum a posteriori
estimation of a low-rank and sparse loadings matrix. The
structure of the prior leads to an estimation algorithm that
accommodates uncertainty in the number of factors. We
propose an information criterion to select the
hyperparameters of the prior. Expandable factor analysis has
better false discovery rates and true positive rates than
its competitors across diverse simulation settings. We apply
the proposed approach to a gene expression study of ageing
in mice, demonstrating superior results relative to four
competing methods.},
Doi = {10.1093/biomet/asx030},
Key = {fds329110}
}
@article{fds332379,
Author = {Durante, D and Dunson, DB and Vogelstein, JT},
Title = {Rejoinder: Nonparametric Bayes Modeling of Populations of
Networks},
Journal = {Journal of the American Statistical Association},
Volume = {112},
Number = {520},
Pages = {1547-1552},
Publisher = {Informa UK Limited},
Year = {2017},
Month = {October},
url = {http://dx.doi.org/10.1080/01621459.2017.1395643},
Doi = {10.1080/01621459.2017.1395643},
Key = {fds332379}
}
@article{fds327388,
Author = {Durante, D and Dunson, DB and Vogelstein, JT},
Title = {Nonparametric Bayes Modeling of Populations of
Networks},
Journal = {Journal of the American Statistical Association},
Volume = {112},
Number = {520},
Pages = {1516-1530},
Publisher = {Informa UK Limited},
Year = {2017},
Month = {October},
url = {http://dx.doi.org/10.1080/01621459.2016.1219260},
Abstract = {Replicated network data are increasingly available in many
research fields. For example, in connectomic applications,
interconnections among brain regions are collected for each
patient under study, motivating statistical models which can
flexibly characterize the probabilistic generative mechanism
underlying these network-valued data. Available models for a
single network are not designed specifically for inference
on the entire probability mass function of a network-valued
random variable and therefore lack flexibility in
characterizing the distribution of relevant topological
structures. We propose a flexible Bayesian nonparametric
approach for modeling the population distribution of
network-valued data. The joint distribution of the edges is
defined via a mixture model that reduces dimensionality and
efficiently incorporates network information within each
mixture component by leveraging latent space
representations. The formulation leads to an efficient Gibbs
sampler and provides simple and coherent strategies for
inference and goodness-of-fit assessments. We provide
theoretical results on the flexibility of our model and
illustrate improved performance—compared to
state-of-the-art models—in simulations and application to
human brain networks. Supplementary materials for this
article are available online.},
Doi = {10.1080/01621459.2016.1219260},
Key = {fds327388}
}
@article{fds329352,
Author = {Reddy, A and Zhang, J and Davis, NS and Moffitt, AB and Love, CL and Waldrop, A and Leppa, S and Pasanen, A and Meriranta, L and Karjalainen-Lindsberg, M-L and Nørgaard, P and Pedersen, M and Gang,
AO and Høgdall, E and Heavican, TB and Lone, W and Iqbal, J and Qin, Q and Li, G and Kim, SY and Healy, J and Richards, KL and Fedoriw, Y and Bernal-Mizrachi, L and Koff, JL and Staton, AD and Flowers, CR and Paltiel, O and Goldschmidt, N and Calaminici, M and Clear, A and Gribben, J and Nguyen, E and Czader, MB and Ondrejka, SL and Collie, A and Hsi, ED and Tse, E and Au-Yeung, RKH and Kwong, Y-L and Srivastava, G and Choi, WWL and Evens, AM and Pilichowska, M and Sengar, M and Reddy, N and Li, S and Chadburn, A and Gordon, LI and Jaffe, ES and Levy, S and Rempel,
R and Tzeng, T and Happ, LE and Dave, T and Rajagopalan, D and Datta, J and Dunson, DB and Dave, SS},
Title = {Genetic and Functional Drivers of Diffuse Large B Cell
Lymphoma.},
Journal = {Cell},
Volume = {171},
Number = {2},
Pages = {481-494.e15},
Year = {2017},
Month = {October},
url = {http://dx.doi.org/10.1016/j.cell.2017.09.027},
Abstract = {Diffuse large B cell lymphoma (DLBCL) is the most common
form of blood cancer and is characterized by a striking
degree of genetic and clinical heterogeneity. This
heterogeneity poses a major barrier to understanding the
genetic basis of the disease and its response to therapy.
Here, we performed an integrative analysis of whole-exome
sequencing and transcriptome sequencing in a cohort of 1,001
DLBCL patients to comprehensively define the landscape of
150 genetic drivers of the disease. We characterized the
functional impact of these genes using an unbiased CRISPR
screen of DLBCL cell lines to define oncogenes that promote
cell growth. A prognostic model comprising these genetic
alterations outperformed current established methods: cell
of origin, the International Prognostic Index comprising
clinical variables, and dual MYC and BCL2 expression. These
results comprehensively define the genetic drivers and their
functional roles in DLBCL to identify new therapeutic
opportunities in the disease.},
Doi = {10.1016/j.cell.2017.09.027},
Key = {fds329352}
}
@article{fds332886,
Author = {Shang, Y and Dunson, D and Song, JS},
Title = {Exploiting big data in logistics risk assessment via
Bayesian nonparametrics},
Journal = {Operations Research},
Volume = {65},
Number = {6},
Pages = {1574-1588},
Publisher = {Institute for Operations Research and the Management
Sciences (INFORMS)},
Year = {2017},
Month = {November},
url = {http://dx.doi.org/10.1287/opre.2017.1612},
Abstract = {In cargo logistics, a key performance measure is transport
risk, defined as the deviation of the actual arrival time
from the planned arrival time. Neither earliness nor
tardiness is desirable for customers and freight forwarders.
In this paper, we investigate ways to assess and forecast
transport risks using a half-year of air cargo data,
provided by a leading forwarder on 1,336 routes served by 20
airlines. Interestingly, our preliminary data analysis shows
a strong multimodal feature in the transport risks, driven
by unobserved events, such as cargo missing flights. To
accommodate this feature, we introduce a Bayesian
nonparametric model-the probit stick-breaking process
mixture model-for flexible estimation of the conditional
(i.e., state-dependent) density function of transport risk.
We demonstrate that using alternative methods can lead to
misleading inferences. Our model provides a tool for the
forwarder to offer customized price and service quotes. It
can also generate baseline airline performance to enable
fair supplier evaluation. Furthermore, the method allows us
to separate recurrent risks from disruption risks. This is
important, because hedging strategies for these two kinds of
risks are often drastically different.},
Doi = {10.1287/opre.2017.1612},
Key = {fds332886}
}
@article{fds332378,
Author = {Minsker, S and Srivastava, S and Lin, L and Dunson,
DB},
Title = {Robust and scalable bayes via a median of subset posterior
measures},
Journal = {Journal of Machine Learning Research},
Volume = {18},
Pages = {1-40},
Year = {2017},
Month = {December},
Abstract = {We propose a novel approach to Bayesian analysis that is
provably robust to outliers in the data and often has
computational advantages over standard methods. Our
technique is based on splitting the data into
non-overlapping subgroups, evaluating the posterior
distribution given each independent subgroup, and then
combining the resulting measures. The main novelty of our
approach is the proposed aggregation step, which is based on
the evaluation of a median in the space of probability
measures equipped with a suitable collection of distances
that can be quickly and efficiently evaluated in practice.
We present both theoretical and numerical evidence
illustrating the improvements achieved by our
method.},
Key = {fds332378}
}
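%% The aggregation step above evaluates a median in a space of probability
%% measures; a heavily simplified finite-dimensional stand-in is the
%% geometric median of subset posterior means, computable by Weiszfeld's
%% algorithm. A sketch under that simplifying assumption (not the paper's
%% actual metric on measures):
%%
%% import numpy as np
%%
%% def weiszfeld(points, n_iter=100, eps=1e-9):
%%     # Geometric median of the rows of `points`.
%%     x = points.mean(axis=0)
%%     for _ in range(n_iter):
%%         d = np.maximum(np.linalg.norm(points - x, axis=1), eps)
%%         w = 1.0 / d
%%         x = (w[:, None] * points).sum(axis=0) / w.sum()
%%     return x
%%
%% # Subset posterior means for a 2-d parameter; one subset is corrupted.
%% means = np.array([[1.0, 2.0], [1.1, 1.9], [0.9, 2.1], [8.0, -5.0]])
%% print("mean   :", means.mean(axis=0))   # dragged toward the outlier
%% print("median :", weiszfeld(means))     # robust to the corrupted subset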
@article{fds332363,
Author = {Wheeler, MW and Dunson, DB and Herring, AH},
Title = {Bayesian local extremum splines},
Journal = {Biometrika},
Volume = {104},
Number = {4},
Pages = {939-952},
Publisher = {Oxford University Press (OUP)},
Year = {2017},
Month = {December},
url = {http://dx.doi.org/10.1093/biomet/asx039},
Abstract = {We consider shape-restricted nonparametric regression on a
closed set $$\mathcal{X} \subset \mathbb{R},$$ where it is
reasonable to assume that the function has no more than
$$H$$ local extrema interior to $$\mathcal{X}$$. Following a
Bayesian approach we develop a nonparametric prior over a
novel class of local extremum splines. This approach is
shown to be consistent when modelling any continuously
differentiable function within the class considered, and we
use it to develop methods for testing hypotheses on the shape
of the curve. Sampling algorithms are developed, and the
method is applied in simulation studies and data examples
where the shape of the curve is of interest.},
Doi = {10.1093/biomet/asx039},
Key = {fds332363}
}
@article{fds335796,
Author = {Bertrán, MA and Martínez, NL and Wang, Y and Dunson, D and Sapiro, G and Ringach, D},
Title = {Active learning of cortical connectivity from two-photon
imaging data.},
Journal = {PloS one},
Volume = {13},
Number = {5},
Pages = {e0196527},
Year = {2018},
Month = {January},
url = {http://dx.doi.org/10.1371/journal.pone.0196527},
Abstract = {Understanding how groups of neurons interact within a
network is a fundamental question in system neuroscience.
Instead of passively observing the ongoing activity of a
network, we can typically perturb its activity, either by
external sensory stimulation or directly via techniques such
as two-photon optogenetics. A natural question is how to use
such perturbations to identify the connectivity of the
network efficiently. Here we introduce a method to infer
sparse connectivity graphs from in-vivo, two-photon imaging
of population activity in response to external stimuli. A
novel aspect of the work is the introduction of a
recommended distribution, incrementally learned from the
data, to optimally refine the inferred network. Unlike
existing system identification techniques, this "active
learning" method automatically focuses its attention on key
undiscovered areas of the network, instead of targeting
global uncertainty indicators like parameter variance. We
show how active learning leads to faster inference while, at
the same time, provides confidence intervals for the network
parameters. We present simulations on artificial small-world
networks to validate the methods and apply the method to
real data. Analysis of the frequency of recovered motifs shows
that cortical networks are consistent with a small-world
topology model.},
Doi = {10.1371/journal.pone.0196527},
Key = {fds335796}
}
@article{fds340937,
Author = {Zhao, S and Engelhardt, BE and Mukherjee, S and Dunson,
DB},
Title = {Fast Moment Estimation for Generalized Latent Dirichlet
Models.},
Journal = {Journal of the American Statistical Association},
Volume = {113},
Number = {524},
Pages = {1528-1540},
Year = {2018},
Month = {January},
url = {http://dx.doi.org/10.1080/01621459.2017.1341839},
Abstract = {We develop a generalized method of moments (GMM) approach
for fast parameter estimation in a new class of Dirichlet
latent variable models with mixed data types. Parameter
estimation via GMM has computational and statistical
advantages over alternative methods, such as expectation
maximization, variational inference, and Markov chain Monte
Carlo. A key computational advantage of our method, Moment
Estimation for latent Dirichlet models (MELD), is that
parameter estimation does not require instantiation of the
latent variables. Moreover, performance is agnostic to
distributional assumptions of the observations. We derive
population moment conditions after marginalizing out the
sample-specific Dirichlet latent variables. The moment
conditions only depend on component mean parameters. We
illustrate the utility of our approach on simulated data,
comparing results from MELD to alternative methods, and we
show the promise of our approach through the application to
several datasets. Supplementary materials for this article
are available online.},
Doi = {10.1080/01621459.2017.1341839},
Key = {fds340937}
}
@article{fds340385,
Author = {Durante, D and Dunson, DB},
Title = {Bayesian inference and testing of group differences in brain
networks},
Journal = {Bayesian Analysis},
Volume = {13},
Number = {1},
Pages = {29-58},
Publisher = {Institute of Mathematical Statistics},
Year = {2018},
Month = {January},
url = {http://dx.doi.org/10.1214/16-BA1030},
Abstract = {Network data are increasingly collected along with other
variables of interest. Our motivation is drawn from
neurophysiology studies measuring brain connectivity
networks for a sample of individuals along with their
membership to a low or high creative reasoning group. It is
of paramount importance to develop statistical methods for
testing of global and local changes in the structural
interconnections among brain regions across groups. We
develop a general Bayesian procedure for inference and
testing of group differences in the network structure, which
relies on a nonparametric representation for the conditional
probability mass function associated with a network-valued
random variable. By leveraging a mixture of low-rank
factorizations, we allow simple global and local hypothesis
testing adjusting for multiplicity. An efficient Gibbs
sampler is defined for posterior computation. We provide
theoretical results on the flexibility of the model and
assess testing performance in simulations. The approach is
applied to provide novel insights on the relationships
between human brain networks and creativity.},
Doi = {10.1214/16-BA1030},
Key = {fds340385}
}
@article{fds362761,
Author = {Durante, D and Dunson, DB},
Title = {Supplementary Material For “Bayesian Inference And Testing
Of Group Differences In Brain Networks”},
Journal = {Bayesian Analysis},
Volume = {13},
Number = {1},
Pages = {1-2},
Year = {2018},
Month = {January},
url = {http://dx.doi.org/10.1214/16-BA1030SUPP},
Abstract = {The supplementary materials contain proofs of Propositions
1, 2 and 3, providing theoretical support for the
methodology developed in the article “Bayesian Inference
and Testing of Group Differences in Brain
Networks”.},
Doi = {10.1214/16-BA1030SUPP},
Key = {fds362761}
}
@article{fds332810,
Author = {van den Boom, W and Schroeder, RA and Manning, MW and Setji, TL and Fiestan, G-O and Dunson, DB},
Title = {Effect of A1C and Glucose on Postoperative Mortality in
Noncardiac and Cardiac Surgeries.},
Journal = {Diabetes Care},
Volume = {41},
Number = {4},
Pages = {782-788},
Year = {2018},
Month = {April},
url = {http://dx.doi.org/10.2337/dc17-2232},
Abstract = {OBJECTIVE: Hemoglobin A1c (A1C) is used in assessment of
patients for elective surgeries because hyperglycemia
increases risk of adverse events. However, the interplay of
A1C, glucose, and surgical outcomes remains unclarified,
with often only two of these three factors considered
simultaneously. We assessed the association of preoperative
A1C with perioperative glucose control and their
relationship with 30-day mortality. RESEARCH DESIGN AND
METHODS: Retrospective analysis on 431,480 surgeries within
the Duke University Health System determined the association
of preoperative A1C with perioperative glucose (averaged
over the first 3 postoperative days) and 30-day mortality
among 6,684 noncardiac and 6,393 cardiac surgeries with A1C
and glucose measurements. A generalized additive model was
used, enabling nonlinear relationships. RESULTS: A1C and
glucose were strongly associated. Glucose and mortality were
positively associated for noncardiac cases: 1.0% mortality
at mean glucose of 100 mg/dL and 1.6% at mean glucose of 200
mg/dL. For cardiac procedures, there was a striking U-shaped
relationship between glucose and mortality, ranging from
4.5% at 100 mg/dL to a nadir of 1.5% at 140 mg/dL and rising
again to 6.9% at 200 mg/dL. A1C and 30-day mortality were
not associated when controlling for glucose in noncardiac or
cardiac procedures. CONCLUSIONS: Although A1C is positively
associated with perioperative glucose, it is not associated
with increased 30-day mortality after controlling for
glucose. Perioperative glucose predicts 30-day mortality,
linearly in noncardiac and nonlinearly in cardiac
procedures. This confirms that perioperative glucose control
is related to surgical outcomes but that A1C, reflecting
antecedent glycemia, is a less useful predictor.},
Doi = {10.2337/dc17-2232},
Key = {fds332810}
}
@article{fds333225,
Author = {Dunson, DB},
Title = {Statistics in the big data era: Failures of the
machine},
Journal = {Statistics and Probability Letters},
Volume = {136},
Pages = {4-9},
Publisher = {Elsevier BV},
Year = {2018},
Month = {May},
url = {http://dx.doi.org/10.1016/j.spl.2018.02.028},
Abstract = {There is vast interest in automated methods for complex data
analysis. However, there is a lack of consideration of (1)
interpretability, (2) uncertainty quantification, (3)
applications with limited training data, and (4) selection
bias. Statistical methods can achieve (1)-(4) with a change
in focus.},
Doi = {10.1016/j.spl.2018.02.028},
Key = {fds333225}
}
@article{fds333512,
Author = {Zhang, Z and Descoteaux, M and Zhang, J and Girard, G and Chamberland,
M and Dunson, D and Srivastava, A and Zhu, H},
Title = {Mapping population-based structural connectomes.},
Journal = {NeuroImage},
Volume = {172},
Pages = {130-145},
Year = {2018},
Month = {May},
url = {http://dx.doi.org/10.1016/j.neuroimage.2017.12.064},
Abstract = {Advances in understanding the structural connectomes of
human brain require improved approaches for the
construction, comparison and integration of high-dimensional
whole-brain tractography data from a large number of
individuals. This article develops a population-based
structural connectome (PSC) mapping framework to address
these challenges. PSC simultaneously characterizes a large
number of white matter bundles within and across different
subjects by registering different subjects' brains based on
coarse cortical parcellations, compressing the bundles of
each connection, and extracting novel connection weights. A
robust tractography algorithm and streamline post-processing
techniques, including dilation of gray matter regions,
streamline cutting, and outlier streamline removal are
applied to improve the robustness of the extracted
structural connectomes. The developed PSC framework can be
used to reproducibly extract binary networks, weighted
networks and streamline-based brain connectomes. We apply
the PSC to Human Connectome Project data to illustrate its
application in characterizing normal variations and
heritability of structural connectomes in healthy
subjects.},
Doi = {10.1016/j.neuroimage.2017.12.064},
Key = {fds333512}
}
@article{fds335795,
Author = {Johndrow, JE and Lum, K and Dunson, DB},
Title = {Theoretical limits of microclustering for record
linkage.},
Journal = {Biometrika},
Volume = {105},
Number = {2},
Pages = {431-446},
Year = {2018},
Month = {June},
url = {http://dx.doi.org/10.1093/biomet/asy003},
Abstract = {There has been substantial recent interest in record
linkage, where one attempts to group the records pertaining
to the same entities from one or more large databases that
lack unique identifiers. This can be viewed as a type of
microclustering, with few observations per cluster and a
very large number of clusters. We show that the problem is
fundamentally hard from a theoretical perspective and, even
in idealized cases, accurate entity resolution is
effectively impossible unless the number of entities is
small relative to the number of records and/or the
separation between records from different entities is
extremely large. These results suggest conservatism in
interpretation of the results of record linkage, support
collection of additional data to more accurately
disambiguate the entities, and motivate a focus on coarser
inference. For example, results from a simulation study
suggest that sometimes one may obtain accurate results for
population size estimation even when fine-scale entity
resolution is inaccurate.},
Doi = {10.1093/biomet/asy003},
Key = {fds335795}
}
@article{fds335794,
Author = {Shterev, ID and Dunson, DB and Chan, C and Sempowski,
GD},
Title = {Bayesian Multi-Plate High-Throughput Screening of
Compounds.},
Journal = {Sci Rep},
Volume = {8},
Number = {1},
Pages = {9551},
Year = {2018},
Month = {June},
url = {http://dx.doi.org/10.1038/s41598-018-27531-w},
Abstract = {High-throughput screening of compounds (chemicals) is an
essential part of drug discovery, involving thousands to
millions of compounds, with the purpose of identifying
candidate hits. Most statistical tools, including the
industry standard B-score method, work on individual
compound plates and do not exploit cross-plate correlation
or statistical strength among plates. We present a new
statistical framework for high-throughput screening of
compounds based on Bayesian nonparametric modeling. The
proposed approach is able to identify candidate hits from
multiple plates simultaneously, sharing statistical strength
among plates and providing more robust estimates of compound
activity. It can flexibly accommodate arbitrary
distributions of compound activities and is applicable to
any plate geometry. The algorithm provides a principled
statistical approach for hit identification and false
discovery rate control. Experiments demonstrate significant
improvements in hit identification sensitivity and
specificity over the B-score and R-score methods, which are
highly sensitive to threshold choice. These improvements are
maintained at low hit rates. The framework is implemented as
an efficient R extension package BHTSpack and is suitable
for large scale data sets.},
Doi = {10.1038/s41598-018-27531-w},
Key = {fds335794}
}
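%% For context, the industry-standard per-plate B-score mentioned above is
%% easy to state: run Tukey's median polish on the plate to remove row and
%% column effects, then scale the residuals by the plate's MAD. A minimal
%% sketch (the hit threshold and plate dimensions are illustrative):
%%
%% import numpy as np
%%
%% def median_polish_residuals(x, n_iter=10):
%%     r = x.astype(float).copy()
%%     for _ in range(n_iter):
%%         r -= np.median(r, axis=1, keepdims=True)   # remove row effects
%%         r -= np.median(r, axis=0, keepdims=True)   # remove column effects
%%     return r
%%
%% def b_score(plate):
%%     r = median_polish_residuals(plate)
%%     mad = 1.4826 * np.median(np.abs(r - np.median(r)))
%%     return r / mad
%%
%% plate = np.random.default_rng(1).normal(size=(8, 12))   # one 96-well plate
%% plate[3, 5] += 6.0                                      # one active well
%% print(np.argwhere(np.abs(b_score(plate)) > 3))          # called hits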
@article{fds339305,
Author = {Guhaniyogi, R and Qamar, S and Dunson, DB},
Title = {Bayesian Conditional Density Filtering},
Journal = {Journal of Computational and Graphical Statistics},
Volume = {27},
Number = {3},
Pages = {657-672},
Publisher = {Informa UK Limited},
Year = {2018},
Month = {July},
url = {http://dx.doi.org/10.1080/10618600.2017.1422431},
Abstract = {We propose a conditional density filtering (C-DF) algorithm
for efficient online Bayesian inference. C-DF adapts MCMC
sampling to the online setting, sampling from approximations
to conditional posterior distributions obtained by
propagating surrogate conditional sufficient statistics (a
function of data and parameter estimates) as new data
arrive. These quantities eliminate the need to store or
process the entire dataset simultaneously and offer a number
of desirable features. Often, these include a reduction in
memory requirements and runtime and improved mixing, along
with state-of-the-art parameter inference and prediction.
These improvements are demonstrated through several
illustrative examples including an application to high
dimensional compressed regression. In cases where the
dimension of the model parameter does not grow with time, we
also establish sufficient conditions under which C-DF
samples converge to the target posterior distribution
asymptotically as sampling proceeds and more data arrive.
Supplementary materials of C-DF are available
online.},
Doi = {10.1080/10618600.2017.1422431},
Key = {fds339305}
}
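%% The idea of propagating conditional sufficient statistics is easiest to
%% see in a conjugate special case, where it is exact. A streaming
%% normal-mean sketch (prior scale and batch sizes are illustrative; C-DF
%% itself targets non-conjugate models via surrogate statistics evaluated
%% at current parameter estimates):
%%
%% import numpy as np
%%
%% rng = np.random.default_rng(2)
%% n, sum_y = 0, 0.0                       # the only state carried over time
%% prior_prec = 1.0 / 10.0**2              # N(0, 10^2) prior, unit error variance
%% for t in range(50):
%%     batch = rng.normal(3.0, 1.0, size=200)            # new data arriving
%%     n, sum_y = n + batch.size, sum_y + batch.sum()
%%     post_prec = prior_prec + n
%%     post_mean = sum_y / post_prec
%%     draws = rng.normal(post_mean, 1.0 / np.sqrt(post_prec), size=1000)
%% print("final posterior mean:", post_mean)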
@article{fds339365,
Author = {van den Boom, W and Mao, C and Schroeder, RA and Dunson,
DB},
Title = {Extrema-weighted feature extraction for functional
data.},
Journal = {Bioinformatics},
Volume = {34},
Number = {14},
Pages = {2457-2464},
Year = {2018},
Month = {July},
url = {http://dx.doi.org/10.1093/bioinformatics/bty120},
Abstract = {MOTIVATION: Although there is a rich literature on methods
for assessing the impact of functional predictors, the focus
has been on approaches for dimension reduction that do not
suit certain applications. Examples of standard approaches
include functional linear models, functional principal
components regression and cluster-based approaches, such as
latent trajectory analysis. This article is motivated by
applications in which the dynamics in a predictor, across
times when the value is relatively extreme, are particularly
informative about the response. For example, physicians are
interested in relating the dynamics of blood pressure
changes during surgery to post-surgery adverse outcomes, and
it is thought that the dynamics are more important when
blood pressure is significantly elevated or lowered.
RESULTS: We propose a novel class of extrema-weighted
feature (XWF) extraction models. Key components in defining
XWFs include the marginal density of the predictor, a
function up-weighting values at extreme quantiles of this
marginal, and functionals characterizing local dynamics.
Algorithms are proposed for fitting of XWF-based regression
and classification models, and are compared with current
methods for functional predictors in simulations and a blood
pressure during surgery application. XWFs find features of
intraoperative blood pressure trajectories that are
predictive of postoperative mortality. By their nature, most
of these features cannot be found by previous methods.
AVAILABILITY AND IMPLEMENTATION: The R package 'xwf' is
available at the CRAN repository: https://cran.r-project.org/package=xwf.
SUPPLEMENTARY INFORMATION: Supplementary data are available
at Bioinformatics online.},
Doi = {10.1093/bioinformatics/bty120},
Key = {fds339365}
}
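%% A single extrema-weighted feature can be written as an integral of a
%% local-dynamics functional against a weight that is large when the
%% trajectory sits in the tails of its marginal distribution. The sketch
%% below uses |x'(t)| as the functional and |2F(x(t)) - 1|^p as the weight;
%% both are illustrative choices in the spirit of the paper, not its exact
%% XWFs:
%%
%% import numpy as np
%%
%% def xwf(x, t, p=2):
%%     ranks = np.argsort(np.argsort(x)) / (x.size - 1.0)  # empirical CDF values
%%     weight = np.abs(2.0 * ranks - 1.0) ** p             # up-weights both tails
%%     dxdt = np.gradient(x, t)
%%     dt = t[1] - t[0]                                    # uniform grid assumed
%%     return np.sum(weight * np.abs(dxdt)) * dt
%%
%% t = np.linspace(0.0, 1.0, 500)
%% bp = 120 + 15 * np.sin(6 * t) + np.random.default_rng(7).normal(0, 1, 500)
%% print(xwf(bp, t))   # e.g., intraoperative blood pressure trajectory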
@article{fds338057,
Author = {Srivastava, S and Li, C and Dunson, DB},
Title = {Scalable Bayes via barycenter in Wasserstein
space},
Journal = {Journal of Machine Learning Research},
Volume = {19},
Pages = {1-35},
Year = {2018},
Month = {August},
Abstract = {Divide-and-conquer based methods for Bayesian inference
provide a general approach for tractable posterior inference
when the sample size is large. These methods divide the data
into smaller subsets, sample from the posterior distribution
of parameters in parallel on all the subsets, and combine
posterior samples from all the subsets to approximate the
full data posterior distribution. The smaller size of any
subset compared to the full data implies that posterior
sampling on any subset is computationally more efficient
than sampling from the true posterior distribution. Since
the combination step takes negligible time relative to
sampling, posterior computations can be scaled to massive
data by dividing the full data into a sufficiently large
number of data subsets. One such approach relies on the
geometry of posterior distributions estimated across
different subsets and combines them through their barycenter
in a Wasserstein space of probability measures. We provide
theoretical guarantees on the accuracy of approximation that
are valid in many applications. We show that the geometric
method approximates the full data posterior distribution
better than its competitors across diverse simulations and
reproduces known results when applied to a movie ratings
database.},
Key = {fds338057}
}
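%% For a one-dimensional functional, the 2-Wasserstein barycenter has a
%% closed form: its quantile function is the average of the subset
%% posteriors' quantile functions. A small sketch of that combination step
%% (the subset draws are simulated stand-ins):
%%
%% import numpy as np
%%
%% rng = np.random.default_rng(3)
%% subset_draws = [rng.normal(mu, 0.1, size=5000) for mu in (0.95, 1.0, 1.05)]
%% levels = np.linspace(0.001, 0.999, 999)
%% barycenter_q = np.mean([np.quantile(d, levels) for d in subset_draws], axis=0)
%% # barycenter_q[i] approximates the barycenter's quantile at levels[i]:
%% print(barycenter_q[[24, 499, 974]])     # roughly the 2.5%, 50%, 97.5% points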
@article{fds340499,
Author = {Duan, LL and Johndrow, JE and Dunson, DB},
Title = {Scaling up data augmentation MCMC via calibration},
Journal = {Journal of Machine Learning Research},
Volume = {19},
Year = {2018},
Month = {October},
Abstract = {There has been considerable interest in making Bayesian
inference more scalable. In big data settings, most of the
focus has been on reducing the computing time per iteration
rather than reducing the number of iterations needed in
Markov chain Monte Carlo (MCMC). This article considers data
augmentation MCMC (DA-MCMC), a widely used technique.
DA-MCMC samples tend to become highly autocorrelated in
large samples, due to a mis-calibration problem in which
conditional posterior distributions given augmented data are
too concentrated. This makes it necessary to collect very
long MCMC paths to obtain acceptably low MC error. To combat
this inefficiency, we propose a family of calibrated data
augmentation algorithms, which appropriately adjust the
variance of conditional posterior distributions. A
Metropolis-Hastings step is used to eliminate bias in the
stationary distribution of the resulting sampler. Compared
to existing alternatives, this approach can dramatically
reduce MC error by reducing autocorrelation and increasing
the effective number of DA-MCMC samples per unit of
computing time. The approach is simple and applicable to a
broad variety of existing data augmentation algorithms. We
focus on three popular generalized linear models: probit,
logistic and Poisson log-linear. Dramatic gains in
computational efficiency are shown in applications.},
Key = {fds340499}
}
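%% The baseline such algorithms adjust is classical data augmentation for
%% probit regression (Albert and Chib), where latent truncated normals are
%% imputed and the coefficients drawn from a Gaussian conditional. A sketch
%% with a flat prior (names are illustrative; the calibration step itself
%% is omitted):
%%
%% import numpy as np
%% from scipy.stats import truncnorm
%%
%% def probit_da(X, y, n_iter=2000, seed=4):
%%     rng = np.random.default_rng(seed)
%%     n, p = X.shape
%%     XtX_inv = np.linalg.inv(X.T @ X)
%%     beta, draws = np.zeros(p), np.empty((n_iter, p))
%%     for it in range(n_iter):
%%         eta = X @ beta
%%         # z_i ~ N(eta_i, 1) truncated to (0, inf) if y_i = 1, else (-inf, 0]
%%         lo = np.where(y == 1, -eta, -np.inf)
%%         hi = np.where(y == 1, np.inf, -eta)
%%         z = eta + truncnorm.rvs(lo, hi, random_state=rng)
%%         beta = rng.multivariate_normal(XtX_inv @ X.T @ z, XtX_inv)
%%         draws[it] = beta
%%     return draws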
@article{fds335793,
Author = {Sarkar, A and Chabout, J and Macopson, JJ and Jarvis, ED and Dunson,
DB},
Title = {Bayesian Semiparametric Mixed Effects Markov Models With
Application to Vocalization Syntax},
Journal = {Journal of the American Statistical Association},
Volume = {113},
Number = {524},
Pages = {1515-1527},
Publisher = {Informa UK Limited},
Year = {2018},
Month = {October},
url = {http://dx.doi.org/10.1080/01621459.2018.1423986},
Abstract = {Studying the neurological, genetic, and evolutionary basis
of human vocal communication mechanisms using animal
vocalization models is an important field of neuroscience.
The datasets typically comprise structured sequences of
syllables or “songs” produced by animals from different
genotypes under different social contexts. It has been
difficult to come up with sophisticated statistical methods
that appropriately model animal vocal communication syntax.
We address this need by developing a novel Bayesian
semiparametric framework for inference in such datasets. Our
approach is built on a novel class of mixed effects Markov
transition models for the songs that accommodate exogenous
influences of genotype and context as well as
animal-specific heterogeneity. Crucial advantages of the
proposed approach include its ability to provide insights
into key scientific queries related to global and local
influences of the exogenous predictors on the transition
dynamics via automated tests of hypotheses. The methodology
is illustrated using simulation experiments and the
aforementioned motivating application in neuroscience.
Supplementary materials for this article, including a
standardized description of the materials available for
reproducing the work, are available as an online
supplement.},
Doi = {10.1080/01621459.2018.1423986},
Key = {fds335793}
}
@article{fds341344,
Author = {Canale, A and Durante, D and Dunson, DB},
Title = {Convex mixture regression for quantitative risk
assessment.},
Journal = {Biometrics},
Volume = {74},
Number = {4},
Pages = {1331-1340},
Year = {2018},
Month = {December},
url = {http://dx.doi.org/10.1111/biom.12917},
Abstract = {There is wide interest in studying how the distribution of a
continuous response changes with a predictor. We are
motivated by environmental applications in which the
predictor is the dose of an exposure and the response is a
health outcome. A main focus in these studies is inference
on dose levels associated with a given increase in risk
relative to a baseline. In addressing this goal, popular
methods either dichotomize the continuous response or focus
on modeling changes with the dose in the expectation of the
outcome. Such choices may lead to information loss and
provide inaccurate inference on dose-response relationships.
We instead propose a Bayesian convex mixture regression
model that allows the entire distribution of the health
outcome to be unknown and changing with the dose. To balance
flexibility and parsimony, we rely on a mixture model for
the density at the extreme doses, and express the
conditional density at each intermediate dose via a convex
combination of these extremal densities. This representation
generalizes classical dose-response models for quantitative
outcomes, and provides a more parsimonious, but still
powerful, formulation compared to nonparametric methods,
thereby improving interpretability and efficiency in
inference on risk functions. A Markov chain Monte Carlo
algorithm for posterior inference is developed, and the
benefits of our methods are outlined in simulations, along
with a study on the impact of DDE exposure on gestational
age.},
Doi = {10.1111/biom.12917},
Key = {fds341344}
}
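%% The convex-combination structure above is simple to write down: the
%% conditional density at dose x interpolates the two extremal densities
%% through a monotone weight. In the sketch below f0, f1 and w are
%% illustrative choices, not the paper's fitted model:
%%
%% import numpy as np
%% from scipy.stats import norm
%%
%% f0 = lambda y: norm.pdf(y, loc=39.0, scale=1.5)   # outcome density at dose 0
%% f1 = lambda y: norm.pdf(y, loc=36.0, scale=2.5)   # density at maximal dose
%% w = lambda x: x**2                                # monotone, w(0)=0, w(1)=1
%%
%% def f_cond(y, x):
%%     return (1.0 - w(x)) * f0(y) + w(x) * f1(y)
%%
%% # Risk of falling below a threshold as a function of (rescaled) dose:
%% ygrid = np.linspace(25.0, 45.0, 2001)
%% dy = ygrid[1] - ygrid[0]
%% for x in (0.0, 0.5, 1.0):
%%     risk = np.sum(f_cond(ygrid[ygrid < 37.0], x)) * dy
%%     print(f"dose {x:.1f}: P(outcome < 37) = {risk:.3f}")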
@article{fds348078,
Author = {Badea, A and Wu, W and Shuff, J and Wang, M and Anderson, RJ and Qi, Y and Johnson, GA and Wilson, JG and Koudoro, S and Garyfallidis, E and Colton, CA and Dunson, DB},
Title = {Identifying Vulnerable Brain Networks in Mouse Models of
Genetic Risk Factors for Late Onset Alzheimer's
Disease.},
Journal = {Front Neuroinform},
Volume = {13},
Pages = {72},
Year = {2019},
url = {http://dx.doi.org/10.3389/fninf.2019.00072},
Abstract = {The major genetic risk for late onset Alzheimer's disease
has been associated with the presence of APOE4 alleles.
However, the impact of different APOE alleles on the brain
aging trajectory, and how they interact with the brain local
environment in a sex specific manner is not entirely clear.
We sought to identify vulnerable brain circuits in novel
mouse models with homozygous targeted replacement of the
mouse ApoE gene with either human APOE3 or APOE4 gene
alleles. These genes are expressed in mice that also model
the human immune response to age and disease-associated
challenges by expressing the human NOS2 gene in place of the
mouse mNos2 gene. These mice had impaired learning and
memory when assessed with the Morris water maze (MWM) and
novel object recognition (NOR) tests. Ex vivo MRI-DTI
analyses revealed global and local atrophy, and areas of
reduced fractional anisotropy (FA). Using tensor network
principal component analyses for structural connectomes, we
inferred the pairwise connections which best separate APOE4
from APOE3 carriers. These involved primarily
interhemispheric connections among regions of olfactory
areas, the hippocampus, and the cerebellum. Our results also
suggest that pairwise connections may be subdivided and
clustered spatially to reveal local changes on a finer
scale. These analyses revealed not just genotype, but also
sex specific differences. Identifying vulnerable networks
may provide targets for interventions, and a means to
stratify patients.},
Doi = {10.3389/fninf.2019.00072},
Key = {fds348078}
}
@article{fds342829,
Author = {Zhang, Z and Descoteaux, M and Dunson, DB},
Title = {Nonparametric Bayes Models of Fiber Curves Connecting Brain
Regions.},
Journal = {Journal of the American Statistical Association},
Volume = {114},
Number = {528},
Pages = {1505-1517},
Year = {2019},
Month = {January},
url = {http://dx.doi.org/10.1080/01621459.2019.1574582},
Abstract = {In studying structural inter-connections in the human brain,
it is common to first estimate fiber bundles connecting
different regions relying on diffusion MRI. These fiber
bundles act as highways for neural activity. Current
statistical methods reduce the rich information into an
adjacency matrix, with the elements containing a count of
fibers or a mean diffusion feature along the fibers. The
goal of this article is to avoid discarding the rich
geometric information of fibers, developing flexible models
for characterizing the population distribution of fibers
between brain regions of interest within and across
different individuals. We start by decomposing each fiber
into a rotation matrix, shape and translation from a global
reference curve. These components are viewed as data lying
on a product space composed of different Euclidean spaces
and manifolds. To nonparametrically model the distribution
within and across individuals, we rely on a hierarchical
mixture of product kernels specific to the component spaces.
Taking a Bayesian approach to inference, we develop
efficient methods for posterior sampling. The approach
automatically produces clusters of fibers within and across
individuals. Applying the method to Human Connectome Project
data, we find interesting relationships between brain fiber
geometry and reading ability. Supplementary materials for
this article, including a standardized description of the
materials available for reproducing the work, are available
as an online supplement.},
Doi = {10.1080/01621459.2019.1574582},
Key = {fds342829}
}
@article{fds342828,
Author = {Wang, L and Zhang, Z and Dunson, DB},
Title = {Common and individual structure of multiple
networks},
Journal = {Annals of Applied Statistics},
Volume = {13},
Number = {1},
Pages = {85-112},
Year = {2019},
Month = {January},
url = {http://dx.doi.org/10.1214/18-AOAS1193},
Abstract = {This article focuses on the problem of studying shared and
individual-specific structure in replicated networks or
graph-valued data. In particular, the observed data consist
of n graphs, G_i, i = 1, ..., n, with each graph
consisting of a collection of edges between V nodes. In
brain connectomics, the graph for an individual corresponds
to a set of interconnections among brain regions. Such data
can be organized as a V × V binary adjacency matrix A_i for
each i, with ones indicating an edge between a pair of nodes
and zeros indicating no edge. When nodes have a shared
meaning across replicates i = 1, ..., n, it becomes of
substantial interest to study similarities and differences
in the adjacency matrices. To address this problem, we
propose a method to estimate a common structure and
low-dimensional individual-specific deviations from
replicated networks. The proposed Multiple GRAph
Factorization (M-GRAF) model relies on a logistic regression
mapping combined with a hierarchical eigenvalue
decomposition. We develop an efficient algorithm for
estimation and study basic properties of our approach.
Simulation studies show excellent operating characteristics
and we apply the method to human brain connectomics
data.},
Doi = {10.1214/18-AOAS1193},
Key = {fds342828}
}
@article{fds337687,
Author = {Miller, JW and Dunson, DB},
Title = {Robust Bayesian inference via coarsening.},
Journal = {Journal of the American Statistical Association},
Volume = {114},
Number = {527},
Pages = {1113-1125},
Publisher = {Informa UK Limited},
Year = {2019},
Month = {January},
url = {http://dx.doi.org/10.1080/01621459.2018.1469995},
Abstract = {The standard approach to Bayesian inference is based on the
assumption that the distribution of the data belongs to the
chosen model class. However, even a small violation of this
assumption can have a large impact on the outcome of a
Bayesian procedure. We introduce a novel approach to
Bayesian inference that improves robustness to small
departures from the model: rather than conditioning on the
event that the observed data are generated by the model, one
conditions on the event that the model generates data close
to the observed data, in a distributional sense. When
closeness is defined in terms of relative entropy, the
resulting "coarsened" posterior can be approximated by
simply tempering the likelihood-that is, by raising the
likelihood to a fractional power-thus, inference can usually
be implemented via standard algorithms, and one can even
obtain analytical solutions when using conjugate priors.
Some theoretical properties are derived, and we illustrate
the approach with real and simulated data using mixture
models and autoregressive models of unknown
order.},
Doi = {10.1080/01621459.2018.1469995},
Key = {fds337687}
}
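%% With a conjugate prior, the tempering described above stays in closed
%% form. A Beta-Bernoulli sketch, taking the fractional power to be
%% zeta = alpha/(alpha + n) with alpha controlling the tolerated departure
%% from the model (a common calibration; treat it as an assumption here):
%%
%% import numpy as np
%%
%% rng = np.random.default_rng(5)
%% y = rng.binomial(1, 0.3, size=1000)
%% n, s = y.size, int(y.sum())
%% a, b = 1.0, 1.0                         # Beta(1,1) prior
%% alpha = 50.0
%% zeta = alpha / (alpha + n)
%% # Raising the likelihood to the power zeta keeps conjugacy:
%% coarsened = (a + zeta * s, b + zeta * (n - s))
%% print("standard :", (a + s, b + (n - s)))
%% print("coarsened:", coarsened)          # wider, more robust posterior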
@article{fds345691,
Author = {Lin, L and Mu, N and Cheung, P and Dunson, D},
Title = {Extrinsic Gaussian processes for regression and
classification on manifolds},
Journal = {Bayesian Analysis},
Volume = {14},
Number = {3},
Pages = {887-906},
Year = {2019},
Month = {January},
url = {http://dx.doi.org/10.1214/18-BA1135},
Abstract = {Gaussian processes (GPs) are very widely used for modeling
of unknown functions or surfaces in applications ranging
from regression to classification to spatial processes.
Although there is an increasingly vast literature on
applications, methods, theory and algorithms related to GPs,
the overwhelming majority of this literature focuses on the
case in which the input domain corresponds to a Euclidean
space. However, particularly in recent years with the
increasing collection of complex data, it is commonly the
case that the input domain does not have such a simple form.
For example, it is common for the inputs to be restricted to
a non-Euclidean manifold, a case which forms the motivation
for this article. In particular, we propose a general
extrinsic framework for GP modeling on manifolds, which
relies on embedding of the manifold into a Euclidean space
and then constructing extrinsic kernels for GPs on their
images. These extrinsic Gaussian processes (eGPs) are used
as prior distributions for unknown functions in Bayesian
inferences. Our approach is simple and general, and we show
that the eGPs inherit fine theoretical properties from GP
models in Euclidean spaces. We consider applications of our
models to regression and classification problems with
predictors lying in a large class of manifolds, including
spheres, planar shape spaces, a space of positive definite
matrices, and Grassmannians. Our models can be readily used
by practitioners in biological sciences for various
regression and classification problems, such as disease
diagnosis or detection. Our work is also likely to have
impact in spatial statistics when spatial locations are on
the sphere or other geometric spaces.},
Doi = {10.1214/18-BA1135},
Key = {fds345691}
}
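%% The embedding construction above is concrete: map manifold-valued inputs
%% into a Euclidean space and apply an ordinary kernel to the images, which
%% remains a valid covariance on the manifold. A sketch on the sphere S^2
%% (the lengthscale and sample sizes are illustrative):
%%
%% import numpy as np
%%
%% def embed(theta, phi):                  # spherical angles -> points in R^3
%%     return np.stack([np.sin(theta) * np.cos(phi),
%%                      np.sin(theta) * np.sin(phi),
%%                      np.cos(theta)], axis=-1)
%%
%% def extrinsic_kernel(A, B, ls=0.5):     # squared-exponential on embeddings
%%     d2 = ((A[:, None, :] - B[None, :, :]) ** 2).sum(-1)
%%     return np.exp(-0.5 * d2 / ls**2)
%%
%% rng = np.random.default_rng(6)
%% theta, phi = rng.uniform(0, np.pi, 30), rng.uniform(0, 2 * np.pi, 30)
%% X = embed(theta, phi)
%% K = extrinsic_kernel(X, X) + 1e-8 * np.eye(30)    # PSD covariance on S^2
%% f = rng.multivariate_normal(np.zeros(30), K)      # one eGP prior draw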
@article{fds371715,
Author = {Dunson, D and Wood, S},
Title = {Report of the Editors—2018},
Journal = {Journal of the Royal Statistical Society. Series B:
Statistical Methodology},
Volume = {81},
Number = {1},
Pages = {3-4},
Year = {2019},
Month = {January},
url = {http://dx.doi.org/10.1111/RSSB.12306},
Doi = {10.1111/RSSB.12306},
Key = {fds371715}
}
@article{fds342197,
Author = {Wang, L and Zhang, Z and Dunson, D},
Title = {Symmetric Bilinear Regression for Signal Subgraph
Estimation.},
Journal = {IEEE transactions on signal processing : a publication of
the IEEE Signal Processing Society},
Volume = {67},
Number = {7},
Pages = {1929-1940},
Year = {2019},
Month = {April},
url = {http://dx.doi.org/10.1109/tsp.2019.2899818},
Abstract = {There is an increasing interest in learning a set of small
outcome-relevant subgraphs in network-predictor regression.
The extracted signal subgraphs can greatly improve the
interpretation of the association between the network
predictor and the response. In brain connectomics, the brain
network for an individual corresponds to a set of
interconnections among brain regions and there is a strong
interest in linking the brain connectome to human cognitive
traits. Modern neuroimaging technology allows a very fine
segmentation of the brain, producing very large structural
brain networks. Therefore, accurate and efficient methods
for identifying a set of small predictive subgraphs become
crucial, leading to discovery of key interconnected brain
regions related to the trait and important insights on the
mechanism of variation in human cognitive traits. We propose
a symmetric bilinear model with an $L_1$ penalty
to search for small clique subgraphs that contain useful
information about the response. A coordinate descent
algorithm is developed to estimate the model where we derive
analytical solutions for a sequence of conditional convex
optimizations. Application of this method on human
connectome and language comprehension data shows interesting
discovery of relevant interconnections among several small
sets of brain regions and better predictive performance than
competitors.},
Doi = {10.1109/tsp.2019.2899818},
Key = {fds342197}
}
@article{fds342830,
Author = {Niu, M and Cheung, P and Lin, L and Dai, Z and Lawrence, N and Dunson,
D},
Title = {Intrinsic Gaussian processes on complex constrained
domains},
Journal = {Journal of the Royal Statistical Society. Series B:
Statistical Methodology},
Volume = {81},
Number = {3},
Pages = {603-627},
Year = {2019},
Month = {July},
url = {http://dx.doi.org/10.1111/rssb.12320},
Abstract = {We propose a class of intrinsic Gaussian processes (GPs) for
interpolation, regression and classification on manifolds
with a primary focus on complex constrained domains or
irregularly shaped spaces arising as subsets or submanifolds
of $\mathbb{R}$, $\mathbb{R}^2$, $\mathbb{R}^3$ and beyond. For example, intrinsic GPs can
accommodate spatial domains arising as complex subsets of
Euclidean space. Intrinsic GPs respect the potentially
complex boundary or interior conditions as well as the
intrinsic geometry of the spaces. The key novelty of the
approach proposed is to utilize the relationship between
heat kernels and the transition density of Brownian motion
on manifolds for constructing and approximating valid and
computationally feasible covariance kernels. This enables
intrinsic GPs to be practically applied in great generality,
whereas existing approaches for smoothing on constrained
domains are limited to simple special cases. The broad
utilities of the intrinsic GP approach are illustrated
through simulation studies and data examples.},
Doi = {10.1111/rssb.12320},
Key = {fds342830}
}
@article{fds340936,
Author = {Johndrow, JE and Smith, A and Pillai, N and Dunson,
DB},
Title = {MCMC for Imbalanced Categorical Data},
Journal = {Journal of the American Statistical Association},
Volume = {114},
Number = {527},
Pages = {1394-1403},
Year = {2019},
Month = {July},
url = {http://dx.doi.org/10.1080/01621459.2018.1505626},
Abstract = {Many modern applications collect highly imbalanced
categorical data, with some categories relatively rare.
Bayesian hierarchical models combat data sparsity by
borrowing information, while also quantifying uncertainty.
However, posterior computation presents a fundamental
barrier to routine use; a single class of algorithms does
not work well in all settings and practitioners waste time
trying different types of Markov chain Monte Carlo (MCMC)
approaches. This article was motivated by an application to
quantitative advertising in which we encountered extremely
poor computational performance for data augmentation MCMC
algorithms but obtained excellent performance for adaptive
Metropolis. To obtain a deeper understanding of this
behavior, we derive theoretical results on the computational
complexity of commonly used data augmentation algorithms and
the Random Walk Metropolis algorithm for highly imbalanced
binary data. In this regime, our results show computational
complexity of Metropolis is logarithmic in sample size,
while data augmentation is polynomial in sample size. The
root cause of this poor performance of data augmentation is
a discrepancy between the rates at which the target density
and MCMC step sizes concentrate. Our methods also show that
MCMC algorithms that exhibit a similar discrepancy will fail
in large samples—a result with substantial practical
impact. Supplementary materials for this article are
available online.},
Doi = {10.1080/01621459.2018.1505626},
Key = {fds340936}
}
@article{fds342827,
Author = {Zhang, Z and Allen, GI and Zhu, H and Dunson, D},
Title = {Tensor network factorizations: Relationships between brain
structural connectomes and traits.},
Journal = {NeuroImage},
Volume = {197},
Pages = {330-343},
Year = {2019},
Month = {August},
url = {http://dx.doi.org/10.1016/j.neuroimage.2019.04.027},
Abstract = {Advanced brain imaging techniques make it possible to
measure individuals' structural connectomes in large cohort
studies non-invasively. Given the availability of large
scale data sets, it is extremely interesting and important
to build a set of advanced tools for structural connectome
extraction and statistical analysis that emphasize both
interpretability and predictive power. In this paper, we
developed and integrated a set of toolboxes, including an
advanced structural connectome extraction pipeline and a
novel tensor network principal components analysis (TN-PCA)
method, to study relationships between structural
connectomes and various human traits such as alcohol and
drug use, cognition and motor abilities. The structural
connectome extraction pipeline produces a set of connectome
features for each subject that can be organized as a tensor
network, and TN-PCA maps the high-dimensional tensor network
data to a lower-dimensional Euclidean space. Combined with
classical hypothesis testing, canonical correlation analysis
and linear discriminant analysis techniques, we analyzed
over 1100 scans of 1076 subjects from the Human Connectome
Project (HCP) and the Sherbrooke test-retest data set, as
well as 175 human traits measuring different domains
including cognition, substance use, motor, sensory and
emotion. The test-retest data validated the developed
algorithms. With the HCP data, we found that structural
connectomes are associated with a wide range of traits,
e.g., fluid intelligence, language comprehension, and motor
skills are associated with increased cortical-cortical brain
structural connectivity, while the use of alcohol, tobacco,
and marijuana are associated with decreased
cortical-cortical connectivity. We also demonstrated that
our extracted structural connectomes and analysis method can
give superior prediction accuracies compared with
alternative connectome constructions and other tensor and
network regression methods.},
Doi = {10.1016/j.neuroimage.2019.04.027},
Key = {fds342827}
}
@article{fds343735,
Author = {Norberg, A and Abrego, N and Blanchet, FG and Adler, FR and Anderson,
BJ and Anttila, J and Araújo, MB and Dallas, T and Dunson, D and Elith, J and Foster, SD and Fox, R and Franklin, J and Godsoe, W and Guisan, A and O'Hara, B and Hill, NA and Holt, RD and Hui, FKC and Husby, M and Kålås,
JA and Lehikoinen, A and Luoto, M and Mod, HK and Newell, G and Renner, I and Roslin, T and Soininen, J and Thuiller, W and Vanhatalo, J and Warton,
D and White, M and Zimmermann, NE and Gravel, D and Ovaskainen,
O},
Title = {A comprehensive evaluation of predictive performance of 33
species distribution models at species and community
levels},
Journal = {Ecological Monographs},
Volume = {89},
Number = {3},
Year = {2019},
Month = {August},
url = {http://dx.doi.org/10.1002/ecm.1370},
Abstract = {A large array of species distribution model (SDM) approaches
has been developed for explaining and predicting the
occurrences of individual species or species assemblages.
Given the wealth of existing models, it is unclear which
models perform best for interpolation or extrapolation of
existing data sets, particularly when one is concerned with
species assemblages. We compared the predictive performance
of 33 variants of 15 widely applied and recently emerged
SDMs in the context of multispecies data, including both
joint SDMs that model multiple species together, and stacked
SDMs that model each species individually combining the
predictions afterward. We offer a comprehensive evaluation
of these SDM approaches by examining their performance in
predicting withheld empirical validation data of different
sizes representing five different taxonomic groups, and for
prediction tasks related to both interpolation and
extrapolation. We measure predictive performance by 12
measures of accuracy, discrimination power, calibration, and
precision of predictions, for the biological levels of
species occurrence, species richness, and community
composition. Our results show large variation among the
models in their predictive performance, especially for
communities comprising many species that are rare. The
results do not reveal any major trade-offs among measures of
model performance; the same models performed generally well
in terms of accuracy, discrimination, and calibration, and
for the biological levels of individual species, species
richness, and community composition. In contrast, the models
that gave the most precise predictions were not well
calibrated, suggesting that poorly performing models can
make overconfident predictions. However, none of the models
performed well for all prediction tasks. As a general
strategy, we therefore propose that researchers fit a small
set of models showing complementary performance, and then
apply a cross-validation procedure involving separate data
to establish which of these models performs best for the
goal of the study.},
Doi = {10.1002/ecm.1370},
Key = {fds343735}
}
@article{fds344442,
Author = {Li, C and Lin, L and Dunson, DB},
Title = {On posterior consistency of tail index for Bayesian kernel
mixture models},
Journal = {Bernoulli},
Volume = {25},
Number = {3},
Pages = {1999-2028},
Publisher = {Bernoulli Society for Mathematical Statistics and
Probability},
Year = {2019},
Month = {August},
url = {http://dx.doi.org/10.3150/18-bej1043},
Doi = {10.3150/18-bej1043},
Key = {fds344442}
}
@article{fds346411,
Author = {Chae, M and Lin, L and Dunson, DB},
Title = {Bayesian sparse linear regression with unknown symmetric
error},
Journal = {Information and Inference},
Volume = {8},
Number = {3},
Pages = {621-653},
Year = {2019},
Month = {September},
url = {http://dx.doi.org/10.1093/imaiai/iay022},
Abstract = {We study Bayesian procedures for sparse linear regression
when the unknown error distribution is endowed with a
non-parametric prior. Specifically, we put a symmetrized
Dirichlet process mixture of Gaussian prior on the error
density, where the mixing distributions are compactly
supported. For the prior on regression coefficients, a
mixture of point masses at zero and continuous distributions
is considered. Under the assumption that the model is well
specified, we study behavior of the posterior with diverging
number of predictors. The compatibility and restricted
eigenvalue conditions yield the minimax convergence rate of
the regression coefficients in $\ell_1$- and $\ell_2$-norms, respectively.
In addition, strong model selection consistency and a
semi-parametric Bernstein-von Mises theorem are proven under
slightly stronger conditions.},
Doi = {10.1093/imaiai/iay022},
Key = {fds346411}
}
@article{fds349871,
Author = {Panea, RI and Love, CL and Shingleton, JR and Reddy, A and Bailey, JA and Moormann, AM and Otieno, JA and Ong'echa, JM and Oduor, CI and Schroeder, KMS and Masalu, N and Chao, NJ and Agajanian, M and Major,
MB and Fedoriw, Y and Richards, KL and Rymkiewicz, G and Miles, RR and Alobeid, B and Bhagat, G and Flowers, CR and Ondrejka, SL and Hsi, ED and Choi, WWL and Au-Yeung, RKH and Hartmann, W and Lenz, G and Meyerson, H and Lin, Y-Y and Zhuang, Y and Luftig, MA and Waldrop, A and Dave, T and Thakkar, D and Sahay, H and Li, G and Palus, BC and Seshadri, V and Kim,
SY and Gascoyne, RD and Levy, S and Mukhopadyay, M and Dunson, DB and Dave,
SS},
Title = {The whole-genome landscape of Burkitt lymphoma
subtypes.},
Journal = {Blood},
Volume = {134},
Number = {19},
Pages = {1598-1607},
Year = {2019},
Month = {November},
url = {http://dx.doi.org/10.1182/blood.2019001880},
Abstract = {Burkitt lymphoma (BL) is an aggressive, MYC-driven lymphoma
comprising 3 distinct clinical subtypes: sporadic BLs that
occur worldwide, endemic BLs that occur predominantly in
sub-Saharan Africa, and immunodeficiency-associated BLs that
occur primarily in the setting of HIV. In this study, we
comprehensively delineated the genomic basis of BL through
whole-genome sequencing (WGS) of 101 tumors representing all
3 subtypes of BL to identify 72 driver genes. These data
were additionally informed by CRISPR screens in BL cell
lines to functionally annotate the role of oncogenic
drivers. Nearly every driver gene was found to have both
coding and non-coding mutations, highlighting the importance
of WGS for identifying driver events. Our data implicate
coding and non-coding mutations in IGLL5, BACH2, SIN3A, and
DNMT1. Epstein-Barr virus (EBV) infection was associated
with higher mutation load, with type 1 EBV showing a higher
mutational burden than type 2 EBV. Although sporadic and
immunodeficiency-associated BLs had similar genetic
profiles, endemic BLs manifested more frequent mutations in
BCL7A and BCL6 and fewer genetic alterations in DNMT1,
SNTB2, and CTCF. Silencing mutations in ID3 were a common
feature of all 3 subtypes of BL. In vitro, mass
spectrometry-based proteomics demonstrated that the ID3
protein binds primarily to TCF3 and TCF4. In vivo knockout
of ID3 potentiated the effects of MYC, leading to rapid
tumorigenesis and tumor phenotypes consistent with those
observed in the human disease.},
Doi = {10.1182/blood.2019001880},
Key = {fds349871}
}
@article{fds348797,
Author = {Thai, DH and Wu, HT and Dunson, DB},
Title = {Locally convex kernel mixtures: Bayesian subspace
learning},
Journal = {Proceedings - 18th IEEE International Conference on Machine
Learning and Applications, ICMLA 2019},
Pages = {272-275},
Year = {2019},
Month = {December},
ISBN = {9781728145495},
url = {http://dx.doi.org/10.1109/ICMLA.2019.00051},
Abstract = {Kernel mixture models are routinely used for density
estimation. However, in multivariate settings, issues arise
in efficiently approximating lower-dimensional structure in
the data. For example, it is common to suppose that the
density is concentrated near a lower-dimensional non-linear
subspace or manifold. Typical kernels used to locally
approximate such subspaces are inflexible, so that a large
number of components are often needed. We propose a novel
class of LOcally COnvex (LOCO) kernels that are flexible in
adapting to nonlinear local structure. LOCO kernels are
induced by introducing random knots within local
neighborhoods, and generating data as a random convex
combination of these knots with adaptive weights and an
additive noise. For identifiability, we constrain all
observations from a particular component to have the same
mean. For Bayesian inference subject to this constraint, we
develop a hybrid Gibbs sampler and optimization algorithm
that incorporates a Lagrange multiplier within a splitting
method. The resulting LOCO algorithm is shown to
dramatically outperform typical Gaussian mixture models in
challenging examples.},
Doi = {10.1109/ICMLA.2019.00051},
Key = {fds348797}
}
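%% A minimal Python sketch of the generative mechanism the LOCO abstract
%% describes: draws from one component are random convex combinations of
%% local knots plus additive noise. Knot placement, the Dirichlet weight
%% distribution, and the noise scale are illustrative assumptions; the
%% paper's constrained Gibbs/Lagrange inference is not attempted here.

import numpy as np

rng = np.random.default_rng(0)

def sample_loco_component(n, knots, alpha=1.0, noise_sd=0.05, rng=rng):
    """Draw n points as random convex combinations of fixed knots plus noise."""
    K, d = knots.shape
    W = rng.dirichlet(alpha * np.ones(K), size=n)    # adaptive convex weights
    return W @ knots + noise_sd * rng.normal(size=(n, d))

# Knots along a curved arc, so a single component can bend with the subspace.
theta = np.linspace(0.0, np.pi / 2.0, 5)
knots = np.column_stack([np.cos(theta), np.sin(theta)])
x = sample_loco_component(500, knots)
print(x.mean(axis=0))   # all draws share the mean implied by the knots
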
@article{fds348920,
Author = {Camerlenghi, F and Dunson, DB and Lijoi, A and Prünster, I and Rodríguez, A},
Title = {Latent Nested Nonparametric Priors (with
Discussion).},
Journal = {Bayesian analysis},
Volume = {14},
Number = {4},
Pages = {1303-1356},
Publisher = {Institute of Mathematical Statistics},
Year = {2019},
Month = {December},
url = {http://dx.doi.org/10.1214/19-ba1169},
Abstract = {Discrete random structures are important tools in Bayesian
nonparametrics and the resulting models have proven
effective in density estimation, clustering, topic modeling
and prediction, among others. In this paper, we consider
nested processes and study the dependence structures they
induce. Dependence ranges between homogeneity, corresponding
to full exchangeability, and maximum heterogeneity,
corresponding to (unconditional) independence across
samples. The popular nested Dirichlet process is shown to
degenerate to the fully exchangeable case when there are
ties across samples at the observed or latent level. To
overcome this drawback, inherent to nesting general discrete
random measures, we introduce a novel class of latent nested
processes. These are obtained by adding common and
group-specific completely random measures and, then,
normalizing to yield dependent random probability measures.
We provide results on the partition distributions induced by
latent nested processes, and develop a Markov Chain Monte
Carlo sampler for Bayesian inferences. A test for
distributional homogeneity across groups is obtained as a
by-product. The results and their inferential implications
are showcased on synthetic and real data.},
Doi = {10.1214/19-ba1169},
Key = {fds348920}
}
@article{fds354254,
Author = {Talbot, A and Dunson, D and Dzirasa, K and Carlson,
D},
Title = {Supervised Autoencoders Learn Robust Joint Factor Models of
Neural Activity},
Journal = {arXiv preprint arXiv:2004.05209},
Volume = {abs/2004.05209},
Year = {2020},
Key = {fds354254}
}
@article{fds344775,
Author = {Li, M and Dunson, DB},
Title = {Comparing and weighting imperfect models using
D-probabilities.},
Journal = {Journal of the American Statistical Association},
Volume = {115},
Number = {531},
Pages = {1349-1360},
Year = {2020},
Month = {January},
url = {http://dx.doi.org/10.1080/01621459.2019.1611140},
Abstract = {We propose a new approach for assigning weights to models
using a divergence-based method (\emph{D-probabilities}),
relying on evaluating parametric models relative to a
nonparametric Bayesian reference using Kullback-Leibler
divergence. D-probabilities are useful in goodness-of-fit
assessments, in comparing imperfect models, and in providing
model weights to be used in model aggregation.
D-probabilities avoid some of the disadvantages of Bayesian
model probabilities, such as large sensitivity to prior
choice, and tend to place higher weight on a greater
diversity of models. In an application to linear model
selection against a Gaussian process reference, we provide
simple analytic forms for routine implementation and show
that D-probabilities automatically penalize model
complexity. Some asymptotic properties are described, and we
provide interesting probabilistic interpretations of the
proposed model weights. The framework is illustrated through
simulation examples and an ozone data application.},
Doi = {10.1080/01621459.2019.1611140},
Key = {fds344775}
}
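%% A hedged Python sketch of the divergence-based weighting idea in this
%% abstract: score each parametric fit by its estimated Kullback-Leibler
%% divergence from a flexible reference, then exponentiate and normalize.
%% A kernel density estimate stands in for the paper's nonparametric Bayes
%% (Gaussian process) reference, and the exp(-n*KL) calibration is schematic
%% rather than the paper's exact definition.

import numpy as np
from scipy import stats

rng = np.random.default_rng(1)
y = rng.standard_t(df=5, size=300)           # data: no candidate model is true

models = {                                   # candidate parametric fits (MLE)
    "normal":  stats.norm(*stats.norm.fit(y)),
    "laplace": stats.laplace(*stats.laplace.fit(y)),
}
ref = stats.gaussian_kde(y)                  # stand-in nonparametric reference

# Monte Carlo estimate of KL(reference || model) at the observed points.
kl = {k: np.mean(np.log(ref(y)) - m.logpdf(y)) for k, m in models.items()}

logw = {k: -len(y) * v for k, v in kl.items()}   # divergence-based log-weights
mx = max(logw.values())
w = {k: np.exp(v - mx) for k, v in logw.items()}
tot = sum(w.values())
print(kl, {k: v / tot for k, v in w.items()})
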
@article{fds347354,
Author = {Mukhopadhyay, M and Dunson, DB},
Title = {Targeted Random Projection for Prediction From
High-Dimensional Features},
Journal = {Journal of the American Statistical Association},
Volume = {115},
Number = {532},
Pages = {1998-2010},
Year = {2020},
Month = {January},
url = {http://dx.doi.org/10.1080/01621459.2019.1677240},
Abstract = {We consider the problem of computationally efficient
prediction with high dimensional and highly correlated
predictors when accurate variable selection is effectively
impossible. Direct application of penalization or Bayesian
methods implemented with Markov chain Monte Carlo can be
computationally daunting and unstable. A common solution is
first stage dimension reduction through screening or
projecting the design matrix to a lower dimensional
hyper-plane. Screening is highly sensitive to threshold
choice, while projections often have poor performance in
very high-dimensions. We propose targeted random projection
(TARP) to combine positive aspects of both strategies. TARP
uses screening to order the inclusion probabilities of the
features in the projection matrix used for dimension
reduction, leading to data-informed sparsity. We provide
theoretical support for a Bayesian predictive algorithm
based on TARP, including statistical and computational
complexity guarantees. Examples for simulated and real data
applications illustrate gains relative to a variety of
competitors. Supplementary materials for this article are
available online.},
Doi = {10.1080/01621459.2019.1677240},
Key = {fds347354}
}
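%% A minimal sketch of the TARP idea as the abstract presents it: screening
%% scores set the probability that each feature enters a sparse random
%% projection, and a simple fit is run in the projected space. The screening
%% statistic, inclusion rule, and least-squares downstream fit are simplified
%% assumptions, not the paper's Bayesian predictive algorithm.

import numpy as np

rng = np.random.default_rng(2)
n, p, m = 200, 5000, 50                        # samples, features, projected dim
X = rng.normal(size=(n, p))
beta = np.zeros(p); beta[:10] = 1.0            # a few truly relevant features
y = X @ beta + rng.normal(size=n)

score = np.abs((X - X.mean(0)).T @ (y - y.mean()))   # marginal screening scores
score /= score.max()

# Data-informed sparsity: features enter the projection matrix independently
# with probability tied to their screening score.
include = rng.random(p) < np.clip(score, 0.01, 1.0)
R = np.zeros((p, m))
R[include] = rng.normal(size=(include.sum(), m)) / np.sqrt(m)

Z = X @ R                                      # n x m reduced design
coef, *_ = np.linalg.lstsq(Z, y, rcond=None)   # simple downstream fit
print(include.sum(), "of", p, "features entered the projection")
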
@article{fds349191,
Author = {Jauch, M and Hoff, PD and Dunson, DB},
Title = {Random orthogonal matrices and the Cayley
transform},
Journal = {Bernoulli},
Volume = {26},
Number = {2},
Pages = {1560-1586},
Year = {2020},
Month = {January},
url = {http://dx.doi.org/10.3150/19-BEJ1176},
Abstract = {Random orthogonal matrices play an important role in
probability and statistics, arising in multivariate
analysis, directional statistics, and models of physical
systems, among other areas. Calculations involving random
orthogonal matrices are complicated by their constrained
support. Accordingly, we parametrize the Stiefel and
Grassmann manifolds, represented as subsets of orthogonal
matrices, in terms of Euclidean parameters using the Cayley
transform. We derive the necessary Jacobian terms for change
of variables formulas. Given a density defined on the
Stiefel or Grassmann manifold, these allow us to specify the
corresponding density for the Euclidean parameters, and vice
versa. As an application, we present a Markov chain Monte
Carlo approach to simulating from distributions on the
Stiefel and Grassmann manifolds. Finally, we establish that
the Euclidean parameters corresponding to a uniform
orthogonal matrix can be approximated asymptotically by
independent normals. This result contributes to the growing
literature on normal approximations to the entries of random
orthogonal matrices or transformations thereof.},
Doi = {10.3150/19-BEJ1176},
Key = {fds349191}
}
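%% A small Python illustration of the parametrization direction used in this
%% paper: the Cayley transform maps a skew-symmetric (Euclidean) parameter to
%% an orthogonal matrix, so computation can happen in an unconstrained space.
%% The Jacobian terms and Stiefel/Grassmann details are not reproduced here.

import numpy as np

rng = np.random.default_rng(3)

def cayley(A):
    """Cayley transform (I - A)^{-1}(I + A) of a skew-symmetric matrix A."""
    I = np.eye(A.shape[0])
    return np.linalg.solve(I - A, I + A)

B = rng.normal(size=(4, 4))
A = 0.5 * (B - B.T)                      # Euclidean parameter: skew-symmetric part
Q = cayley(A)
print(np.allclose(Q.T @ Q, np.eye(4)))   # True: Q is orthogonal
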
@article{fds354543,
Author = {Nishimura, A and Dunson, D},
Title = {Recycling Intermediate Steps to Improve Hamiltonian Monte
Carlo},
Journal = {Bayesian Analysis},
Volume = {15},
Number = {4},
Pages = {1087-1108},
Year = {2020},
Month = {January},
url = {http://dx.doi.org/10.1214/19-BA1171},
Abstract = {Hamiltonian Monte Carlo (HMC) and related algorithms have
become routinely used in Bayesian computation. In this
article, we present a simple and provably accurate method to
improve the efficiency of HMC and related algorithms with
essentially no extra computational cost. This is achieved by
recycling the intermediate states along simulated
trajectories of Hamiltonian dynamics. Standard algorithms
use only the end points of trajectories, wastefully
discarding all the intermediate states. Compared to the
alternative methods for utilizing the intermediate states,
our algorithm is simpler to apply in practice and requires
little programming effort beyond the usual implementations
of HMC and related algorithms. Our algorithm applies
straightforwardly to the no-U-turn sampler, arguably the
most popular variant of HMC. Through a variety of
experiments, we demonstrate that our recycling algorithm
yields substantial computational efficiency
gains.},
Doi = {10.1214/19-BA1171},
Key = {fds354543}
}
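%% A minimal sketch of the object this paper recycles: a leapfrog integrator
%% that returns every intermediate state of a simulated Hamiltonian
%% trajectory, not just the end point that standard HMC keeps. The weighting
%% scheme that turns those intermediate states into valid posterior draws is
%% the paper's contribution and is not reproduced; step size and trajectory
%% length below are arbitrary.

import numpy as np

def leapfrog_states(grad_logp, theta, rho, step, n_steps):
    """Run n_steps leapfrog updates, recording every intermediate position."""
    states = []
    rho = rho + 0.5 * step * grad_logp(theta)       # initial half momentum step
    for i in range(n_steps):
        theta = theta + step * rho                  # full position step
        g = grad_logp(theta)
        # full momentum step, except a closing half step at the trajectory end
        rho = rho + (step if i < n_steps - 1 else 0.5 * step) * g
        states.append(theta.copy())
    return states

grad_logp = lambda th: -th                          # standard normal target
states = leapfrog_states(grad_logp, np.array([1.0]), np.array([0.5]), 0.1, 20)
print(len(states), "intermediate states available for recycling")
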
@article{fds356411,
Author = {Tam, E and Dunson, D},
Title = {Fiedler regularization: Learning neural networks with graph
sparsity},
Journal = {37th International Conference on Machine Learning, ICML
2020},
Volume = {PartF168147-12},
Pages = {9288-9297},
Year = {2020},
Month = {January},
ISBN = {9781713821120},
Abstract = {We introduce a novel regularization approach for deep
learning that incorporates and respects the underlying
graphical structure of the neural network. Existing
regularization methods often focus on penalizing weights in
a global/uniform manner that ignores the connectivity
structure of the neural network. We propose to use the
Fiedler value of the neural network's underlying graph as a
tool for regularization. We provide theoretical support for
this approach via spectral graph theory. We show several
useful properties of the Fiedler value that make it suitable
for regularization. We provide an approximate, variational
approach for faster computation during training. We provide
an alternative formulation of this framework in the form of
a structurally weighted L1 penalty, thus linking our
approach to sparsity induction. We performed experiments on
datasets that compare Fiedler regularization with
traditional regularization methods such as Dropout and
weight decay. Results demonstrate the efficacy of Fiedler
regularization.},
Key = {fds356411}
}
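%% A hedged sketch of the quantity this paper penalizes: the Fiedler value
%% (second-smallest eigenvalue) of the graph Laplacian built from a
%% feedforward network's absolute weights. The direct eigendecomposition
%% below stands in for the paper's faster variational approximation, and the
%% tiny network is illustrative.

import numpy as np

rng = np.random.default_rng(4)

def fiedler_value(weight_mats):
    """Second-smallest Laplacian eigenvalue of a feedforward weight graph."""
    sizes = [weight_mats[0].shape[1]] + [W.shape[0] for W in weight_mats]
    offs = np.cumsum([0] + sizes)
    A = np.zeros((offs[-1], offs[-1]))
    for l, W in enumerate(weight_mats):          # bipartite layer-to-layer edges
        A[offs[l + 1]:offs[l + 2], offs[l]:offs[l + 1]] = np.abs(W)
    A = A + A.T
    Lap = np.diag(A.sum(axis=1)) - A
    return np.linalg.eigvalsh(Lap)[1]            # eigenvalues sorted ascending

weights = [rng.normal(size=(8, 5)), rng.normal(size=(3, 8))]   # a 5-8-3 net
print(fiedler_value(weights))    # candidate regularization term for the loss
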
@article{fds348077,
Author = {Tikhonov, G and Duan, L and Abrego, N and Newell, G and White, M and Dunson, D and Ovaskainen, O},
Title = {Computationally efficient joint species distribution
modeling of big spatial data.},
Journal = {Ecology},
Volume = {101},
Number = {2},
Pages = {e02929},
Year = {2020},
Month = {February},
url = {http://dx.doi.org/10.1002/ecy.2929},
Abstract = {The ongoing global change and the increased interest in
macroecological processes call for the analysis of spatially
extensive data on species communities to understand and
forecast distributional changes of biodiversity. Recently
developed joint species distribution models can deal with
numerous species efficiently, while explicitly accounting
for spatial structure in the data. However, their
applicability is generally limited to relatively small
spatial data sets because of their severe computational
scaling as the number of spatial locations increases. In
this work, we propose a practical alleviation of this
scalability constraint for joint species modeling by
exploiting two spatial-statistics techniques that facilitate
the analysis of large spatial data sets: Gaussian predictive
process and nearest-neighbor Gaussian process. We devised an
efficient Gibbs posterior sampling algorithm for Bayesian
model fitting that allows us to analyze community data sets
consisting of hundreds of species sampled from up to
hundreds of thousands of spatial units. The performance of
these methods is demonstrated using an extensive plant data
set of 30,955 spatial units as a case study. We provide an
implementation of the presented methods as an extension to
the hierarchical modeling of species communities
framework.},
Doi = {10.1002/ecy.2929},
Key = {fds348077}
}
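%% A compact sketch of one of the two spatial-statistics devices named in the
%% abstract, the Gaussian predictive process: a GP is evaluated at m knots and
%% kriged to all n sites, avoiding the O(n^3) cost of a full GP. Kernel
%% choice, knot layout, and jitter are illustrative; the joint species model
%% built around this device is omitted.

import numpy as np

rng = np.random.default_rng(5)

def sq_exp(a, b, ls=0.5):
    """Squared-exponential covariance between 1-d coordinate arrays."""
    return np.exp(-((a[:, None] - b[None, :]) ** 2) / (2.0 * ls ** 2))

s = rng.uniform(0.0, 10.0, size=5000)        # n spatial locations
knots = np.linspace(0.0, 10.0, 40)           # m << n knot locations

C_nm = sq_exp(s, knots)
C_mm = sq_exp(knots, knots) + 1e-8 * np.eye(len(knots))   # jitter for stability

L = np.linalg.cholesky(C_mm)
w_knots = L @ rng.normal(size=len(knots))                 # GP draw at the knots
w = C_nm @ np.linalg.solve(C_mm, w_knots)                 # kriged to every site
print(w.shape)                               # (5000,): one random effect per site
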
@article{fds348918,
Author = {Dunson, DB and Johndrow, JE},
Title = {The Hastings algorithm at fifty},
Journal = {Biometrika},
Volume = {107},
Number = {1},
Pages = {1-23},
Year = {2020},
Month = {March},
url = {http://dx.doi.org/10.1093/biomet/asz066},
Abstract = {In a 1970 Biometrika paper, W. K. Hastings developed a broad
class of Markov chain algorithms for sampling from
probability distributions that are difficult to sample from
directly. The algorithm draws a candidate value from a
proposal distribution and accepts the candidate with a
probability that can be computed using only the unnormalized
density of the target distribution, allowing one to sample
from distributions known only up to a constant of
proportionality. The stationary distribution of the
corresponding Markov chain is the target distribution one is
attempting to sample from. The Hastings algorithm
generalizes the Metropolis algorithm to allow a much broader
class of proposal distributions instead of just symmetric
cases. An important class of applications for the Hastings
algorithm corresponds to sampling from Bayesian posterior
distributions, which have densities given by a prior density
multiplied by a likelihood function and divided by a
normalizing constant equal to the marginal likelihood. The
marginal likelihood is typically intractable, presenting a
fundamental barrier to implementation in Bayesian
statistics. This barrier can be overcome by Markov chain
Monte Carlo sampling algorithms. Amazingly, even after 50
years, the majority of algorithms used in practice today
involve the Hastings algorithm. This article provides a
brief celebration of the continuing impact of this ingenious
algorithm on the 50th anniversary of its
publication.},
Doi = {10.1093/biomet/asz066},
Key = {fds348918}
}
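%% Since the article celebrates the algorithm itself, here is the textbook
%% Hastings step in a few lines of Python: a proposal is accepted with a
%% probability computed from the unnormalized target alone, so the
%% intractable normalizing constant never appears. The bimodal target and
%% random-walk proposal are arbitrary illustrative choices.

import numpy as np

rng = np.random.default_rng(6)

def log_unnorm(theta):
    """Unnormalized log-density: an equal-weight two-component normal mixture."""
    return np.logaddexp(-0.5 * (theta - 2.0) ** 2, -0.5 * (theta + 2.0) ** 2)

theta, chain = 0.0, []
for _ in range(10000):
    prop = theta + rng.normal(scale=1.0)    # symmetric proposal (Metropolis case)
    # Hastings acceptance: the normalizing constant cancels in the ratio.
    if np.log(rng.random()) < log_unnorm(prop) - log_unnorm(theta):
        theta = prop
    chain.append(theta)
print(np.mean(chain), np.std(chain))        # near 0 and sqrt(5), respectively
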
@article{fds348919,
Author = {Duan, LL and Young, AL and Nishimura, A and Dunson,
DB},
Title = {Bayesian constraint relaxation.},
Journal = {Biometrika},
Volume = {107},
Number = {1},
Pages = {191-204},
Year = {2020},
Month = {March},
url = {http://dx.doi.org/10.1093/biomet/asz069},
Abstract = {Prior information often takes the form of parameter
constraints. Bayesian methods include such information
through prior distributions having constrained support. By
using posterior sampling algorithms, one can quantify
uncertainty without relying on asymptotic approximations.
However, sharply constrained priors are not necessary in
some settings and tend to limit modelling scope to a narrow
set of distributions that are tractable computationally. We
propose to replace the sharp indicator function of the
constraint with an exponential kernel, thereby creating a
close-to-constrained neighbourhood within the Euclidean
space in which the constrained subspace is embedded. This
kernel decays with distance from the constrained space at a
rate depending on a relaxation hyperparameter. By avoiding
the sharp constraint, we enable use of off-the-shelf
posterior sampling algorithms, such as Hamiltonian Monte
Carlo, facilitating automatic computation in a broad range
of models. We study the constrained and relaxed
distributions under multiple settings and theoretically
quantify their differences. Application of the method is
illustrated through several novel modelling
examples.},
Doi = {10.1093/biomet/asz069},
Key = {fds348919}
}
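%% A toy Python sketch of the paper's core move: the sharp indicator of a
%% constraint set is replaced by an exponential kernel decaying with distance
%% from that set, after which ordinary unconstrained samplers apply. The unit
%% circle constraint, the random-walk Metropolis sampler (rather than the
%% paper's Hamiltonian Monte Carlo), and the relaxation value are
%% illustrative assumptions.

import numpy as np

rng = np.random.default_rng(7)
lam = 0.01                                    # relaxation hyperparameter

def log_relaxed(theta):
    """Gaussian base density times exp(-distance from {||theta|| = 1} / lam)."""
    dist = np.abs(np.linalg.norm(theta) - 1.0)
    return -0.5 * theta @ theta - dist / lam

theta, norms = np.array([1.0, 0.0]), []
for _ in range(20000):
    prop = theta + 0.1 * rng.normal(size=2)
    if np.log(rng.random()) < log_relaxed(prop) - log_relaxed(theta):
        theta = prop
    norms.append(np.linalg.norm(theta))
print(np.mean(norms))                         # concentrates near 1 as lam shrinks
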
@article{fds350128,
Author = {Aliverti, E and Tilson, JL and Filer, DL and Babcock, B and Colaneri, A and Ocasio, J and Gershon, TR and Wilhelmsen, KC and Dunson,
DB},
Title = {Projected t-SNE for batch correction.},
Journal = {Bioinformatics (Oxford, England)},
Volume = {36},
Number = {11},
Pages = {3522-3527},
Year = {2020},
Month = {June},
url = {http://dx.doi.org/10.1093/bioinformatics/btaa189},
Abstract = {Motivation: Low-dimensional representations of
high-dimensional data are routinely employed in biomedical
research to visualize, interpret and communicate results
from different pipelines. In this article, we propose a
novel procedure to directly estimate t-SNE embeddings that
are not driven by batch effects. Without correction,
interesting structure in the data can be obscured by batch
effects. The proposed algorithm can therefore significantly
aid visualization of high-dimensional data. Results: The
proposed methods are based on linear algebra and constrained
optimization, leading to efficient algorithms and fast
computation in many high-dimensional settings. Results on
artificial single-cell transcription profiling data show
that the proposed procedure successfully removes multiple
batch effects from t-SNE embeddings, while retaining
fundamental information on cell types. When applied to
single-cell gene expression data to investigate mouse
medulloblastoma, the proposed method successfully removes
batches related with mice identifiers and the date of the
experiment, while preserving clusters of oligodendrocytes,
astrocytes, and endothelial cells and microglia, which are
expected to lie in the stroma within or adjacent to the
tumours. Availability and implementation: Source code
implementing the proposed approach is available as an R
package at https://github.com/emanuelealiverti/BC_tSNE,
including a tutorial to reproduce the simulation
studies. Contact: aliverti@stat.unipd.it.},
Doi = {10.1093/bioinformatics/btaa189},
Key = {fds350128}
}
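%% A simplified stand-in for the paper's procedure: here batch effects are
%% removed by projecting the data onto the orthogonal complement of the
%% batch-indicator column space before running off-the-shelf t-SNE, whereas
%% the paper constrains the t-SNE optimization itself. Requires scikit-learn;
%% the simulated linear batch shift is illustrative.

import numpy as np
from sklearn.manifold import TSNE

rng = np.random.default_rng(8)
n, p = 300, 50
batch = rng.integers(0, 3, size=n)                   # three batches
X = rng.normal(size=(n, p)) + 2.0 * batch[:, None]   # strong linear batch shift

B = np.eye(3)[batch]                                 # n x 3 batch dummy matrix
X_adj = X - B @ np.linalg.pinv(B) @ X                # residualize on the batches

emb = TSNE(n_components=2, perplexity=30.0, init="pca").fit_transform(X_adj)
print(emb.shape)                                     # (300, 2) batch-free embedding
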
@article{fds350537,
Author = {Nishimura, A and Dunson, DB and Lu, J},
Title = {Discontinuous Hamiltonian Monte Carlo for discrete
parameters and discontinuous likelihoods},
Journal = {Biometrika},
Volume = {107},
Number = {2},
Pages = {365-380},
Year = {2020},
Month = {June},
url = {http://dx.doi.org/10.1093/biomet/asz083},
Abstract = {Hamiltonian Monte Carlo has emerged as a standard tool for
posterior computation. In this article we present an
extension that can efficiently explore target distributions
with discontinuous densities. Our extension in particular
enables efficient sampling from ordinal parameters through
the embedding of probability mass functions into continuous
spaces. We motivate our approach through a theory of
discontinuous Hamiltonian dynamics and develop a
corresponding numerical solver. The proposed solver is the
first of its kind, with a remarkable ability to exactly
preserve the Hamiltonian. We apply our algorithm to
challenging posterior inference problems to demonstrate its
wide applicability and competitive performance.},
Doi = {10.1093/biomet/asz083},
Key = {fds350537}
}
@article{fds353002,
Author = {Binette, O and Pati, D and Dunson, DB},
Title = {Bayesian closed surface fitting through tensor
products},
Journal = {Journal of Machine Learning Research},
Volume = {21},
Pages = {1-26},
Year = {2020},
Month = {July},
Abstract = {Closed surfaces provide a useful model for 3-d shapes, with
the data typically consisting of a cloud of points in $\mathbb{R}^3$.
The existing literature on closed surface modeling focuses
on frequentist point estimation methods that join surface
patches along the edges, with surface patches created via
Bezier surfaces or tensor products of B-splines. However,
the resulting surfaces are not smooth along the edges and
the geometric constraints required to join the surface
patches lead to computational drawbacks. In this article, we
develop a Bayesian model for closed surfaces based on tensor
products of a cyclic basis resulting in infinitely smooth
surface realizations. We impose sparsity on the control
points through a double-shrinkage prior. Theoretical
properties of the support of our proposed prior are studied
and it is shown that the posterior achieves the optimal rate
of convergence under reasonable assumptions on the prior.
The proposed approach is illustrated with some
examples.},
Key = {fds353002}
}
@article{fds349531,
Author = {Dunson, D and Papamarkou, T},
Title = {Discussions},
Journal = {International Statistical Review},
Volume = {88},
Number = {2},
Pages = {321-324},
Year = {2020},
Month = {August},
url = {http://dx.doi.org/10.1111/insr.12375},
Doi = {10.1111/insr.12375},
Key = {fds349531}
}
@article{fds353001,
Author = {Legramanti, S and Durante, D and Dunson, DB},
Title = {Bayesian cumulative shrinkage for infinite
factorizations.},
Journal = {Biometrika},
Volume = {107},
Number = {3},
Pages = {745-752},
Year = {2020},
Month = {September},
url = {http://dx.doi.org/10.1093/biomet/asaa008},
Abstract = {The dimension of the parameter space is typically unknown in
a variety of models that rely on factorizations. For
example, in factor analysis the number of latent factors is
not known and has to be inferred from the data. Although
classical shrinkage priors are useful in such contexts,
increasing shrinkage priors can provide a more effective
approach that progressively penalizes expansions with
growing complexity. In this article we propose a novel
increasing shrinkage prior, called the cumulative shrinkage
process, for the parameters that control the dimension in
overcomplete formulations. Our construction has broad
applicability and is based on an interpretable sequence of
spike-and-slab distributions which assign increasing mass to
the spike as the model complexity grows. Using factor
analysis as an illustrative example, we show that this
formulation has theoretical and practical advantages
relative to current competitors, including an improved
ability to recover the model dimension. An adaptive Markov
chain Monte Carlo algorithm is proposed, and the performance
gains are outlined in simulations and in an application to
personality data.},
Doi = {10.1093/biomet/asaa008},
Key = {fds353001}
}
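%% A short prior-simulation sketch of the cumulative shrinkage process as the
%% abstract describes it: stick-breaking weights accumulate into an
%% increasing spike probability, so later factor columns are progressively
%% shrunk toward the spike. All hyperparameter values are illustrative, and
%% the surrounding factor model and adaptive MCMC are omitted.

import numpy as np

rng = np.random.default_rng(9)

def cusp_column_variances(H, alpha=5.0, spike=1e-4, a=2.0, b=2.0, rng=rng):
    """Draw per-column loading variances from a cumulative shrinkage prior."""
    v = rng.beta(1.0, alpha, size=H)                   # stick-breaking pieces
    omega = v * np.concatenate([[1.0], np.cumprod(1.0 - v)[:-1]])
    pi = np.cumsum(omega)              # P(column h is spiked), increasing in h
    spiked = rng.random(H) < pi
    slab = 1.0 / rng.gamma(a, 1.0 / b, size=H)         # inverse-gamma slab
    return np.where(spiked, spike, slab), pi

variances, pi = cusp_column_variances(H=15)
print(np.round(pi, 2))                 # grows toward 1 with the column index
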
@article{fds358047,
Author = {Sen, D and Sachs, M and Lu, J and Dunson, DB},
Title = {Efficient posterior sampling for high-dimensional imbalanced
logistic regression.},
Journal = {Biometrika},
Volume = {107},
Number = {4},
Pages = {1005-1012},
Year = {2020},
Month = {December},
url = {http://dx.doi.org/10.1093/biomet/asaa035},
Abstract = {Classification with high-dimensional data is of widespread
interest and often involves dealing with imbalanced data.
Bayesian classification approaches are hampered by the fact
that current Markov chain Monte Carlo algorithms for
posterior computation become inefficient as the number
$p$ of predictors or the number $n$ of subjects to classify
gets large, because of the
increasing computational time per step and worsening mixing
rates. One strategy is to employ a gradient-based sampler to
improve mixing while using data subsamples to reduce the
per-step computational complexity. However, the usual
subsampling breaks down when applied to imbalanced data.
Instead, we generalize piecewise-deterministic Markov chain
Monte Carlo algorithms to include importance-weighted and
mini-batch subsampling. These maintain the correct
stationary distribution with arbitrarily small subsamples
and substantially outperform current competitors. We provide
theoretical support for the proposed approach and
demonstrate its performance gains in simulated data examples
and an application to cancer data.},
Doi = {10.1093/biomet/asaa035},
Key = {fds358047}
}
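%% A sketch of the unbiased-estimator ingredient named in the abstract:
%% importance-weighted subsampling that oversamples the rare class while
%% reweighting so the full-data logistic gradient is estimated without bias.
%% The piecewise-deterministic samplers the paper builds on top of such
%% estimators are not attempted; weights and sizes here are illustrative.

import numpy as np

rng = np.random.default_rng(10)
n, p = 100000, 5
X = rng.normal(size=(n, p))
beta_true = rng.normal(size=p)
y = (rng.random(n) < 1.0 / (1.0 + np.exp(-(X @ beta_true - 4.0)))).astype(float)
print("positive fraction:", y.mean())               # imbalanced classes

prob = np.where(y == 1.0, 50.0, 1.0)                # oversample the rare class
prob = prob / prob.sum()

def grad_estimate(beta, batch=256):
    """Unbiased importance-weighted estimate of the full-data gradient (a sum
    over all n observations) of the logistic log-likelihood."""
    idx = rng.choice(n, size=batch, p=prob)
    pr = 1.0 / (1.0 + np.exp(-X[idx] @ beta))
    g = (y[idx] - pr)[:, None] * X[idx] / prob[idx][:, None]
    return g.mean(axis=0)

print(grad_estimate(np.zeros(p)))
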
@article{fds351441,
Author = {Mukhopadhyay, M and Li, D and Dunson, DB},
Title = {Estimating densities with non-linear support by using
Fisher-Gaussian kernels.},
Journal = {Journal of the Royal Statistical Society. Series B,
Statistical methodology},
Volume = {82},
Number = {5},
Pages = {1249-1271},
Year = {2020},
Month = {December},
url = {http://dx.doi.org/10.1111/rssb.12390},
Abstract = {Current tools for multivariate density estimation struggle
when the density is concentrated near a non-linear subspace
or manifold. Most approaches require the choice of a kernel,
with the multivariate Gaussian kernel by far the most
commonly used. Although heavy-tailed and skewed extensions
have been proposed, such kernels cannot capture curvature in
the support of the data. This leads to poor performance
unless the sample size is very large relative to the
dimension of the data. The paper proposes a novel
generalization of the Gaussian distribution, which includes
an additional curvature parameter. We refer to the proposed
class as Fisher-Gaussian kernels, since they arise by
sampling from a von Mises-Fisher density on the sphere and
adding Gaussian noise. The Fisher-Gaussian density has an
analytic form and is amenable to straightforward
implementation within Bayesian mixture models by using
Markov chain Monte Carlo sampling. We provide theory on
large support and illustrate gains relative to competitors
in simulated and real data applications.},
Doi = {10.1111/rssb.12390},
Key = {fds351441}
}
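%% A generative Python sketch of a single Fisher-Gaussian kernel as defined
%% in the abstract: a von Mises-Fisher draw on the sphere, scaled and
%% shifted, plus Gaussian noise, so the kernel curves along a sphere rather
%% than an ellipsoid. The 3-d inversion sampler and all parameter values are
%% illustrative assumptions; the Bayesian mixture machinery is omitted.

import numpy as np

rng = np.random.default_rng(11)

def sample_vmf_sphere(mu, tau, n, rng=rng):
    """von Mises-Fisher draws on the unit 2-sphere via the inversion method."""
    u = rng.random(n)
    w = 1.0 + np.log(u + (1.0 - u) * np.exp(-2.0 * tau)) / tau  # cos(polar angle)
    phi = rng.uniform(0.0, 2.0 * np.pi, n)
    s = np.sqrt(np.clip(1.0 - w ** 2, 0.0, None))
    v = np.column_stack([s * np.cos(phi), s * np.sin(phi), w])  # mean dir = e3
    d = np.array([0.0, 0.0, 1.0]) - mu / np.linalg.norm(mu)
    if d @ d < 1e-12:
        return v
    H = np.eye(3) - 2.0 * np.outer(d, d) / (d @ d)              # reflects e3 to mu
    return v @ H

center, radius, sigma = np.array([1.0, -1.0, 0.5]), 2.0, 0.1
mu, tau = np.array([0.0, 1.0, 0.0]), 10.0
x = center + radius * sample_vmf_sphere(mu, tau, 1000) \
    + sigma * rng.normal(size=(1000, 3))
print(x.mean(axis=0))
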
@article{fds354544,
Author = {Ferrari, F and Dunson, DB},
Title = {IDENTIFYING MAIN EFFECTS AND INTERACTIONS AMONG EXPOSURES
USING GAUSSIAN PROCESSES.},
Journal = {The annals of applied statistics},
Volume = {14},
Number = {4},
Pages = {1743-1758},
Year = {2020},
Month = {December},
url = {http://dx.doi.org/10.1214/20-aoas1363},
Abstract = {This article is motivated by the problem of studying the
joint effect of different chemical exposures on human health
outcomes. This is essentially a nonparametric regression
problem, with interest being focused not on a black box for
prediction but instead on selection of main effects and
interactions. For interpretability we decompose the expected
health outcome into a linear main effect, pairwise
interactions and a nonlinear deviation. Our interest is in
model selection for these different components, accounting
for uncertainty and addressing nonidentifiability between
the linear and nonparametric components of the
semiparametric model. We propose a Bayesian approach to
inference, placing variable selection priors on the
different components, and developing a Markov chain Monte
Carlo (MCMC) algorithm. A key component of our approach is
the incorporation of a heredity constraint to only include
interactions in the presence of main effects, effectively
reducing dimensionality of the model search. We adapt a
projection approach developed in the spatial statistics
literature to enforce identifiability in modeling the
nonparametric component using a Gaussian process. We also
employ a dimension reduction strategy to sample the
nonlinear random effects that aids the mixing of the MCMC
algorithm. The proposed MixSelect framework is evaluated
using a simulation study, and is illustrated using data from
the National Health and Nutrition Examination Survey
(NHANES). Code is available on GitHub.},
Doi = {10.1214/20-aoas1363},
Key = {fds354544}
}
@article{fds355000,
Author = {Li, D and Dunson, D},
Title = {Classification via local manifold approximation},
Journal = {Biometrika},
Volume = {107},
Number = {4},
Pages = {1013-1020},
Year = {2020},
Month = {December},
url = {http://dx.doi.org/10.1093/biomet/asaa033},
Abstract = {Classifiers label data as belonging to one of a set of
groups based on input features. It is challenging to achieve
accurate classification when the feature distributions in
the different classes are complex, with nonlinear,
overlapping and intersecting supports. This is particularly
true when training data are limited. To address this
problem, we propose a new type of classifier based on
obtaining a local approximation to the support of the data
within each class in a neighbourhood of the feature to be
classified, and assigning the feature to the class having
the closest support. This general algorithm is referred to
as local manifold approximation classification. As a simple
and theoretically supported special case, which is shown to
have excellent performance across a broad variety of
examples, we use spheres for local approximation, obtaining
a spherical approximation classifier.},
Doi = {10.1093/biomet/asaa033},
Key = {fds355000}
}
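%% A compact sketch of the spherical special case described in the abstract:
%% for a query point, fit a sphere to the k nearest training points of each
%% class by algebraic least squares, then assign the class whose fitted
%% sphere passes closest. The data, k, and two-class setup are illustrative.

import numpy as np

rng = np.random.default_rng(12)

def fit_sphere(P):
    """Algebraic least-squares sphere fit; returns (center, radius)."""
    A = np.column_stack([2.0 * P, np.ones(len(P))])
    b = (P ** 2).sum(axis=1)
    sol, *_ = np.linalg.lstsq(A, b, rcond=None)
    c, t = sol[:-1], sol[-1]
    return c, np.sqrt(max(t + c @ c, 0.0))

def classify(x, class_data, k=10):
    """Assign x to the class whose local spherical approximation is closest."""
    dists = []
    for Xc in class_data:
        nn = Xc[np.argsort(((Xc - x) ** 2).sum(axis=1))[:k]]   # k nearest points
        c, r = fit_sphere(nn)
        dists.append(abs(np.linalg.norm(x - c) - r))           # distance to sphere
    return int(np.argmin(dists))

# Two curved, nearby classes: noisy arcs of circles with different radii.
t0, t1 = rng.uniform(0, np.pi, 200), rng.uniform(0, np.pi, 200)
X0 = np.column_stack([np.cos(t0), np.sin(t0)]) + 0.05 * rng.normal(size=(200, 2))
X1 = 1.5 * np.column_stack([np.cos(t1), np.sin(t1)]) + 0.05 * rng.normal(size=(200, 2))
print(classify(np.array([0.0, 1.45]), [X0, X1]))     # expected: class 1
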
@article{fds358025,
Author = {Roy, A and Dunson, DB},
Title = {Nonparametric graphical model for counts.},
Journal = {Journal of machine learning research : JMLR},
Volume = {21},
Pages = {229},
Year = {2020},
Month = {December},
Abstract = {Although multivariate count data are routinely collected in
many application areas, there is surprisingly little work
developing flexible models for characterizing their
dependence structure. This is particularly true when
interest focuses on inferring the conditional independence
graph. In this article, we propose a new class of pairwise
Markov random field-type models for the joint distribution
of a multivariate count vector. By employing a novel type of
transformation, we avoid restricting to non-negative
dependence structures or inducing other restrictions through
truncations. Taking a Bayesian approach to inference, we
choose a Dirichlet process prior for the distribution of a
random effect to induce great flexibility in the
specification. An efficient Markov chain Monte Carlo (MCMC)
algorithm is developed for posterior computation. We prove
various theoretical properties, including posterior
consistency, and show that our COunt Nonparametric Graphical
Analysis (CONGA) approach has good performance relative to
competitors in simulation studies. The methods are motivated
by an application to neuron spike count data in
mice.},
Key = {fds358025}
}
@article{fds359277,
Author = {Lee, K and Lin, L and Dunson, D},
Title = {Maximum pairwise bayes factors for covariance structure
testing},
Journal = {Electronic Journal of Statistics},
Volume = {15},
Number = {2},
Pages = {4384-4419},
Year = {2021},
Month = {January},
url = {http://dx.doi.org/10.1214/21-EJS1900},
Abstract = {Hypothesis testing of structure in covariance matrices is of
significant importance, but faces great challenges in
high-dimensional settings. Although consistent frequentist
one-sample covariance tests have been proposed, there is a
lack of simple, computationally scalable, and theoretically
sound Bayesian testing methods for large covariance
matrices. Motivated by this gap and by the need for tests
that are powerful against sparse alternatives, we propose a
novel testing framework based on the maximum pairwise Bayes
factor. Our initial focus is on one-sample covariance
testing; the proposed test can optimally distinguish null
and alternative hypotheses in a frequentist asymptotic
sense. We then propose diagonal tests and a scalable
covariance graph selection procedure that are shown to be
consistent. A simulation study evaluates the proposed
approach relative to competitors. We illustrate advantages
of our graph selection method on a gene expression data
set.},
Doi = {10.1214/21-EJS1900},
Key = {fds359277}
}
@article{fds349530,
Author = {Ferrari, F and Dunson, DB},
Title = {Bayesian Factor Analysis for Inference on
Interactions.},
Journal = {Journal of the American Statistical Association},
Volume = {116},
Number = {535},
Pages = {1521-1532},
Year = {2021},
Month = {January},
url = {http://dx.doi.org/10.1080/01621459.2020.1745813},
Abstract = {This article is motivated by the problem of inference on
interactions among chemical exposures impacting human health
outcomes. Chemicals often co-occur in the environment or in
synthetic mixtures and as a result exposure levels can be
highly correlated. We propose a latent factor joint model,
which includes shared factors in both the predictor and
response components while assuming conditional independence.
By including a quadratic regression in the latent variables
in the response component, we induce flexible dimension
reduction in characterizing main effects and interactions.
We propose a Bayesian approach to inference under this
Factor analysis for INteractions (FIN) framework. Through
appropriate modifications of the factor modeling structure,
FIN can accommodate higher order interactions. We evaluate
the performance using a simulation study and data from the
National Health and Nutrition Examination Survey (NHANES).
Code is available on GitHub.},
Doi = {10.1080/01621459.2020.1745813},
Key = {fds349530}
}
@article{fds355211,
Author = {Jauch, M and Hoff, PD and Dunson, DB},
Title = {Monte Carlo Simulation on the Stiefel Manifold via Polar
Expansion},
Journal = {Journal of Computational and Graphical Statistics},
Volume = {30},
Number = {3},
Pages = {622-631},
Year = {2021},
Month = {January},
url = {http://dx.doi.org/10.1080/10618600.2020.1859382},
Abstract = {Motivated by applications to Bayesian inference for
statistical models with orthogonal matrix parameters, we
present polar expansion, a general approach to Monte
Carlo simulation from probability distributions on the
Stiefel manifold. To bypass many of the well-established
challenges of simulating from the distribution of a random
orthogonal matrix Q, we construct a distribution for an
unconstrained random matrix X such that the orthogonal
component of the polar decomposition of X is equal in
distribution to Q. The distribution of X
is amenable to Markov chain Monte Carlo (MCMC) simulation
using standard methods, and an approximation to the
distribution of Q can be recovered from a Markov chain on
the unconstrained space. When combined with modern MCMC
software, polar expansion allows for routine and flexible
posterior inference in models with orthogonal matrix
parameters. We find that polar expansion with adaptive
Hamiltonian Monte Carlo is an order of magnitude more
efficient than competing MCMC approaches in a benchmark
protein interaction network application. We also propose a
new approach to Bayesian functional principal component
analysis which we illustrate in a meteorological time series
application. Supplementary materials for this article are
available online.},
Doi = {10.1080/10618600.2020.1859382},
Key = {fds355211}
}
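%% A few lines of Python showing the deterministic map at the heart of polar
%% expansion: the orthogonal factor of the polar decomposition, computed via
%% the SVD. When X has i.i.d. standard normal entries, Q is uniform on the
%% Stiefel manifold; in the method itself, MCMC draws of the unconstrained X
%% are pushed through this map. Dimensions below are arbitrary.

import numpy as np

rng = np.random.default_rng(13)

def polar_orthogonal(X):
    """Orthogonal factor Q of the polar decomposition X = Q (X'X)^{1/2}."""
    U, _, Vt = np.linalg.svd(X, full_matrices=False)
    return U @ Vt

X = rng.normal(size=(10, 3))             # unconstrained matrix, e.g., an MCMC draw
Q = polar_orthogonal(X)
print(np.allclose(Q.T @ Q, np.eye(3)))   # True: Q lies on the Stiefel manifold
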
@article{fds362556,
Author = {Roy, A and Borg, JS and Dunson, DB},
Title = {Bayesian time-aligned factor analysis of paired multivariate
time series.},
Journal = {Journal of machine learning research : JMLR},
Volume = {22},
Pages = {250},
Year = {2021},
Month = {January},
Abstract = {Many modern data sets require inference methods that can
estimate the shared and individual-specific components of
variability in collections of matrices that change over
time. Promising methods have been developed to analyze these
types of data in static cases, but only a few approaches are
available for dynamic settings. To address this gap, we
consider novel models and inference methods for pairs of
matrices in which the columns correspond to multivariate
observations at different time points. In order to
characterize common and individual features, we propose a
Bayesian dynamic factor modeling framework called Time
Aligned Common and Individual Factor Analysis (TACIFA) that
includes uncertainty in time alignment through an unknown
warping function. We provide theoretical support for the
proposed model, showing identifiability and posterior
concentration. The structure enables efficient computation
through a Hamiltonian Monte Carlo (HMC) algorithm. We show
excellent performance in simulations, and illustrate the
method through application to a social mimicry
experiment.},
Key = {fds362556}
}
@article{fds362586,
Author = {Papadogeorgou, G and Zhang, Z and Dunson, DB},
Title = {Soft tensor regression},
Journal = {Journal of Machine Learning Research},
Volume = {22},
Pages = {1-53},
Year = {2021},
Month = {January},
Abstract = {Statistical methods relating tensor predictors to scalar
outcomes in a regression model generally vectorize the
tensor predictor and estimate the coefficients of its
entries employing some form of regularization, use summaries
of the tensor covariate, or use a low dimensional
approximation of the coefficient tensor. However, low rank
approximations of the coefficient tensor can suffer if the
true rank is not small. We propose a tensor regression
framework which assumes a soft version of the parallel
factors (PARAFAC) approximation. In contrast to classic
PARAFAC where each entry of the coefficient tensor is the
sum of products of row-specific contributions across the
tensor modes, the soft tensor regression (Softer) framework
allows the row-specific contributions to vary around an
overall mean. We follow a Bayesian approach to inference,
and show that softening the PARAFAC increases model
flexibility, leads to improved estimation of coefficient
tensors, more accurate identification of important predictor
entries, and more precise predictions, even for a low
approximation rank. From a theoretical perspective, we show
that employing Softer leads to a weakly consistent posterior
distribution of the coefficient tensor, irrespective of the
true or approximation tensor rank, a result that is not true
when employing the classic PARAFAC for tensor regression. In
the context of our motivating application, we adapt Softer
to symmetric and semi-symmetric tensor predictors and
analyze the relationship between brain network
characteristics and human traits.},
Key = {fds362586}
}
@article{fds362587,
Author = {Duan, LL and Dunson, DB},
Title = {Bayesian Distance Clustering.},
Journal = {Journal of machine learning research : JMLR},
Volume = {22},
Pages = {224},
Year = {2021},
Month = {January},
Abstract = {Model-based clustering is widely used in a variety of
application areas. However, fundamental concerns remain
about robustness. In particular, results can be sensitive to
the choice of kernel representing the within-cluster data
density. Leveraging on properties of pairwise differences
between data points, we propose a class of Bayesian distance
clustering methods, which rely on modeling the likelihood of
the pairwise distances in place of the original data.
Although some information in the data is discarded, we gain
substantial robustness to modeling assumptions. The proposed
approach represents an appealing middle ground between
distance- and model-based clustering, drawing advantages
from each of these canonical approaches. We illustrate
dramatic gains in the ability to infer clusters that are not
well represented by the usual choices of kernel. A
simulation study is included to assess performance relative
to competitors, and we apply the approach to clustering of
brain genome expression data.},
Key = {fds362587}
}
@article{fds362760,
Author = {Zhu, Y and Li, C and Dunson, DB},
Title = {Classification Trees for Imbalanced Data: Surface-to-Volume
Regularization},
Journal = {Journal of the American Statistical Association},
Year = {2021},
Month = {January},
url = {http://dx.doi.org/10.1080/01621459.2021.2005609},
Abstract = {Classification algorithms face difficulties when one or more
classes have limited training data. We are particularly
interested in classification trees, due to their
interpretability and flexibility. When data are limited in
one or more of the classes, the estimated decision
boundaries are often irregularly shaped due to the limited
sample size, leading to poor generalization error. We
propose a novel approach that penalizes the
Surface-to-Volume Ratio (SVR) of the decision set, obtaining
a new class of SVR-Tree algorithms. We develop a simple and
computationally efficient implementation while proving
estimation consistency for SVR-Tree and rate of convergence
for an idealized empirical risk minimizer of SVR-Tree.
SVR-Tree is compared with multiple algorithms that are
designed to deal with imbalance through real data
applications. Supplementary materials for this article are
available online.},
Doi = {10.1080/01621459.2021.2005609},
Key = {fds362760}
}
@article{fds371512,
Author = {Plummer, S and Zhou, S and Bhattacharya, A and Dunson, D and Pati,
D},
Title = {Statistical Guarantees for Transformation Based Models with
Applications to Implicit Variational Inference},
Journal = {Proceedings of Machine Learning Research},
Volume = {130},
Pages = {2449-2457},
Year = {2021},
Month = {January},
Abstract = {Transformation-based methods have been an attractive
approach in non-parametric inference for problems such as
unconditional and conditional density estimation due to
their unique hierarchical structure that models the data as
flexible transformation of a set of common latent variables.
More recently, transformation-based models have been used in
variational inference (VI) to construct flexible implicit
families of variational distributions. However, their use in
both nonparametric inference and variational inference lacks
theoretical justification. We provide theoretical
justification for the use of non-linear latent variable
models (NL-LVMs) in non-parametric inference by showing that
the support of the transformation induced prior in the space
of densities is sufficiently large in the L1 sense. We also
show that, when a Gaussian process (GP) prior is placed on
the transformation function, the posterior concentrates at
the optimal rate up to a logarithmic factor. Adopting the
flexibility demonstrated in the non-parametric setting, we
use the NL-LVM to construct an implicit family of
variational distributions, deemed GP-IVI. We delineate
sufficient conditions under which GP-IVI achieves optimal
risk bounds and approximates the true posterior in the sense
of the Kullback-Leibler divergence. To the best of our
knowledge, this is the first work on providing theoretical
guarantees for implicit variational inference.},
Key = {fds371512}
}
@article{fds356939,
Author = {Paganin, S and Herring, AH and Olshan, AF and Dunson, DB and National
Birth Defects Prevention Study},
Title = {Centered Partition Processes: Informative Priors for
Clustering (with Discussion).},
Journal = {Bayesian analysis},
Volume = {16},
Number = {1},
Pages = {301-370},
Year = {2021},
Month = {March},
url = {http://dx.doi.org/10.1214/20-ba1197},
Abstract = {There is a very rich literature proposing Bayesian
approaches for clustering starting with a prior probability
distribution on partitions. Most approaches assume
exchangeability, leading to simple representations in terms
of Exchangeable Partition Probability Functions (EPPF).
Gibbs-type priors encompass a broad class of such cases,
including Dirichlet and Pitman-Yor processes. Even though
there have been some proposals to relax the exchangeability
assumption, allowing covariate-dependence and partial
exchangeability, limited consideration has been given on how
to include concrete prior knowledge on the partition. For
example, we are motivated by an epidemiological application,
in which we wish to cluster birth defects into groups and we
have prior knowledge of an initial clustering provided by
experts. As a general approach for including such prior
knowledge, we propose a Centered Partition (CP) process that
modifies the EPPF to favor partitions close to an initial
one. Some properties of the CP prior are described, a
general algorithm for posterior computation is developed,
and we illustrate the methodology through simulation
examples and an application to the motivating epidemiology
study of birth defects.},
Doi = {10.1214/20-ba1197},
Key = {fds356939}
}
@article{fds355488,
Author = {Moran, KR and Turner, EL and Dunson, D and Herring,
AH},
Title = {Bayesian hierarchical factor regression models to infer
cause of death from verbal autopsy data.},
Journal = {J R Stat Soc Ser C Appl Stat},
Volume = {70},
Number = {3},
Pages = {532-557},
Year = {2021},
Month = {June},
url = {http://dx.doi.org/10.1111/rssc.12468},
Abstract = {In low-resource settings where vital registration of death
is not routine it is often of critical interest to determine
and study the cause of death (COD) for individuals and the
cause-specific mortality fraction (CSMF) for populations.
Post-mortem autopsies, considered the gold standard for COD
assignment, are often difficult or impossible to implement
due to deaths occurring outside the hospital, expense,
and/or cultural norms. For this reason, Verbal Autopsies
(VAs) are commonly conducted, consisting of a questionnaire
administered to next of kin recording demographic
information, known medical conditions, symptoms, and other
factors for the decedent. This article proposes a novel
class of hierarchical factor regression models that avoid
restrictive assumptions of standard methods, allow both the
mean and covariance to vary with COD category, and can
include covariate information on the decedent, region, or
events surrounding death. Taking a Bayesian approach to
inference, this work develops an MCMC algorithm and
validates the FActor Regression for Verbal Autopsy (FARVA)
model in simulation experiments. An application of FARVA to
real VA data shows improved goodness-of-fit and better
predictive performance in inferring COD and CSMF over
competing methods. Code and a user manual are made available
at https://github.com/kelrenmor/farva.},
Doi = {10.1111/rssc.12468},
Key = {fds355488}
}
@article{fds365277,
Author = {van den Boom, W and Reeves, G and Dunson, DB},
Title = {Approximating posteriors with high-dimensional nuisance
parameters via integrated rotated Gaussian
approximation.},
Journal = {Biometrika},
Volume = {108},
Number = {2},
Pages = {269-282},
Year = {2021},
Month = {June},
url = {http://dx.doi.org/10.1093/biomet/asaa068},
Abstract = {Posterior computation for high-dimensional data with many
parameters can be challenging. This article focuses on a new
method for approximating posterior distributions of a low-
to moderate-dimensional parameter in the presence of a
high-dimensional or otherwise computationally challenging
nuisance parameter. The focus is on regression models and
the key idea is to separate the likelihood into two
components through a rotation. One component involves only
the nuisance parameters, which can then be integrated out
using a novel type of Gaussian approximation. We provide
theory on approximation accuracy that holds for a broad
class of forms of the nuisance component and priors.
Applying our method to simulated and real data sets shows
that it can outperform state-of-the-art posterior
approximation approaches.},
Doi = {10.1093/biomet/asaa068},
Key = {fds365277}
}
@article{fds357659,
Author = {Aliverti, E and Lum, K and Johndrow, JE and Dunson,
DB},
Title = {Removing the influence of group variables in
high-dimensional predictive modelling.},
Journal = {Journal of the Royal Statistical Society. Series A,
(Statistics in Society)},
Volume = {184},
Number = {3},
Pages = {791-811},
Year = {2021},
Month = {July},
url = {http://dx.doi.org/10.1111/rssa.12613},
Abstract = {In many application areas, predictive models are used to
support or make important decisions. There is increasing
awareness that these models may contain spurious or
otherwise undesirable correlations. Such correlations may
arise from a variety of sources, including batch effects,
systematic measurement errors, or sampling bias. Without
explicit adjustment, machine learning algorithms trained
using these data can produce poor out-of-sample predictions
which propagate these undesirable correlations. We propose a
method to pre-process the training data, producing an
adjusted dataset that is statistically independent of the
nuisance variables with minimum information loss. We develop
a conceptually simple approach for creating an adjusted
dataset in high-dimensional settings based on a constrained
form of matrix decomposition. The resulting dataset can then
be used in any predictive algorithm with the guarantee that
predictions will be statistically independent of the group
variable. We develop a scalable algorithm for implementing
the method, along with theory support in the form of
independence guarantees and optimality. The method is
illustrated on some simulation examples and applied to two
case studies: removing machine-specific correlations from
brain scan data, and removing race and ethnicity information
from a dataset used to predict recidivism. That the
motivation for removing undesirable correlations is quite
different in the two applications illustrates the broad
applicability of our approach.},
Doi = {10.1111/rssa.12613},
Key = {fds357659}
}
@article{fds362585,
Author = {Roy, A and Lavine, I and Herring, AH and Dunson, DB},
Title = {PERTURBED FACTOR ANALYSIS: ACCOUNTING FOR GROUP DIFFERENCES
IN EXPOSURE PROFILES.},
Journal = {The annals of applied statistics},
Volume = {15},
Number = {3},
Pages = {1386-1404},
Year = {2021},
Month = {September},
url = {http://dx.doi.org/10.1214/20-aoas1435},
Abstract = {In this article we investigate group differences in
phthalate exposure profiles using NHANES data. Phthalates
are a family of industrial chemicals used in plastics and as
solvents. There is increasing evidence of adverse health
effects of exposure to phthalates on reproduction and
neurodevelopment and concern about racial disparities in
exposure. We would like to identify a single set of
low-dimensional factors summarizing exposure to different
chemicals, while allowing differences across groups.
Improving on current multigroup additive factor models, we
propose a class of Perturbed Factor Analysis (PFA) models
that assume a common factor structure after perturbing the
data via multiplication by a group-specific matrix. Bayesian
inference algorithms are defined using a matrix normal
hierarchical model for the perturbation matrices. The
resulting model is just as flexible as current approaches in
allowing arbitrarily large differences across groups but has
substantial advantages that we illustrate in simulation
studies. Applying PFA to NHANES data, we learn common
factors summarizing exposures to phthalates, while showing
clear differences across groups.},
Doi = {10.1214/20-aoas1435},
Key = {fds362585}
}
@article{fds360021,
Author = {Moran, KR and Dunson, D and Wheeler, MW and Herring,
AH},
Title = {BAYESIAN JOINT MODELING OF CHEMICAL STRUCTURE AND DOSE
RESPONSE CURVES.},
Journal = {The annals of applied statistics},
Volume = {15},
Number = {3},
Pages = {1405-1430},
Year = {2021},
Month = {September},
url = {http://dx.doi.org/10.1214/21-aoas1461},
Abstract = {Today there are approximately 85,000 chemicals regulated
under the Toxic Substances Control Act, with around 2,000
new chemicals introduced each year. It is impossible to
screen all of these chemicals for potential toxic effects,
either via full organism \emph{in vivo} studies or \emph{in
vitro} high-throughput screening (HTS) programs.
Toxicologists face the challenge of choosing which chemicals
to screen, and predicting the toxicity of as yet unscreened
chemicals. Our goal is to describe how variation in chemical
structure relates to variation in toxicological response to
enable <i>in silico</i> toxicity characterization designed
to meet both of these challenges. With our Bayesian
partially Supervised Sparse and Smooth Factor Analysis
(BS<sup>3</sup>FA) model, we learn a distance between
chemicals targeted to toxicity, rather than one based on
molecular structure alone. Our model also enables the
prediction of chemical dose-response profiles based on
chemical structure (i.e., without \emph{in vivo} or \emph{in
vitro} testing) by taking advantage of a large database
of chemicals that have already been tested for toxicity in
HTS programs. We show superior simulation performance in
distance learning and modest to large gains in predictive
ability compared to existing methods. Results from the
high-throughput screening data application elucidate the
relationship between chemical structure and a
toxicity-relevant high-throughput assay. An R package for
BS3FA is available online at https://github.com/kelrenmor/bs3fa.},
Doi = {10.1214/21-aoas1461},
Key = {fds360021}
}
@article{fds357954,
Author = {Dunson, DB and Wu, HT and Wu, N},
Title = {Spectral convergence of graph Laplacian and heat kernel
reconstruction in L∞ from random
samples},
Journal = {Applied and Computational Harmonic Analysis},
Volume = {55},
Pages = {282-336},
Year = {2021},
Month = {November},
url = {http://dx.doi.org/10.1016/j.acha.2021.06.002},
Abstract = {In the manifold setting, we provide a series of spectral
convergence results quantifying how the eigenvectors and
eigenvalues of the graph Laplacian converge to the
eigenfunctions and eigenvalues of the Laplace-Beltrami
operator in the L∞ sense. Based on these results,
convergence of the proposed heat kernel approximation
algorithm, as well as the convergence rate, to the exact
heat kernel is guaranteed. To our knowledge, this is the
first work exploring the spectral convergence in the L∞
sense and providing a numerical heat kernel reconstruction
from the point cloud with theoretical guarantees.},
Doi = {10.1016/j.acha.2021.06.002},
Key = {fds357954}
}
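%% Illustration: the L-infinity spectral convergence above can be checked
%% numerically on the unit circle, where the Laplace-Beltrami eigenvalues are
%% 0, 1, 1, 4, 4, 9, 9, ... The Python sketch below uses one standard kernel
%% normalization (our illustrative choice, not necessarily the paper's).

import numpy as np

rng = np.random.default_rng(0)
n, eps = 1000, 0.01
theta = rng.uniform(0.0, 2.0 * np.pi, n)
X = np.column_stack([np.cos(theta), np.sin(theta)])  # random samples on S^1

# Gaussian kernel with diffusion-maps scaling exp(-|x - y|^2 / (4 * eps))
sq = ((X[:, None, :] - X[None, :, :]) ** 2).sum(-1)
W = np.exp(-sq / (4.0 * eps))
d = W.sum(1)

# Symmetrically normalized Laplacian; same spectrum as the random-walk one
L = (np.eye(n) - W / np.sqrt(d[:, None] * d[None, :])) / eps
evals = np.sort(np.linalg.eigvalsh(L))
print(np.round(evals[:7], 2))  # roughly [0, 1, 1, 4, 4, 9, 9]

%% Shrinking eps while growing n tightens these estimates, in line with the
%% convergence rates studied in the paper.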
@article{fds360020,
Author = {Liu, M and Zhang, Z and Dunson, DB},
Title = {Graph auto-encoding brain networks with applications to
analyzing large-scale brain imaging datasets.},
Journal = {NeuroImage},
Volume = {245},
Pages = {118750},
Year = {2021},
Month = {December},
url = {http://dx.doi.org/10.1016/j.neuroimage.2021.118750},
Abstract = {There has been a huge interest in studying human brain
connectomes inferred from different imaging modalities and
exploring their relationships with human traits, such as
cognition. Brain connectomes are usually represented as
networks, with nodes corresponding to different regions of
interest (ROIs) and edges to connection strengths between
ROIs. Due to the high-dimensionality and non-Euclidean
nature of networks, it is challenging to depict their
population distribution and relate them to human traits.
Current approaches focus on summarizing the network using
either pre-specified topological features or principal
components analysis (PCA). In this paper, building on recent
advances in deep learning, we develop a nonlinear latent
factor model to characterize the population distribution of
brain graphs and infer their relationships to human traits.
We refer to our method as Graph AuTo-Encoding (GATE). We
applied GATE to two large-scale brain imaging datasets, the
Adolescent Brain Cognitive Development (ABCD) study and the
Human Connectome Project (HCP) for adults, to study the
structural brain connectome and its relationship with
cognition. Numerical results demonstrate huge advantages of
GATE over competitors in terms of prediction accuracy,
statistical inference, and computing efficiency. We found
that the structural connectome has a stronger association
with a wide range of human cognitive traits than was
apparent using previous approaches.},
Doi = {10.1016/j.neuroimage.2021.118750},
Key = {fds360020}
}
@article{fds365276,
Author = {Badea, A and Li, D and Niculescu, AR and Anderson, RJ and Stout, JA and Williams, CL and Colton, CA and Maeda, N and Dunson,
DB},
Title = {Absolute Winding Number Differentiates Mouse Spatial
Navigation Strategies With Genetic Risk for Alzheimer's
Disease.},
Journal = {Front Neurosci},
Volume = {16},
Pages = {848654},
Year = {2022},
url = {http://dx.doi.org/10.3389/fnins.2022.848654},
Abstract = {Spatial navigation and orientation are emerging as promising
markers for altered cognition in prodromal Alzheimer's
disease, and even in cognitively normal individuals at risk
for Alzheimer's disease. The different APOE gene alleles
confer various degrees of risk. The APOE2 allele is
considered protective, APOE3 is seen as control, while APOE4
carriage is the major known genetic risk for Alzheimer's
disease. We have used mouse models carrying the three
humanized APOE alleles and tested them in a spatial memory
task in the Morris water maze. We introduce a new metric,
the absolute winding number, to characterize the spatial
search strategy, through the shape of the swim path. We show
that this metric is robust to noise, and works for small
group samples. Moreover, the absolute winding number better
differentiated APOE3 carriers, through their straighter swim
paths relative to both APOE2 and APOE4 genotypes. Finally,
this novel metric supported increased vulnerability in APOE4
females. We hypothesized differences in spatial memory and
navigation strategies are linked to differences in brain
networks, and showed that different genotypes have different
reliance on the hippocampal and caudate putamen circuits,
pointing to a role for white matter connections. Moreover,
differences were most pronounced in females. This departure
from a hippocampal centric to a brain network approach may
open avenues for identifying regions linked to increased
risk for Alzheimer's disease, before overt disease
manifestation. Further exploration of novel biomarkers based
on spatial navigation strategies may enlarge the windows of
opportunity for interventions. The proposed framework will
be significant in dissecting vulnerable circuits associated
with cognitive changes in prodromal Alzheimer's
disease.},
Doi = {10.3389/fnins.2022.848654},
Key = {fds365276}
}
@article{fds368076,
Author = {Badea, A and Li, D and Niculescu, AR and Anderson, RJ and Stout, JA and Williams, CL and Colton, CA and Maeda, N and Dunson,
DB},
Title = {Corrigendum: Absolute winding number differentiates mouse
spatial navigation strategies with genetic risk for
Alzheimer's disease.},
Journal = {Front Neurosci},
Volume = {16},
Pages = {1070425},
Year = {2022},
url = {http://dx.doi.org/10.3389/fnins.2022.1070425},
Abstract = {[This corrects the article DOI: 10.3389/fnins.2022.848654.].},
Doi = {10.3389/fnins.2022.1070425},
Key = {fds368076}
}
@article{fds362554,
Author = {Joubert, BR and Kioumourtzoglou, M-A and Chamberlain, T and Chen, HY and Gennings, C and Turyk, ME and Miranda, ML and Webster, TF and Ensor, KB and Dunson, DB and Coull, BA},
Title = {Powering Research through Innovative Methods for Mixtures in
Epidemiology (PRIME) Program: Novel and Expanded Statistical
Methods.},
Journal = {International journal of environmental research and public
health},
Volume = {19},
Number = {3},
Pages = {1378},
Year = {2022},
Month = {January},
url = {http://dx.doi.org/10.3390/ijerph19031378},
Abstract = {Humans are exposed to a diverse mixture of chemical and
non-chemical exposures across their lifetimes. Well-designed
epidemiology studies as well as sophisticated exposure
science and related technologies enable the investigation of
the health impacts of mixtures. While existing statistical
methods can address the most basic questions related to the
association between environmental mixtures and health
endpoints, there were gaps in our ability to learn from
mixtures data in several common epidemiologic scenarios,
including high correlation among health and exposure
measures in space and/or time, the presence of missing
observations, the violation of important modeling
assumptions, and the presence of computational challenges
incurred by current implementations. To address these and
other challenges, NIEHS initiated the Powering Research
through Innovative methods for Mixtures in Epidemiology
(PRIME) program, to support work on the development and
expansion of statistical methods for mixtures. Six
independent projects supported by PRIME have been highly
productive but their methods have not yet been described
collectively in a way that would inform application. We
review 37 new methods from PRIME projects and summarize the
work across previously published research questions, to
inform methods selection and increase awareness of these new
methods. We highlight important statistical advancements
considering data science strategies, exposure-response
estimation, timing of exposures, epidemiological methods,
the incorporation of toxicity/chemical information,
spatiotemporal data, risk assessment, and model performance,
efficiency, and interpretation. Importantly, we link to
software to encourage application and testing on other
datasets. This review can enable more informed analyses of
environmental mixtures. We stress training for early career
scientists as well as innovation in statistical methodology
as an ongoing need. Ultimately, we direct efforts to the
common goal of reducing harmful exposures to improve public
health.},
Doi = {10.3390/ijerph19031378},
Key = {fds362554}
}
@article{fds362728,
Author = {Peruzzi, M and Dunson, DB},
Title = {Spatial Multivariate Trees for Big Data Bayesian
Regression.},
Journal = {Journal of machine learning research : JMLR},
Volume = {23},
Pages = {17},
Year = {2022},
Month = {January},
Abstract = {High resolution geospatial data are challenging because
standard geostatistical models based on Gaussian processes
are known to not scale to large data sizes. While progress
has been made towards methods that can be computed more
efficiently, considerably less attention has been devoted to
methods for large scale data that allow the description of
complex relationships between several outcomes recorded at
high resolutions by different sensors. Our Bayesian
multivariate regression models based on spatial multivariate
trees (SpamTrees) achieve scalability via conditional
independence assumptions on latent random effects following
a treed directed acyclic graph. Information-theoretic
arguments and considerations on computational efficiency
guide the construction of the tree and the related efficient
sampling algorithms in imbalanced multivariate settings. In
addition to simulated data examples, we illustrate SpamTrees
using a large climate data set which combines satellite data
with land-based station data. Software and source code are
available on CRAN at https://CRAN.R-project.org/package=spamtree.},
Key = {fds362728}
}
@article{fds363849,
Author = {Zhang, R and Mak, S and Dunson, D},
Title = {Gaussian Process Subspace Prediction for Model
Reduction},
Journal = {SIAM Journal on Scientific Computing},
Volume = {44},
Number = {3},
Pages = {A1428-A1449},
Year = {2022},
Month = {January},
url = {http://dx.doi.org/10.1137/21M1432739},
Abstract = {Subspace-valued functions arise in a wide range of problems,
including parametric reduced order modeling (PROM),
parameter reduction, and subspace tracking. In PROM, each
parameter point can be associated with a subspace, which is
used for Petrov–Galerkin projections of large system
matrices. Previous efforts to approximate such functions use
interpolations on manifolds, which can be inaccurate and
slow. To tackle this, we propose a novel Bayesian
nonparametric model for subspace prediction: the Gaussian
process subspace (GPS) model. This method is extrinsic and
intrinsic at the same time: with multivariate Gaussian
distributions on the Euclidean space, it induces a joint
probability model on the Grassmann manifold, the set of
fixed-dimensional subspaces. The GPS adopts a simple yet
general correlation structure, and a principled approach for
model selection. Its predictive distribution admits an
analytical form, which allows for efficient subspace
prediction over the parameter space. For PROM, the GPS
provides a probabilistic prediction at a new parameter point
that retains the accuracy of local reduced models, at a
computational complexity that does not depend on system
dimension, and thus is suitable for online computation. We
give four numerical examples to compare our method to
subspace interpolation, as well as two methods that
interpolate local reduced models. Overall, GPS is the most
data efficient, more computationally efficient than subspace
interpolation, and gives smooth predictions with uncertainty
quantification.},
Doi = {10.1137/21M1432739},
Key = {fds363849}
}
@article{fds362826,
Author = {van den Boom, W and Reeves, G and Dunson, DB},
Title = {Erratum: Approximating posteriors with high-dimensional
nuisance parameters via integrated rotated Gaussian
approximation (Biometrika (2021) 108 (269-282) DOI:
10.1093/biomet/asaa068)},
Journal = {Biometrika},
Volume = {109},
Number = {1},
Pages = {275},
Year = {2022},
Month = {March},
url = {http://dx.doi.org/10.1093/biomet/asab019},
Abstract = {In the main paper under subsection '3.2. Bayesian variable
selection', all references to '5.2' should read: '3.1'.
Under subsection '5.2. Bayesian variable selection', the
reference to '5.3 and 6' should read: 'S5.3 and S6'. These
errors have now been corrected.},
Doi = {10.1093/biomet/asab019},
Key = {fds362826}
}
@article{fds365275,
Author = {Russo, M and Singer, BH and Dunson, DB},
Title = {Multivariate Mixed Membership Modeling: Inferring
Domain-Specific Risk Profiles.},
Journal = {The annals of applied statistics},
Volume = {16},
Number = {1},
Pages = {391-413},
Year = {2022},
Month = {March},
url = {http://dx.doi.org/10.1214/21-aoas1496},
Abstract = {Characterizing the shared memberships of individuals in a
classification scheme poses severe interpretability issues,
even when using a moderate number of classes (say 4). Mixed
membership models quantify this phenomenon, but they
typically focus on goodness-of-fit more than on
interpretable inference. To achieve a good numerical fit,
these models may in fact require many extreme profiles,
making the results difficult to interpret. We introduce a
new class of multivariate mixed membership models that, when
variables can be partitioned into subject-matter based
domains, can provide a good fit to the data using fewer
profiles than standard formulations. The proposed model
explicitly accounts for the blocks of variables
corresponding to the distinct domains along with a
cross-domain correlation structure, which provides new
information about shared membership of individuals in a
complex classification scheme. We specify a multivariate
logistic normal distribution for the membership vectors,
which allows easy introduction of auxiliary information
leveraging a latent multivariate logistic regression. A
Bayesian approach to inference, relying on Pólya gamma data
augmentation, facilitates efficient posterior computation
via Markov Chain Monte Carlo. We apply this methodology to a
spatially explicit study of malaria risk over time on the
Brazilian Amazon frontier.},
Doi = {10.1214/21-aoas1496},
Key = {fds365275}
}
@article{fds362555,
Author = {Dunson, DB and Wu, HT and Wu, N},
Title = {Graph based Gaussian processes on restricted
domains},
Journal = {Journal of the Royal Statistical Society. Series B:
Statistical Methodology},
Volume = {84},
Number = {2},
Pages = {414-439},
Year = {2022},
Month = {April},
url = {http://dx.doi.org/10.1111/rssb.12486},
Abstract = {In nonparametric regression, it is common for the inputs to
fall in a restricted subset of Euclidean space. Typical
kernel-based methods that do not take into account the
intrinsic geometry of the domain across which observations
are collected may produce sub-optimal results. In this
article, we focus on solving this problem in the context of
Gaussian process (GP) models, proposing a new class of Graph
Laplacian based GPs (GL-GPs), which learn a covariance that
respects the geometry of the input domain. As the heat
kernel is intractable computationally, we approximate the
covariance using finitely-many eigenpairs of the Graph
Laplacian (GL). The GL is constructed from a kernel which
depends only on the Euclidean coordinates of the inputs.
Hence, we can benefit from the full knowledge about the
kernel to extend the covariance structure to newly arriving
samples by a Nyström type extension. We provide substantial
theoretical support for the GL-GP methodology, and
illustrate performance gains in various applications.},
Doi = {10.1111/rssb.12486},
Key = {fds362555}
}
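%% Illustration: a minimal GL-GP-style fit. As in the abstract, the
%% intractable heat-kernel covariance is approximated with finitely many
%% eigenpairs of a graph Laplacian built from the inputs. The truncation
%% level K, diffusion time t, and noise variance are illustrative choices of
%% ours, not the paper's settings.

import numpy as np

rng = np.random.default_rng(1)
n = 400
x = np.sort(rng.uniform(-1.0, 1.0, n))  # inputs on a restricted domain
W = np.exp(-((x[:, None] - x[None, :]) ** 2) / 0.01)
d = W.sum(1)
L = np.eye(n) - W / np.sqrt(d[:, None] * d[None, :])  # graph Laplacian

lam, phi = np.linalg.eigh(L)  # eigenpairs, eigenvalues ascending
K, t, sigma2 = 30, 0.5, 0.05
C = (phi[:, :K] * np.exp(-t * lam[:K])).dot(phi[:, :K].T)  # truncated heat kernel

y = np.sin(3.0 * np.pi * x) + np.sqrt(sigma2) * rng.standard_normal(n)
f_hat = C.dot(np.linalg.solve(C + sigma2 * np.eye(n), y))  # GP posterior mean
print(float(np.mean((f_hat - np.sin(3.0 * np.pi * x)) ** 2)))  # fit error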
@article{fds362729,
Author = {Lum, K and Dunson, DB and Johndrow, J},
Title = {Closer than they appear: A Bayesian perspective on
individual-level heterogeneity in risk assessment},
Journal = {Journal of the Royal Statistical Society. Series A:
Statistics in Society},
Volume = {185},
Number = {2},
Pages = {588-614},
Year = {2022},
Month = {April},
url = {http://dx.doi.org/10.1111/rssa.12792},
Abstract = {Risk assessment instruments are used across the criminal
justice system to estimate the probability of some future
event, such as failure to appear for a court appointment or
re-arrest. The estimated probabilities are then used in
making decisions at the individual level. In the past, there
has been controversy about whether the probabilities derived
from group-level calculations can meaningfully be applied to
individuals. Using Bayesian hierarchical models applied to a
large longitudinal dataset from the court system in the
state of Kentucky, we analyse variation in individual-level
probabilities of failing to appear for court and the extent
to which it is captured by covariates. We find that
individuals within the same risk group vary widely in their
probability of the outcome. In practice, this means that
allocating individuals to risk groups based on standard
approaches to risk assessment, in large part, results in
creating distinctions among individuals who are not
meaningfully different in terms of their likelihood of the
outcome. This is because uncertainty about the probability
that any particular individual will fail to appear is large
relative to the difference in average probabilities among
any reasonable set of risk groups.},
Doi = {10.1111/rssa.12792},
Key = {fds362729}
}
@article{fds362990,
Author = {Guha, S and Jung, R and Dunson, D},
Title = {Predicting phenotypes from brain connection
structure},
Journal = {Journal of the Royal Statistical Society. Series C: Applied
Statistics},
Volume = {71},
Number = {3},
Pages = {639-668},
Year = {2022},
Month = {June},
url = {http://dx.doi.org/10.1111/rssc.12549},
Abstract = {This article focuses on the problem of predicting a response
variable based on a network-valued predictor. Our motivation
is the development of interpretable and accurate predictive
models for cognitive traits and neuro-psychiatric disorders
based on an individual's brain connection network
(connectome). Current methods reduce the complex,
high-dimensional brain network into low-dimensional
pre-specified features prior to applying standard predictive
algorithms. These methods are sensitive to feature choice
and inevitably discard important information. Instead, we
propose a nonparametric Bayes class of models that utilize
the entire adjacency matrix defining brain region
connections to adaptively detect predictive algorithms,
while maintaining interpretability. The Bayesian
Connectomics (BaCon) model class utilizes
Poisson–Dirichlet processes to find a lower dimensional,
bidirectional (covariate, subject) pattern in the adjacency
matrix. The small n, large p problem is transformed into a
‘small n, small q’ problem, facilitating an effective
stochastic search of the predictors. A spike-and-slab prior
for the cluster predictors strikes a balance between
regression model parsimony and flexibility, resulting in
improved inferences and test case predictions. We describe
basic properties of the BaCon model and develop efficient
algorithms for posterior computation. The resulting methods
are found to outperform existing approaches and applied to a
creative reasoning dataset.},
Doi = {10.1111/rssc.12549},
Key = {fds362990}
}
@article{fds365274,
Author = {Aliverti, E and Dunson, DB},
Title = {Composite Mixture of Log-Linear Models with Application to
Psychiatric Studies.},
Journal = {The annals of applied statistics},
Volume = {16},
Number = {2},
Pages = {765-790},
Year = {2022},
Month = {June},
url = {http://dx.doi.org/10.1214/21-aoas1515},
Abstract = {Psychiatric studies of suicide provide fundamental insights
on the evolution of severe psychopathologies, and contribute
to the development of early treatment interventions. Our
focus is on modelling different traits of psychosis and
their interconnections, focusing on a case study on suicide
attempt survivors. Such aspects are recorded via
multivariate categorical data, involving a large number of
items for multiple subjects. Current methods for
multivariate categorical data-such as penalized log-linear
models and latent structure analysis-are either limited to
low-dimensional settings or include parameters with
difficult interpretation. Motivated by this application,
this article proposes a new class of approaches, which we
refer to as Mixture of Log Linear models (mills). Combining
latent class analysis and log-linear models, mills defines a
novel Bayesian approach to model complex multivariate
categorical data with flexibility and interpretability,
providing interesting insights on the relationship between
psychotic diseases and psychological aspects in suicide
attempt survivors.},
Doi = {10.1214/21-aoas1515},
Key = {fds365274}
}
@article{fds365798,
Author = {Dey, P and Zhang, Z and Dunson, DB},
Title = {Outlier detection for multi-network data.},
Journal = {Bioinformatics (Oxford, England)},
Volume = {38},
Number = {16},
Pages = {4011-4018},
Year = {2022},
Month = {August},
url = {http://dx.doi.org/10.1093/bioinformatics/btac431},
Abstract = {Motivation: It has become routine in neuroscience
studies to measure brain networks for different individuals
using neuroimaging. These networks are typically expressed
as adjacency matrices, with each cell containing a summary
of connectivity between a pair of brain regions. There is an
emerging statistical literature describing methods for the
analysis of such multi-network data in which nodes are
common across networks but the edges vary. However, there
has been essentially no consideration of the important
problem of outlier detection. In particular, for certain
subjects, the neuroimaging data are so poor quality that the
network cannot be reliably reconstructed. For such subjects,
the resulting adjacency matrix may be mostly zero or exhibit
a bizarre pattern not consistent with a functioning brain.
These outlying networks may serve as influential points,
contaminating subsequent statistical analyses. We propose a
simple Outlier DetectIon for Networks (ODIN) method relying
on an influence measure under a hierarchical generalized
linear model for the adjacency matrices. An efficient
computational algorithm is described, and ODIN is
illustrated through simulations and an application to data
from the UK Biobank. Results: ODIN was successful in
identifying moderate to extreme outliers. Removing such
outliers can significantly change inferences in downstream
applications. Availability and implementation: ODIN
has been implemented in both Python and R and these
implementations along with other code are publicly available
at github.com/pritamdey/ODIN-python and github.com/pritamdey/ODIN-r,
respectively. Supplementary information: Supplementary
data are available at Bioinformatics online.},
Doi = {10.1093/bioinformatics/btac431},
Key = {fds365798}
}
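%% Toy stand-in: ODIN itself uses an influence measure under a hierarchical
%% generalized linear model (see the repositories above). The sketch below
%% only mimics the screening step, scoring each subject's network by a robust
%% z-score of its Frobenius distance from the pooled mean network; all
%% settings and the corruption mechanism are made up.

import numpy as np

rng = np.random.default_rng(2)
n_sub, n_nodes = 100, 20
P = rng.uniform(0.1, 0.3, (n_nodes, n_nodes))
P = (P + P.T) / 2.0  # shared edge probabilities
A = (rng.uniform(size=(n_sub, n_nodes, n_nodes)) < P).astype(float)
A = np.triu(A, 1)
A = A + A.transpose(0, 2, 1)  # symmetric binary networks
A[:3] = 1.0 - np.eye(n_nodes)  # three corrupted scans: implausibly complete

dist = np.linalg.norm(A - A.mean(axis=0), axis=(1, 2))  # Frobenius distances
med = np.median(dist)
mad = np.median(np.abs(dist - med))
z = 0.6745 * (dist - med) / mad  # robust z-scores
print(np.where(z > 3.5)[0])  # flagged subjects; expect [0 1 2]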
@article{fds365239,
Author = {Chakraborty, A and Ovaskainen, O and Dunson, DB},
Title = {Bayesian Semiparametric Long Memory Models for Discretized
Event Data.},
Journal = {The annals of applied statistics},
Volume = {16},
Number = {3},
Pages = {1380-1399},
Year = {2022},
Month = {September},
url = {http://dx.doi.org/10.1214/21-aoas1546},
Abstract = {We introduce a new class of semiparametric latent variable
models for long memory discretized event data. The proposed
methodology is motivated by a study of bird vocalizations in
the Amazon rain forest; the timings of vocalizations exhibit
self-similarity and long range dependence. This rules out
Poisson process based models where the rate function itself
is not long range dependent. The proposed class of
FRActional Probit (FRAP) models is based on thresholding a
latent process. This latent process is modeled as the sum of
a smooth Gaussian process and a fractional Brownian motion.
We develop a Bayesian
approach to inference using Markov chain Monte Carlo and
show good performance in simulation studies. Applying the
methods to the Amazon bird vocalization data, we find
substantial evidence for self-similarity and
non-Markovian/Poisson dynamics. To accommodate the bird
vocalization data in which there are many different species
of birds exhibiting their own vocalization dynamics, a
hierarchical expansion of FRAP is provided in the
Supplementary Material.},
Doi = {10.1214/21-aoas1546},
Key = {fds365239}
}
@article{fds367231,
Author = {Melikechi, O and Young, AL and Tang, T and Bowman, T and Dunson, D and Johndrow, J},
Title = {Limits of epidemic prediction using SIR models.},
Journal = {Journal of mathematical biology},
Volume = {85},
Number = {4},
Pages = {36},
Year = {2022},
Month = {September},
url = {http://dx.doi.org/10.1007/s00285-022-01804-5},
Abstract = {The Susceptible-Infectious-Recovered (SIR) equations and
their extensions comprise a commonly utilized set of models
for understanding and predicting the course of an epidemic.
In practice, it is of substantial interest to estimate the
model parameters based on noisy observations early in the
outbreak, well before the epidemic reaches its peak. This
allows prediction of the subsequent course of the epidemic
and design of appropriate interventions. However, accurately
inferring SIR model parameters in such scenarios is
problematic. This article provides novel, theoretical
insight on this issue of practical identifiability of the
SIR model. Our theory provides new understanding of the
inferential limits of routinely used epidemic models and
provides a valuable addition to current simulate-and-check
methods. We illustrate some practical implications through
application to a real-world epidemic data
set.},
Doi = {10.1007/s00285-022-01804-5},
Key = {fds367231}
}
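%% Illustration of the practical-identifiability issue: from early data
%% alone, SIR parameter pairs sharing the same initial growth rate
%% beta - gamma are nearly indistinguishable. The values below are
%% illustrative, not the paper's.

import numpy as np
from scipy.integrate import solve_ivp

def sir(t, y, beta, gamma):
    S, I, R = y
    return [-beta * S * I, beta * S * I - gamma * I, gamma * I]

y0 = [0.999, 0.001, 0.0]
t_eval = np.linspace(0.0, 20.0, 21)
for beta, gamma in [(0.50, 0.25), (0.65, 0.40)]:  # both: beta - gamma = 0.25
    sol = solve_ivp(sir, (0.0, 20.0), y0, t_eval=t_eval, args=(beta, gamma))
    # infected fraction at t = 5, 10, 20: near-identical early, diverging later
    print(beta, gamma, np.round(sol.y[1][[5, 10, 20]], 4))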
@article{fds370544,
Author = {Schiavon, L and Canale, A and Dunson, DB},
Title = {Generalized infinite factorization models.},
Journal = {Biometrika},
Volume = {109},
Number = {3},
Pages = {817-835},
Year = {2022},
Month = {September},
url = {http://dx.doi.org/10.1093/biomet/asab056},
Abstract = {Factorization models express a statistical object of
interest in terms of a collection of simpler objects. For
example, a matrix or tensor can be expressed as a sum of
rank-one components. However, in practice, it can be
challenging to infer the relative impact of the different
components as well as the number of components. A popular
idea is to include infinitely many components having impact
decreasing with the component index. This article is
motivated by two limitations of existing methods: (1) lack
of careful consideration of the within-component sparsity
structure; and (2) no accommodation for grouped variables
and other non-exchangeable structures. We propose a general
class of infinite factorization models that address these
limitations. Theoretical support is provided, practical
gains are shown in simulation studies, and an ecology
application focusing on modelling bird species occurrence is
discussed.},
Doi = {10.1093/biomet/asab056},
Key = {fds370544}
}
@article{fds367372,
Author = {Legramanti, S and Rigon, T and Durante, D and Dunson,
DB},
Title = {Extended Stochastic Block Models with Application to
Criminal Networks.},
Journal = {The annals of applied statistics},
Volume = {16},
Number = {4},
Pages = {2369-2395},
Year = {2022},
Month = {December},
url = {http://dx.doi.org/10.1214/21-aoas1595},
Abstract = {Reliably learning group structures among nodes in network
data is challenging in several applications. We are
particularly motivated by studying covert networks that
encode relationships among criminals. These data are subject
to measurement errors, and exhibit a complex combination of
an unknown number of core-periphery, assortative and
disassortative structures that may unveil key architectures
of the criminal organization. The coexistence of these noisy
block patterns limits the reliability of routinely-used
community detection algorithms, and requires extensions of
model-based solutions to realistically characterize the node
partition process, incorporate information from node
attributes, and provide improved strategies for estimation
and uncertainty quantification. To cover these gaps, we
develop a new class of extended stochastic block models
(esbm) that infer groups of nodes having common connectivity
patterns via Gibbs-type priors on the partition process.
This choice encompasses many realistic priors for criminal
networks, covering solutions with fixed, random and infinite
number of possible groups, and facilitates the inclusion of
node attributes in a principled manner. Among the new
alternatives in our class, we focus on the Gnedin process as
a realistic prior that allows the number of groups to be
finite, random and subject to a reinforcement process
coherent with criminal networks. A collapsed Gibbs sampler
is proposed for the whole esbm class, and refined strategies
for estimation, prediction, uncertainty quantification and
model selection are outlined. The esbm performance is
illustrated in realistic simulations and in an application
to an Italian mafia network, where we unveil key complex
block structures, mostly hidden from state-of-the-art
alternatives.},
Doi = {10.1214/21-aoas1595},
Key = {fds367372}
}
@article{fds370635,
Author = {Young, AL and van den Boom, W and Schroeder, RA and Krishnamoorthy,
V and Raghunathan, K and Wu, H-T and Dunson, DB},
Title = {Mutual information: Measuring nonlinear dependence in
longitudinal epidemiological data.},
Journal = {PLoS One},
Volume = {18},
Number = {4},
Pages = {e0284904},
Year = {2023},
url = {http://dx.doi.org/10.1371/journal.pone.0284904},
Abstract = {Given a large clinical database of longitudinal patient
information including many covariates, it is computationally
prohibitive to consider all types of interdependence between
patient variables of interest. This challenge motivates the
use of mutual information (MI), a statistical summary of
data interdependence with appealing properties that make it
a suitable alternative or addition to correlation for
identifying relationships in data. MI: (i) captures all
types of dependence, both linear and nonlinear, (ii) is zero
only when random variables are independent, (iii) serves as
a measure of relationship strength (similar to but more
general than R2), and (iv) is interpreted the same way for
numerical and categorical data. Unfortunately, MI typically
receives little to no attention in introductory statistics
courses and is more difficult than correlation to estimate
from data. In this article, we motivate the use of MI in the
analyses of epidemiologic data, while providing a general
introduction to estimation and interpretation. We illustrate
its utility through a retrospective study relating
intraoperative heart rate (HR) and mean arterial pressure
(MAP). We: (i) show postoperative mortality is associated
with decreased MI between HR and MAP and (ii) improve
existing postoperative mortality risk assessment by
including MI and additional hemodynamic statistics.},
Doi = {10.1371/journal.pone.0284904},
Key = {fds370635}
}
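%% Illustration: a minimal plug-in estimate of mutual information for two
%% discretized variables, the kind of summary the abstract advocates. The
%% synthetic HR/MAP pair, the binning, and the sample size are our own
%% illustrative choices, not those of the study.

import numpy as np

rng = np.random.default_rng(3)
hr = rng.normal(70.0, 10.0, 5000)  # synthetic heart rate
map_ = 0.5 * hr + rng.normal(40.0, 5.0, 5000)  # synthetic HR-dependent MAP

counts, _, _ = np.histogram2d(hr, map_, bins=10)  # joint cell counts
p = counts / counts.sum()
px = p.sum(axis=1, keepdims=True)  # marginals
py = p.sum(axis=0, keepdims=True)
nz = p > 0
mi = float(np.sum(p[nz] * np.log(p[nz] / px.dot(py)[nz])))
print(f"plug-in MI estimate: {mi:.3f} nats")  # positive under dependence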
@article{fds365095,
Author = {Zito, A and Rigon, T and Ovaskainen, O and Dunson,
DB},
Title = {Bayesian Modeling of Sequential Discoveries.},
Journal = {Journal of the American Statistical Association},
Volume = {118},
Number = {544},
Pages = {2521-2532},
Year = {2023},
Month = {January},
url = {http://dx.doi.org/10.1080/01621459.2022.2060835},
Abstract = {We aim at modeling the appearance of distinct tags in a
sequence of labeled objects. Common examples of this type of
data include words in a corpus or distinct species in a
sample. These sequential discoveries are often summarized
via accumulation curves, which count the number of distinct
entities observed in an increasingly large set of objects.
We propose a novel Bayesian method for species sampling
modeling by directly specifying the probability of a new
discovery, therefore, allowing for flexible specifications.
The asymptotic behavior and finite sample properties of such
an approach are extensively studied. Interestingly, our
enlarged class of sequential processes includes highly
tractable special cases. We present a subclass of models
characterized by appealing theoretical and computational
properties, including one that shares the same discovery
probability with the Dirichlet process. Moreover, due to
strong connections with logistic regression models, the
latter subclass can naturally account for covariates. We
finally test our proposal on both synthetic and real data,
with special emphasis on a large fungal biodiversity study
in Finland. Supplementary materials for this article are
available online.},
Doi = {10.1080/01621459.2022.2060835},
Key = {fds365095}
}
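%% Illustration: simulating sequential discoveries under the
%% Dirichlet-process special case mentioned above, where a new label appears
%% after n draws with probability alpha / (alpha + n), then computing the
%% accumulation curve. alpha and the sample size are illustrative.

import numpy as np

rng = np.random.default_rng(4)
alpha, n = 5.0, 2000
labels, n_seen = [], 0
for i in range(n):
    if rng.uniform() < alpha / (alpha + i):  # probability of a new discovery
        n_seen += 1
        labels.append(n_seen)
    else:  # otherwise repeat a past label, chosen uniformly over prior draws
        labels.append(labels[rng.integers(i)])

seen, accum = set(), []
for lab in labels:  # accumulation curve: distinct labels observed so far
    seen.add(lab)
    accum.append(len(seen))
print(accum[::500], "vs DP mean ~", round(alpha * np.log(1.0 + n / alpha), 1))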
@article{fds371511,
Author = {Papadogeorgou, G and Bello, C and Ovaskainen, O and Dunson,
DB},
Title = {Covariate-Informed Latent Interaction Models: Addressing
Geographic & Taxonomic Bias in Predicting Bird–Plant
Interactions},
Journal = {Journal of the American Statistical Association},
Volume = {118},
Number = {544},
Pages = {2250-2261},
Year = {2023},
Month = {January},
url = {http://dx.doi.org/10.1080/01621459.2023.2208390},
Abstract = {Reductions in natural habitats urge that we better
understand species’ interconnection and how biological
communities respond to environmental changes. However,
ecological studies of species’ interactions are limited by
their geographic and taxonomic focus which can distort our
understanding of interaction dynamics. We focus on
bird–plant interactions that refer to situations of
potential fruit consumption and seed dispersal. We develop
an approach for predicting species’ interactions that
accounts for errors in the recorded interaction networks,
addresses the geographic and taxonomic biases of existing
studies, is based on latent factors to increase flexibility
and borrow information across species, incorporates
covariates in a flexible manner to inform the latent
factors, and uses a meta-analysis dataset from 85 individual
studies. We focus on interactions among 232 birds and 511
plants in the Atlantic Forest, and identify 5% of pairs of
species with an unrecorded interaction but a posterior
probability above 80% that the interaction is possible.
Finally, we develop a permutation-based variable importance
procedure for latent factor network models and identify that
a bird’s body mass and a plant’s fruit diameter are
important in driving the presence of species interactions,
with a multiplicative relationship that exhibits both a
thresholding and a matching behavior. Supplementary
materials for this article are available
online.},
Doi = {10.1080/01621459.2023.2208390},
Key = {fds371511}
}
@article{fds372788,
Author = {Barrientos, AF and Sen, D and Page, GL and Dunson,
DB},
Title = {Bayesian Inferences on Uncertain Ranks and Orderings:
Application to Ranking Players and Lineups},
Journal = {Bayesian Analysis},
Volume = {18},
Number = {3},
Pages = {777-806},
Year = {2023},
Month = {January},
url = {http://dx.doi.org/10.1214/22-BA1324},
Abstract = {It is common to be interested in rankings or order
relationships among entities. In complex settings where one
does not directly measure a univariate statistic upon which
to base ranks, such inferences typically rely on statistical
models having entity-specific parameters. These can be
treated as random effects in hierarchical models
characterizing variation among the entities. In this paper,
we are particularly interested in the problem of ranking
basketball players in terms of their contribution to team
performance. Using data from the National Basketball
Association (NBA) in the United States, we find that many
players have similar latent ability levels, making any
single estimated ranking highly misleading. The current
literature fails to provide summaries of order relationships
that adequately account for uncertainty. Motivated by this,
we propose a Bayesian strategy for characterizing
uncertainty in inferences on order relationships among
players and lineups. Our approach adapts to scenarios in
which uncertainty in ordering is high by producing more
conservative results that improve interpretability. This is
achieved through a reward function within a decision
theoretic framework. We apply our approach to data from the
2009–2010 NBA season.},
Doi = {10.1214/22-BA1324},
Key = {fds372788}
}
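%% Illustration: why a single estimated ranking misleads when latent
%% abilities are similar. Given hypothetical (made-up) posterior means and
%% standard deviations for four players, Monte Carlo draws induce a
%% distribution over ranks; the paper's decision-theoretic summaries are not
%% implemented here.

import numpy as np

rng = np.random.default_rng(5)
mu = np.array([0.00, 0.05, 0.10, 0.50])  # hypothetical posterior means
sd = np.full(4, 0.20)  # hypothetical posterior sds
draws = rng.normal(mu, sd, size=(10000, 4))  # posterior draws of abilities
ranks = (-draws).argsort(axis=1).argsort(axis=1) + 1  # rank 1 = best, per draw

for j in range(4):
    print(f"player {j}: P(rank 1) = {np.mean(ranks[:, j] == 1):.2f}")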
@article{fds372789,
Author = {Sachs, M and Sen, D and Lu, J and Dunson, D},
Title = {Posterior Computation with the Gibbs Zig-Zag
Sampler},
Journal = {Bayesian Analysis},
Volume = {18},
Number = {3},
Pages = {909-927},
Year = {2023},
Month = {January},
url = {http://dx.doi.org/10.1214/22-BA1319},
Abstract = {An intriguing new class of piecewise deterministic Markov
processes (PDMPs) has recently been proposed as an
alternative to Markov chain Monte Carlo (MCMC). We propose a
new class of PDMPs termed Gibbs zig-zag samplers, which
allow parameters to be updated in blocks with a zig-zag
sampler applied to certain parameters and traditional
MCMC-style updates to others. We demonstrate the flexibility
of this framework on posterior sampling for logistic models
with shrinkage priors for high-dimensional regression and
random effects, and provide conditions for geometric
ergodicity and the validity of a central limit
theorem.},
Doi = {10.1214/22-BA1319},
Key = {fds372789}
}
@article{fds373933,
Author = {Li, D and Nguyen, P and Zhang, Z and Dunson, D},
Title = {Tree representations of brain structural connectivity via
persistent homology.},
Journal = {Frontiers in neuroscience},
Volume = {17},
Pages = {1200373},
Year = {2023},
Month = {January},
url = {http://dx.doi.org/10.3389/fnins.2023.1200373},
Abstract = {The brain structural connectome is generated by a collection
of white matter fiber bundles constructed from diffusion
weighted MRI (dMRI), acting as highways for neural activity.
There has been abundant interest in studying how the
structural connectome varies across individuals in relation
to their traits, ranging from age and gender to
neuropsychiatric outcomes. After applying tractography to
dMRI to get white matter fiber bundles, a key question is
how to represent the brain connectome to facilitate
statistical analyses relating connectomes to traits. The
current standard divides the brain into regions of interest
(ROIs), and then relies on an adjacency matrix (AM)
representation. Each cell in the AM is a measure of
connectivity, e.g., number of fiber curves, between a pair
of ROIs. Although the AM representation is intuitive, a
disadvantage is the high-dimensionality due to the large
number of cells in the matrix. This article proposes a
simpler tree representation of the brain connectome, which
is motivated by ideas in computational topology and takes
topological and biological information on the cortical
surface into consideration. We demonstrate that our tree
representation preserves useful information and
interpretability, while reducing dimensionality to improve
statistical and computational efficiency. Applications to
data from the Human Connectome Project (HCP) are considered
and code is provided for reproducing our
analyses.},
Doi = {10.3389/fnins.2023.1200373},
Key = {fds373933}
}
@article{fds374277,
Author = {Chakraborty, A and Ou, R and Dunson, DB},
Title = {Bayesian Inference on High-Dimensional Multivariate Binary
Responses},
Journal = {Journal of the American Statistical Association},
Year = {2023},
Month = {January},
url = {http://dx.doi.org/10.1080/01621459.2023.2260053},
Abstract = {It has become increasingly common to collect
high-dimensional binary response data; for example, with the
emergence of new sampling techniques in ecology. In smaller
dimensions, multivariate probit (MVP) models are routinely
used for inferences. However, algorithms for fitting such
models face issues in scaling up to high dimensions due to
the intractability of the likelihood, involving an integral
over a multivariate normal distribution having no analytic
form. Although a variety of algorithms have been proposed to
approximate this intractable integral, these approaches are
difficult to implement and/or inaccurate in high dimensions.
Our main focus is in accommodating high-dimensional binary
response data with a small-to-moderate number of covariates.
We propose a two-stage approach for inference on model
parameters while taking care of uncertainty propagation
between the stages. We use the special structure of latent
Gaussian models to reduce the highly expensive computation
involved in joint parameter estimation to focus inference on
marginal distributions of model parameters. This essentially
makes the method embarrassingly parallel for both stages. We
illustrate performance in simulations and applications to
joint species distribution modeling in ecology.
Supplementary materials for this article are available
online.},
Doi = {10.1080/01621459.2023.2260053},
Key = {fds374277}
}
@article{fds368122,
Author = {Zito, A and Rigon, T and Dunson, DB},
Title = {Inferring taxonomic placement from DNA barcoding aiding in
discovery of new taxa},
Journal = {Methods in Ecology and Evolution},
Volume = {14},
Number = {2},
Pages = {529-542},
Year = {2023},
Month = {February},
url = {http://dx.doi.org/10.1111/2041-210X.14009},
Abstract = {Predicting the taxonomic affiliation of DNA sequences
collected from biological samples is a fundamental step in
biodiversity assessment. This task is performed by
leveraging existing databases containing reference DNA
sequences endowed with a taxonomic identification. However,
environmental sequences can be from organisms that are
either unknown to science or for which there are no
reference sequences available. Thus, taxonomic novelty of a
sequence needs to be accounted for when doing
classification. We propose Bayesian nonparametric taxonomic
classifiers, BayesANT, which use species sampling model
priors to allow unobserved taxa to be discovered at each
taxonomic rank. Using a simple product multinomial
likelihood with conjugate Dirichlet priors at the lowest
rank, a highly flexible supervised algorithm is developed to
provide a probabilistic prediction of the taxa placement of
each sequence at each rank. As an illustration, we run our
algorithm on a carefully annotated library of Finnish
arthropods (FinBOL). To assess the ability of BayesANT to
recognize novelty and to predict known taxonomic
affiliations correctly, we test it on two training-test
splitting scenarios, each with a different proportion of
taxa unobserved in training. We show how our algorithm
attains accurate predictions and reliably quantifies
classification uncertainty, especially when many sequences
in the test set are affiliated to taxa unknown in training.
By enabling taxonomic predictions for DNA barcodes to
identify unseen branches, we believe BayesANT will be of
broad utility as a tool for DNA metabarcoding within
bioinformatics pipelines.},
Doi = {10.1111/2041-210X.14009},
Key = {fds368122}
}
@article{fds371510,
Author = {Gu, Y and Dunson, DB},
Title = {Bayesian Pyramids: identifiable multilayer discrete latent
structure models for discrete data},
Journal = {Journal of the Royal Statistical Society. Series B:
Statistical Methodology},
Volume = {85},
Number = {2},
Pages = {399-426},
Year = {2023},
Month = {April},
url = {http://dx.doi.org/10.1093/jrsssb/qkad010},
Abstract = {High-dimensional categorical data are routinely collected in
biomedical and social sciences. It is of great importance to
build interpretable parsimonious models that perform
dimension reduction and uncover meaningful latent structures
from such discrete data. Identifiability is a fundamental
requirement for valid modeling and inference in such
scenarios, yet is challenging to address when there are
complex latent structures. In this article, we propose a
class of identifiable multilayer (potentially deep) discrete
latent structure models for discrete data, termed Bayesian
Pyramids. We establish the identifiability of Bayesian
Pyramids by developing novel transparent conditions on the
pyramid-shaped deep latent directed graph. The proposed
identifiability conditions can ensure Bayesian posterior
consistency under suitable priors. As an illustration, we
consider the two-latent-layer model and propose a Bayesian
shrinkage estimation approach. Simulation results for this
model corroborate the identifiability and estimability of
model parameters. Applications of the methodology to DNA
nucleotide sequence data uncover useful discrete latent
features that are highly predictive of sequence types. The
proposed framework provides a recipe for interpretable
unsupervised learning of discrete data and can be a useful
alternative to popular machine learning methods.},
Doi = {10.1093/jrsssb/qkad010},
Key = {fds371510}
}
@article{fds370898,
Author = {Mahzarnia, A and Stout, JA and Anderson, RJ and Moon, HS and Han, ZY and Beck, K and Browndyke, JN and Dunson, DB and Johnson, KG and O'Brien,
RJ and Badea, A},
Title = {Identifying vulnerable brain networks associated with
Alzheimer's disease risk.},
Journal = {Cereb Cortex},
Volume = {33},
Number = {9},
Pages = {5307-5322},
Year = {2023},
Month = {April},
url = {http://dx.doi.org/10.1093/cercor/bhac419},
Abstract = {The selective vulnerability of brain networks in individuals
at risk for Alzheimer's disease (AD) may help differentiate
pathological from normal aging at asymptomatic stages,
allowing the implementation of more effective interventions.
We used a sample of 72 people across the age span, enriched
for the APOE4 genotype to reveal vulnerable networks
associated with a composite AD risk factor including age,
genotype, and sex. Sparse canonical correlation analysis
(CCA) revealed a high weight associated with genotype, and
subgraphs involving the cuneus, temporal, cingulate
cortices, and cerebellum. Adding cognitive metrics to the
risk factor revealed the highest cumulative degree of
connectivity for the pericalcarine cortex, insula, banks of
the superior sulcus, and the cerebellum. To enable scaling
up our approach, we extended tensor network principal
component analysis, introducing CCA components. We developed
sparse regression predictive models with errors of 17% for
genotype, 24% for family risk factor for AD, and 5 years for
age. Age prediction in groups including cognitively impaired
subjects revealed regions not found using only normal
subjects, i.e. middle and transverse temporal, paracentral
and superior banks of temporal sulcus, as well as the
amygdala and parahippocampal gyrus. These modeling
approaches represent stepping stones towards single subject
prediction.},
Doi = {10.1093/cercor/bhac419},
Key = {fds370898}
}
@article{fds375282,
Author = {Jin, B and Dunson, DB and Rager, JE and Reif, DM and Engel, SM and Herring,
AH},
Title = {Bayesian matrix completion for hypothesis
testing.},
Journal = {Journal of the Royal Statistical Society. Series C, Applied
statistics},
Volume = {72},
Number = {2},
Pages = {254-270},
Year = {2023},
Month = {May},
url = {http://dx.doi.org/10.1093/jrsssc/qlac005},
Abstract = {We aim to infer bioactivity of each chemical by assay
endpoint combination, addressing sparsity of toxicology
data. We propose a Bayesian hierarchical framework which
borrows information across different chemicals and assay
endpoints, facilitates out-of-sample prediction of activity
for chemicals not yet assayed, quantifies uncertainty of
predicted activity, and adjusts for multiplicity in
hypothesis testing. Furthermore, this paper makes a novel
attempt in toxicology to simultaneously model
heteroscedastic errors and a nonparametric mean function,
leading to a broader definition of activity whose need has
been suggested by toxicologists. Real application identifies
chemicals most likely active for neurodevelopmental
disorders and obesity.},
Doi = {10.1093/jrsssc/qlac005},
Key = {fds375282}
}
@article{fds371472,
Author = {Liu, R and Li, M and Dunson, DB},
Title = {PPA: Principal parcellation analysis for brain connectomes
and multiple traits.},
Journal = {NeuroImage},
Volume = {276},
Pages = {120214},
Year = {2023},
Month = {August},
url = {http://dx.doi.org/10.1016/j.neuroimage.2023.120214},
Abstract = {Our understanding of the structure of the brain and its
relationships with human traits is largely determined by how
we represent the structural connectome. Standard practice
divides the brain into regions of interest (ROIs) and
represents the connectome as an adjacency matrix having
cells measuring connectivity between pairs of ROIs.
Statistical analyses are then heavily driven by the (largely
arbitrary) choice of ROIs. In this article, we propose a
human trait prediction framework utilizing a
tractography-based representation of the brain connectome,
which clusters fiber endpoints to define a data-driven white
matter parcellation targeted to explain variation among
individuals and predict human traits. This leads to
Principal Parcellation Analysis (PPA), representing
individual brain connectomes by compositional vectors
building on a basis system of fiber bundles that captures
the connectivity at the population level. PPA eliminates the
need to choose atlases and ROIs a priori, and provides a
simpler, vector-valued representation that facilitates
easier statistical analysis compared to the complex graph
structures encountered in classical connectome analyses. We
illustrate the proposed approach through applications to
data from the Human Connectome Project (HCP) and show that
PPA connectomes improve power in predicting human traits
over state-of-the-art methods based on classical
connectomes, while dramatically improving parsimony and
maintaining interpretability. Our PPA package is publicly
available on GitHub, and can be implemented routinely for
diffusion image data.},
Doi = {10.1016/j.neuroimage.2023.120214},
Key = {fds371472}
}
@article{fds376096,
Author = {Talbot, A and Dunson, D and Dzirasa, K and Carlson,
D},
Title = {Estimating a brain network predictive of stress and genotype
with supervised autoencoders.},
Journal = {J R Stat Soc Ser C Appl Stat},
Volume = {72},
Number = {4},
Pages = {912-936},
Year = {2023},
Month = {August},
url = {http://dx.doi.org/10.1093/jrsssc/qlad035},
Abstract = {Targeted brain stimulation has the potential to treat mental
illnesses. We develop an approach to help design protocols
by identifying relevant multi-region electrical dynamics.
Our approach models these dynamics as a superposition of
latent networks, where the latent variables predict a
relevant outcome. We use supervised autoencoders (SAEs) to
improve predictive performance in this context, describe the
conditions where SAEs improve predictions, and provide
modelling constraints to ensure biological relevance. We
experimentally validate our approach by finding a network
associated with stress that aligns with a previous
stimulation protocol and characterizing a genotype
associated with bipolar disorder.},
Doi = {10.1093/jrsssc/qlad035},
Key = {fds376096}
}
@article{fds370378,
Author = {Xu, J and Li, Y and Yang, H and Dunson, D and Daubechies,
I},
Title = {PiPs: A kernel-based optimization scheme for analyzing
non-stationary 1D signals},
Journal = {Applied and Computational Harmonic Analysis},
Volume = {66},
Pages = {1-17},
Year = {2023},
Month = {September},
url = {http://dx.doi.org/10.1016/j.acha.2023.04.002},
Abstract = {This paper proposes a novel kernel-based optimization scheme
to handle tasks in the analysis of 1D non-stationary
oscillatory data, e.g., signal spectral estimation and
single-channel source separation. The key insight of our
optimization scheme for reconstructing the time-frequency
information is that when a nonparametric regression is
applied on some input values, the output regressed points
would lie near the oscillatory pattern of the oscillatory 1D
signal only if these input values are a good approximation
of the ground-truth phase function. In this work, Gaussian
Process (GP) is chosen to conduct this nonparametric
regression: the oscillatory pattern is encoded as the
Pattern-inducing Points (PiPs) which act as the training
data points in the GP regression; while the targeted phase
function is fed in to compute the correlation kernels,
acting as the testing input. Better approximated phase
function generates more precise kernels, thus resulting in
smaller optimization loss error when comparing the
kernel-based regression output with the original signals. To
the best of our knowledge, this is the first algorithm that
can satisfactorily handle fully non-stationary oscillatory
data, close and crossover frequencies, and general
oscillatory patterns. Even in the example of a signal
produced by slow variation in the parameters of a
trigonometric expansion, we show that PiPs admits
competitive or better performance in terms of accuracy and
robustness than existing state-of-the-art
algorithms.},
Doi = {10.1016/j.acha.2023.04.002},
Key = {fds370378}
}
@article{fds372678,
Author = {Rigon, T and Herring, AH and Dunson, DB},
Title = {A generalized Bayes framework for probabilistic
clustering},
Journal = {Biometrika},
Volume = {110},
Number = {3},
Pages = {559-578},
Year = {2023},
Month = {September},
url = {http://dx.doi.org/10.1093/biomet/asad004},
Abstract = {Loss-based clustering methods, such as k-means clustering
and its variants, are standard tools for finding groups in
data. However, the lack of quantification of uncertainty in
the estimated clusters is a disadvantage. Model-based
clustering based on mixture models provides an alternative
approach, but such methods face computational problems and
are highly sensitive to the choice of kernel. In this
article we propose a generalized Bayes framework that
bridges between these paradigms through the use of Gibbs
posteriors. In conducting Bayesian updating, the
loglikelihood is replaced by a loss function for clustering,
leading to a rich family of clustering methods. The Gibbs
posterior represents a coherent updating of Bayesian beliefs
without needing to specify a likelihood for the data, and
can be used for characterizing uncertainty in clustering. We
consider losses based on Bregman divergence and pairwise
similarities, and develop efficient deterministic algorithms
for point estimation along with sampling algorithms for
uncertainty quantification. Several existing clustering
algorithms, including k-means, can be interpreted as
generalized Bayes estimators in our framework, and thus we
provide a method of uncertainty quantification for these
approaches, allowing, for example, calculation of the
probability that a data point is well clustered.},
Doi = {10.1093/biomet/asad004},
Key = {fds372678}
}
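%% Sketch: a minimal Gibbs-posterior-style sampler for clustering with a
%% squared-error (Bregman) loss. Labels are resampled with probability
%% proportional to exp(-lam * loss), and co-clustering frequencies quantify
%% uncertainty. The learning rate lam, fixed K, and plug-in center update are
%% our illustrative simplifications, not the paper's algorithms.

import numpy as np

rng = np.random.default_rng(6)
x = np.concatenate([rng.normal(-2.0, 0.5, 50), rng.normal(2.0, 0.5, 50)])
K, lam, n_iter, burn = 2, 2.0, 200, 100
z = rng.integers(K, size=x.size)  # initial labels

co = np.zeros((x.size, x.size))  # co-clustering frequencies
for it in range(n_iter):
    m = np.array([x[z == k].mean() if np.any(z == k) else rng.normal()
                  for k in range(K)])  # plug-in cluster centers
    logw = -lam * (x[:, None] - m[None, :]) ** 2  # Gibbs-posterior log-weights
    w = np.exp(logw - logw.max(axis=1, keepdims=True))
    w /= w.sum(axis=1, keepdims=True)
    z = np.array([rng.choice(K, p=wi) for wi in w])  # resample labels
    if it >= burn:
        co += z[:, None] == z[None, :]
print(np.round(co[:3, -3:] / (n_iter - burn), 2))  # across-group: near 0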
@article{fds371873,
Author = {Buch, DA and Johndrow, JE and Dunson, DB},
Title = {Explaining transmission rate variations and forecasting
epidemic spread in multiple regions with a semiparametric
mixed effects SIR model.},
Journal = {Biometrics},
Volume = {79},
Number = {4},
Pages = {2987-2997},
Year = {2023},
Month = {December},
url = {http://dx.doi.org/10.1111/biom.13901},
Abstract = {The transmission rate is a central parameter in mathematical
models of infectious disease. Its pivotal role in outbreak
dynamics makes estimating the current transmission rate and
uncovering its dependence on relevant covariates a core
challenge in epidemiological research as well as public
health policy evaluation. Here, we develop a method for
flexibly inferring a time-varying transmission rate
parameter, modeled as a function of covariates and a smooth
Gaussian process (GP). The transmission rate model is
further embedded in a hierarchy to allow information
borrowing across parallel streams of regional incidence
data. Crucially, the method makes use of optional
vaccination data as a first step toward modeling of endemic
infectious diseases. Computational techniques borrowed from
the Bayesian spatial analysis literature enable fast and
reliable posterior computation. Simulation studies reveal
that the method recovers true covariate effects at nominal
coverage levels. We analyze data from the COVID-19 pandemic
and validate forecast intervals on held-out data.
User-friendly software is provided to enable practitioners
to easily deploy the method in public health research.},
Doi = {10.1111/biom.13901},
Key = {fds371873}
}
@article{fds374408,
Author = {Melikechi, O and Dunson, DB},
Title = {Ellipsoid fitting with the Cayley transform.},
Journal = {IEEE Transactions on Signal Processing},
Volume = {72},
Pages = {70-83},
Year = {2024},
Month = {January},
url = {http://dx.doi.org/10.1109/tsp.2023.3332560},
Abstract = {We introduce Cayley transform ellipsoid fitting (CTEF), an
algorithm that uses the Cayley transform to fit ellipsoids
to noisy data in any dimension. Unlike many ellipsoid
fitting methods, CTEF is ellipsoid specific, meaning it
always returns elliptic solutions, and can fit arbitrary
ellipsoids. It also significantly outperforms other fitting
methods when data are not uniformly distributed over the
surface of an ellipsoid. Inspired by growing calls for
interpretable and reproducible methods in machine learning,
we apply CTEF to dimension reduction, data visualization,
and clustering in the context of cell cycle and circadian
rhythm data and several classical toy examples. Since CTEF
captures global curvature, it extracts nonlinear features in
data that other machine learning methods fail to identify.
For example, on the clustering examples CTEF outperforms 10
popular algorithms.},
Doi = {10.1109/tsp.2023.3332560},
Key = {fds374408}
}
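The key device above is the Cayley transform, which maps a skew-symmetric matrix S to a rotation R = (I - S)(I + S)^{-1} and so gives an unconstrained parametrization of the ellipsoid's orientation. A sketch of that parametrization with a generic least-squares fit; scipy's optimizer and the unit-sphere residual below are stand-ins, not the CTEF algorithm itself:

    import numpy as np
    from scipy.optimize import least_squares

    def cayley(s):
        # Skew-symmetric S from 3 free parameters, then R = (I - S)(I + S)^{-1}
        S = np.array([[0, -s[2], s[1]], [s[2], 0, -s[0]], [-s[1], s[0], 0]])
        I = np.eye(3)
        return (I - S) @ np.linalg.inv(I + S)

    def residuals(theta, X):
        # theta = (3 Cayley parameters, 3 log semi-axes, 3 centre coordinates)
        R, a, c = cayley(theta[:3]), np.exp(theta[3:6]), theta[6:]
        Y = (X - c) @ R / a               # data mapped to the unit-sphere frame
        return np.linalg.norm(Y, axis=1) - 1.0

    # Toy data: noisy points on a rotated, shifted ellipsoid
    rng = np.random.default_rng(2)
    U = rng.standard_normal((200, 3))
    U /= np.linalg.norm(U, axis=1, keepdims=True)
    R0 = cayley([0.3, -0.2, 0.5])
    a0, c0 = np.array([3.0, 2.0, 1.0]), np.array([1.0, 0.0, -1.0])
    X = c0 + (U * a0) @ R0.T + 0.02 * rng.standard_normal((200, 3))

    fit = least_squares(residuals, x0=np.zeros(9), args=(X,))
    print(np.exp(fit.x[3:6]), fit.x[6:])  # semi-axes (up to relabeling), centre

Because the Cayley parameters, log semi-axes, and centre are all unconstrained, every parameter value corresponds to a genuine ellipsoid, which is what "ellipsoid specific" means here; the one caveat of the Cayley map is that 180-degree rotations are not representable.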
@article{fds376095,
Author = {Datta, J and Banerjee, S and Dunson, DB},
Title = {Nonparametric Bayes multiresolution testing for
high-dimensional rare events},
Journal = {Journal of Nonparametric Statistics},
Year = {2024},
Month = {January},
url = {http://dx.doi.org/10.1080/10485252.2024.2309978},
Abstract = {In a variety of application areas, there is interest in
assessing evidence of differences in the intensity of event
realizations between groups. For example, in cancer genomic
studies collecting data on rare variants, the focus is on
assessing whether and how the variant profile changes with
the disease subtype. Motivated by this application, we
develop multiresolution nonparametric Bayes tests for
differential mutation rates across groups. The
multiresolution approach yields fast and accurate detection
of spatial clusters of rare variants, and our nonparametric
Bayes framework provides great flexibility for modelling the
intensities of rare variants. Some theoretical properties
are also assessed, including weak consistency of our
Dirichlet Process-Poisson-Gamma mixture over multiple
resolutions. Simulation studies illustrate excellent small
sample properties relative to competitors, and we apply the
method to detect rare variants related to common variable
immunodeficiency from whole exome sequencing data on 215
patients and over 60,027 control subjects.},
Doi = {10.1080/10485252.2024.2309978},
Key = {fds376095}
}
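The computational core of such tests is Gamma-Poisson conjugacy: the marginal likelihood of the event count in a window is available in closed form, so evidence for a group difference in intensity can be scored window by window across resolutions. A toy version of that calculation over dyadic windows (the Gamma(1, 1) priors and unit exposures are simplifying assumptions of this sketch; the paper's Dirichlet process mixture is not reproduced here):

    import numpy as np
    from scipy.special import gammaln

    def log_marginal(n, E, a=1.0, b=1.0):
        # Closed-form marginal of n ~ Poisson(rate * E) with rate ~ Gamma(a, b)
        return (gammaln(a + n) - gammaln(a) - gammaln(n + 1)
                + a * np.log(b) + n * np.log(E) - (a + n) * np.log(b + E))

    def window_log_bf(y1, y2, E1=1.0, E2=1.0):
        # H1 (group-specific rates) versus H0 (one shared rate) in a window
        h1 = log_marginal(y1, E1) + log_marginal(y2, E2)
        h0 = log_marginal(y1 + y2, E1 + E2)
        return h1 - h0

    # Toy event positions in [0, 1); group 2 is enriched on [0.25, 0.5)
    rng = np.random.default_rng(3)
    pos1 = rng.uniform(0, 1, 100)
    pos2 = np.concatenate([rng.uniform(0, 1, 100), rng.uniform(0.25, 0.5, 60)])

    for r in (1, 2, 3):                        # 2, 4, then 8 dyadic windows
        edges = np.linspace(0, 1, 2 ** r + 1)
        y1, _ = np.histogram(pos1, edges)
        y2, _ = np.histogram(pos2, edges)
        print(r, np.round([window_log_bf(u, v) for u, v in zip(y1, y2)], 1))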
@article{fds376911,
Author = {Winter, S and Campbell, T and Lin, L and Srivastava, S and Dunson,
DB},
Title = {Emerging Directions in Bayesian Computation},
Journal = {Statistical Science},
Volume = {39},
Number = {1},
Pages = {62-89},
Year = {2024},
Month = {January},
url = {http://dx.doi.org/10.1214/23-STS919},
Abstract = {Bayesian models are powerful tools for studying complex
data, allowing the analyst to encode rich hierarchical
dependencies and leverage prior information. Most
importantly, they facilitate a complete characterization of
uncertainty through the posterior distribution. Practical
posterior computation is commonly performed via MCMC, which
can be computationally infeasible for high-dimensional
models with many observations. In this article, we discuss
the potential to improve posterior computation using ideas
from machine learning. Concrete directions are explored in
vignettes on normalizing flows, statistical properties of
variational approximations, Bayesian coresets and
distributed Bayesian inference.},
Doi = {10.1214/23-STS919},
Key = {fds376911}
}
@article{fds376258,
Author = {Huang, J and Morsomme, R and Dunson, D and Xu, J},
Title = {Detecting changes in the transmission rate of a stochastic
epidemic model.},
Journal = {Statistics in Medicine},
Year = {2024},
Month = {February},
url = {http://dx.doi.org/10.1002/sim.10050},
Abstract = {Throughout the course of an epidemic, the rate at which
disease spreads varies with behavioral changes, the
emergence of new disease variants, and the introduction of
mitigation policies. Estimating such changes in transmission
rates can help us better model and predict the dynamics of
an epidemic, and provide insight into the efficacy of
control and intervention strategies. We present a method for
likelihood-based estimation of parameters in the stochastic
susceptible-infected-removed model under a
time-inhomogeneous transmission rate composed of piecewise
constant components. In doing so, our method simultaneously
learns change points in the transmission rate via a Markov
chain Monte Carlo algorithm. The method targets the exact
model posterior in a difficult missing data setting given
only partially observed case counts over time. We validate
performance on simulated data before applying our approach
to data from an Ebola outbreak in Western Africa and
COVID-19 outbreak on a university campus.},
Doi = {10.1002/sim.10050},
Key = {fds376258}
}
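The paper above targets the exact posterior under partially observed counts; a much cruder illustration of the underlying idea is to profile a single change point in a fully observed discrete-time chain with Poisson infection counts. Everything in this sketch (the simulator, the Poisson approximation, the grid search) is a simplification of the authors' MCMC approach:

    import numpy as np
    from scipy.stats import poisson

    rng = np.random.default_rng(4)

    # Simulate a discrete-time SIR with one drop in the transmission rate
    N, T, tau_true = 1e5, 60, 30
    beta = np.where(np.arange(T) < tau_true, 0.25, 0.10)
    S, I = N - 20.0, 20.0
    cases, SI = [], []
    for t in range(T):
        SI.append(S * I / N)
        y = rng.poisson(beta[t] * S * I / N)
        cases.append(y)
        S, I = S - y, I + y - 0.1 * I
    cases, SI = np.array(cases), np.array(SI)

    def profile_loglik(tau):
        # Poisson MLE of the rate on each side of a candidate change point
        ll = 0.0
        for seg in (slice(0, tau), slice(tau, T)):
            b_hat = cases[seg].sum() / SI[seg].sum()
            ll += poisson.logpmf(cases[seg], b_hat * SI[seg]).sum()
        return ll

    taus = np.arange(5, T - 5)
    print("estimated change point:",
          taus[np.argmax([profile_loglik(t) for t in taus])])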
%% Papers Submitted
@article{fds70573,
Author = {Cai, B and Dunson, DB},
Title = {Variable selection in nonparametric random effects
models},
Journal = {submitted},
Year = {2007},
Key = {fds70573}
}
@article{fds70581,
Author = {Wang, L and Dunson, DB},
Title = {Bayesian isotonic density regression},
Year = {2007},
Key = {fds70581}
}
@article{fds151355,
Author = {Mitra, R and Dunson, DB},
Title = {Two level stochastic search variable selection in GLMs with
missing predictors},
Year = {2008},
Key = {fds151355}
}
%% Chapters
@misc{fds340365,
Author = {Weinberg, CR and Dunson, DB},
Title = {Some issues in assessing human fertility},
Pages = {42-49},
Booktitle = {Statistics in the 21st Century},
Year = {2001},
Month = {January},
ISBN = {9781584882725},
Abstract = {One of the pleasures of working as an applied statistician
is the awareness it brings of the wide diversity of
scientific fields to which our profession contributes
critical concepts and methods. My own awareness was enhanced
by accepting the invitation from the editors of JASA to
serve as guest editor for this section of vignettes
celebrating the significant contributions made by
statisticians to the life and medical sciences in the 20th
century. The goal of the project was not an encyclopedic
catalog of all the major developments, but rather a sampling
of some of the most interesting work. Of the 12 vignettes,
10 focus on particular areas of application: environmetrics,
wildlife populations, animal breeding, human fertility,
toxicology, medical diagnosis, clinical trials,
environmental epidemiology, statistical genetics, and
molecular biology. The two vignettes that begin the series
focus more on methods that have had, or promise to have,
impact across a range of subject matter areas: survival
analysis and causal analysis.},
Key = {fds340365}
}
@misc{fds257825,
Author = {Dunson, DB and Bhattacharya, A and Griffin, JE},
Title = {Nonparametric Bayes Regression and Classification Through
Mixtures of Product Kernels},
Volume = {9780199694587},
Pages = {145-164},
Booktitle = {Bayesian Statistics 9},
Publisher = {Oxford University Press},
Year = {2012},
Month = {January},
ISBN = {9780199694587},
url = {http://dx.doi.org/10.1093/acprof:oso/9780199694587.003.0005},
Abstract = {It is routine in many fields to collect data having a
variety of measurement scales and supports. For example, in
biomedical studies for each patient one may collect
functional data on a biomarker over time, gene expression
values normalized to lie on a hypersphere to remove
artifacts, clinical and demographic covariates and a health
outcome. A common interest focuses on building predictive
models, with parametric assumptions seldom supported by
prior knowledge. Hence, it is most appropriate to define a
prior with large support allowing the conditional
distribution of the response given predictors to be unknown
and changing flexibly across the predictor space not just in
the mean but also in the variance and shape. Building on
earlier work on Dirichlet process mixtures, we describe a
simple and general strategy for inducing models for
conditional distributions through discrete mixtures of
product kernel models for joint distributions of predictors
and response variables. Computation is straightforward and
the approach can easily accommodate combining of widely
disparate data types, including vector data in a Euclidean
space, categorical observations, functions, images and
manifold data.},
Doi = {10.1093/acprof:oso/9780199694587.003.0005},
Key = {fds257825}
}
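With a mixture of product kernels, the conditional of the response given predictors falls out of the joint model: component weights are reweighted by how well each component explains the observed predictors, and the response kernels are then mixed. A quick sketch using an EM-fitted diagonal-covariance Gaussian mixture (a stand-in for the Dirichlet process mixture described in the chapter), assuming scikit-learn is available:

    import numpy as np
    from scipy.stats import norm
    from sklearn.mixture import GaussianMixture

    rng = np.random.default_rng(5)

    # Toy joint data: nonlinear mean and noise level that changes with x
    x = rng.uniform(-3, 3, 500)
    y = np.sin(x) + rng.normal(0, 0.1 + 0.1 * (x > 0))
    gm = GaussianMixture(n_components=10, covariance_type="diag")
    gm.fit(np.column_stack([x, y]))

    def conditional_density(y_grid, x0):
        # Reweight components by their fit to x0, then mix the y-kernels
        mu, var, w = gm.means_, gm.covariances_, gm.weights_
        lx = w * norm.pdf(x0, mu[:, 0], np.sqrt(var[:, 0]))
        lx = lx / lx.sum()
        return sum(l * norm.pdf(y_grid, m, s)
                   for l, m, s in zip(lx, mu[:, 1], np.sqrt(var[:, 1])))

    y_grid = np.linspace(-2, 2, 200)
    dens = conditional_density(y_grid, x0=1.0)   # p(y | x = 1.0) on a grid

The same reweighting works for any product of kernels, which is why the construction extends so directly to mixed data types: each predictor contributes its own kernel factor to the weights.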
@misc{fds365019,
Author = {Dunson, DB},
Title = {Nonparametric Bayes},
Pages = {281-291},
Booktitle = {Past, Present, and Future of Statistical
Science},
Year = {2014},
Month = {January},
ISBN = {9781482204964},
Abstract = {I reflect on the past, present, and future of nonparametric
Bayesian statistics. Current nonparametric Bayes research
tends to be split between theoretical studies, seeking to
understand relatively simple models, and machine learning,
defining new models and computational algorithms motivated
by practical performance. I comment on the current
landscape, open problems and promising future directions in
modern big data applications.},
Key = {fds365019}
}