Department of Mathematics
 Search | Help | Login | pdf version | printable version

Math @ Duke



Publications of Sayan Mukherjee    :chronological  alphabetical  combined listing:

%% Books   
   Author = {Mukherjee, SP and Sinha, BK and Chattopadhyay,
   Title = {Statistical methods in social science research},
   Pages = {1-152},
   Year = {2018},
   Month = {October},
   ISBN = {9789811321450},
   url = {},
   Abstract = {© Springer Nature Singapore Pte Ltd. 2018. All rights
             reserved. This book presents various recently developed and
             traditional statistical techniques, which are increasingly
             being applied in social science research. The social
             sciences cover diverse phenomena arising in society, the
             economy and the environment, some of which are too complex
             to allow concrete statements; some cannot be defined by
             direct observations or measurements; some are culture- (or
             region-) specific, while others are generic and common.
             Statistics, being a scientific method — as distinct from a
             “science” related to any one type of phenomena — is
             used to make inductive inferences regarding various
             phenomena. The book addresses both qualitative and
             quantitative research (a combination of which is essential
             in social science research) and offers valuable
             supplementary reading at an advanced level for
   Doi = {10.1007/9789811321467},
   Key = {fds346878}

%% Papers Published   
   Author = {Berchuck, SI and Mukherjee, S and Medeiros, FA},
   Title = {Estimating Rates of Progression and Predicting Future Visual
             Fields in Glaucoma Using a Deep Variational
   Journal = {Scientific Reports},
   Volume = {9},
   Number = {1},
   Pages = {18113},
   Year = {2019},
   Month = {December},
   url = {},
   Abstract = {In this manuscript we develop a deep learning algorithm to
             improve estimation of rates of progression and prediction of
             future patterns of visual field loss in glaucoma. A
             generalized variational auto-encoder (VAE) was trained to
             learn a low-dimensional representation of standard automated
             perimetry (SAP) visual fields using 29,161 fields from 3,832
             patients. The VAE was trained on a 90% sample of the data,
             with randomization at the patient level. Using the remaining
             10%, rates of progression and predictions were generated,
             with comparisons to SAP mean deviation (MD) rates and
             point-wise (PW) regression predictions, respectively. The
             longitudinal rate of change through the VAE latent space
             (e.g., with eight dimensions) detected a significantly
             higher proportion of progression than MD at two (25% vs. 9%)
             and four (35% vs 15%) years from baseline. Early on, VAE
             improved prediction over PW, with significantly smaller mean
             absolute error in predicting the 4th, 6th and 8th visits
             from the first three (e.g., visit eight: VAE8: 5.14 dB vs.
             PW: 8.07 dB; P < 0.001). A deep VAE can be used for
             assessing both rates and trajectories of progression in
             glaucoma, with the additional benefit of being a generative
             technique capable of predicting future patterns of visual
             field damage.},
   Doi = {10.1038/s41598-019-54653-6},
   Key = {fds347355}

   Author = {Cakir, M and Mukherjee, S and Wood, KC},
   Title = {Label propagation defines signaling networks associated with
             recurrently mutated cancer genes.},
   Journal = {Scientific Reports},
   Volume = {9},
   Number = {1},
   Pages = {9401},
   Year = {2019},
   Month = {June},
   url = {},
   Abstract = {Human tumors have distinct profiles of genomic alterations,
             and each of these alterations has the potential to cause
             unique changes to cellular homeostasis. Detailed analyses of
             these changes could reveal downstream effects of genomic
             alterations, contributing to our understanding of their
             roles in tumor development and progression. Across a range
             of tumor types, including bladder, lung, and endometrial
             carcinoma, we determined genes that are frequently altered
             in The Cancer Genome Atlas patient populations, then
             examined the effects of these alterations on signaling and
             regulatory pathways. To achieve this, we used a label
             propagation-based methodology to generate networks from gene
             expression signatures associated with defined mutations.
             Individual networks offered a large-scale view of signaling
             changes represented by gene signatures, which in turn
             reflected the scope of molecular events that are perturbed
             in the presence of a given genomic alteration. Comparing
             different networks to one another revealed common biological
             pathways impacted by distinct genomic alterations,
             highlighting the concept that tumors can dysregulate key
             pathways through multiple, seemingly unrelated mechanisms.
             Finally, altered genes inducing common changes to the
             signaling network were used to search for genomic markers of
             drug response, connecting shared perturbations to
             differential drug sensitivity.},
   Doi = {10.1038/s41598-019-45603-3},
   Key = {fds344778}

   Author = {Gao, T and Brodzki, J and Mukherjee, S},
   Title = {The Geometry of Synchronization Problems and Learning Group
   Journal = {Discrete & Computational Geometry},
   Year = {2019},
   Month = {January},
   url = {},
   Abstract = {© 2019, Springer Science+Business Media, LLC, part of
             Springer Nature. We develop a geometric framework, based on
             the classical theory of fibre bundles, to characterize the
             cohomological nature of a large class of
             synchronization-type problems in the context of graph
             inference and combinatorial optimization. We identify each
             synchronization problem in topological group G on connected
             graph Γ with a flat principal G-bundle over Γ, thus
             establishing a classification result for synchronization
             problems using the representation variety of the fundamental
             group of Γ into G. We then develop a twisted Hodge theory
             on flat vector bundles associated with these flat principal
             G-bundles, and provide a geometric realization of the graph
             connection Laplacian as the lowest-degree Hodge Laplacian in
             the twisted de Rham–Hodge cochain complex. Motivated by
             these geometric intuitions, we propose to study the problem
             of learning group actions—partitioning a collection of
             objects based on the local synchronizability of pairwise
             correspondence relations—and provide a heuristic
             synchronization-based algorithm for solving this type of
             problems. We demonstrate the efficacy of this algorithm on
             simulated and real datasets.},
   Doi = {10.1007/s00454-019-00100-2},
   Key = {fds343606}

   Author = {Washburne, AD and Silverman, JD and Morton, JT and Becker, DJ and Crowley, D and Mukherjee, S and David, LA and Plowright,
   Title = {Phylofactorization: a graph partitioning algorithm to
             identify phylogenetic scales of ecological
   Journal = {Ecological Monographs},
   Year = {2019},
   Month = {January},
   url = {},
   Abstract = {© 2019 by the Ecological Society of America. The problem of
             pattern and scale is a central challenge in ecology. In
             community ecology, an important scale is that at which we
             aggregate species to define our units of study, such as
             aggregation of “nitrogen fixing trees” to understand
             patterns in carbon sequestration. With the emergence of
             massive community ecological data sets, there is a need to
             objectively identify the scales for aggregating species to
             capture well-defined patterns in community ecological data.
             The phylogeny is a scaffold for identifying scales of
             species-aggregation associated with macroscopic patterns.
             Phylofactorization was developed to identify phylogenetic
             scales underlying patterns in relative abundance data, but
             many ecological data, such as presence-absences and counts,
             are not relative abundances yet may still have phylogenetic
             scales capturing patterns of interest. Here, we broaden
             phylofactorization to a graph-partitioning algorithm
             identifying phylogenetic scales in community ecological
             data. As a graph-partitioning algorithm, phylofactorization
             connects many tools from data analysis to phylogenetically
             informed analyses of community ecological data. Two-sample
             tests identify five phylogenetic factors of mammalian body
             mass which arose during the K-Pg extinction event,
             consistent with other analyses of mammalian body mass
             evolution. Projection of data onto coordinates connecting
             the phylogeny and graph-partitioning algorithm yield a
             phylogenetic principal components analysis which refines our
             understanding of the major sources of variation in the human
             gut microbiome. These same coordinates allow generalized
             additive modeling of microbes in Central Park soils,
             confirming that a large clade of Acidobacteria thrive in
             neutral soils. The graph-partitioning algorithm extends to
             generalized linear and additive modeling of exponential
             family random variables by phylogenetically constrained
             reduced-rank regression or stepwise factor contrasts. All of
             these tools can be implemented with the R package
   Doi = {10.1002/ecm.1353},
   Key = {fds341907}

   Author = {Crawford, L and Monod, A and Chen, AX and Mukherjee, S and Rabadán,
   Title = {Predicting Clinical Outcomes in Glioblastoma: An Application
             of Topological and Functional Data Analysis},
   Journal = {Journal of the American Statistical Association},
   Year = {2019},
   Month = {January},
   url = {},
   Abstract = {© 2019 American Statistical Association.
             Glioblastoma multiforme (GBM) is an aggressive form of human
             brain cancer that is under active study in the field of
             cancer biology. Its rapid progression and the relative time
             cost of obtaining molecular data make other readily
             available forms of data, such as images, an important
             resource for actionable measures in patients. Our goal is to
             use information given by medical images taken from GBM
             patients in statistical settings. To do this, we design a
             novel statistic—the smooth Euler characteristic transform
             (SECT)—that quantifies magnetic resonance images of
             tumors. Due to its well-defined inner product structure, the
             SECT can be used in a wider range of functional and
             nonparametric modeling approaches than other previously
             proposed topological summary statistics. When applied to a
             cohort of GBM patients, we find that the SECT is a better
             predictor of clinical outcomes than both existing tumor
             shape quantifications and common molecular assays.
             Specifically, we demonstrate that SECT features alone
             explain more of the variance in GBM patient survival than
             gene expression, volumetric features, and morphometric
             features. The main takeaways from our findings are thus
             2-fold. First, they suggest that images contain valuable
             information that can play an important role in clinical
             prognosis and other medical decisions. Second, they show
             that the SECT is a viable tool for the broader study of
             medical imaging informatics. Supplementary materials for
             this article, including a standardized description of the
             materials available for reproducing the work, are available
             as an online supplement.},
   Doi = {10.1080/01621459.2019.1671198},
   Key = {fds346877}

   Author = {Silverman, JD and Durand, HK and Bloom, RJ and Mukherjee, S and David,
   Title = {Correction to: Dynamic linear models guide design and
             analysis of microbiota studies within artificial human
   Journal = {Microbiome},
   Volume = {6},
   Number = {1},
   Pages = {212},
   Year = {2018},
   Month = {November},
   url = {},
   Abstract = {Following publication of the original article [1],
             the authors noticed an error in the presentation of
             equations in the PDF version.},
   Doi = {10.1186/s40168-018-0601-6},
   Key = {fds340140}

   Author = {Silverman, JD and Durand, HK and Bloom, RJ and Mukherjee, S and David,
   Title = {Dynamic linear models guide design and analysis of
             microbiota studies within artificial human
   Journal = {Microbiome},
   Volume = {6},
   Number = {1},
   Pages = {202},
   Year = {2018},
   Month = {November},
   url = {},
   Abstract = {BACKGROUND: Artificial gut models provide unique
             opportunities to study human-associated microbiota.
             Outstanding questions for these models' fundamental biology
             include the timescales on which microbiota vary and the
             factors that drive such change. Answering these questions
             though requires overcoming analytical obstacles like
             estimating the effects of technical variation on observed
             microbiota dynamics, as well as the lack of appropriate
             benchmark datasets. RESULTS: To address these obstacles, we
             created a modeling framework based on multinomial
             logistic-normal dynamic linear models (MALLARDs) and
             performed dense longitudinal sampling of four replicate
             artificial human guts over the course of 1 month. The
             resulting analyses revealed how the ratio of biological
             variation to technical variation from sample processing
             depends on sampling frequency. In particular, we find that
             at hourly sampling frequencies, 76% of observed variation
             could be ascribed to technical sources, which could also
             skew the observed covariation between taxa. We also found
             that the artificial guts demonstrated replicable
             trajectories even after a recovery from a transient feed
             disruption. Additionally, we observed irregular sub-daily
             oscillatory dynamics associated with the bacterial family
             Enterobacteriaceae within all four replicate vessels.
             CONCLUSIONS: Our analyses suggest that, beyond variation due
             to sequence counting, technical variation from sample
             processing can obscure temporal variation from biological
             sources in artificial gut studies. Our analyses also
             supported hypotheses that human gut microbiota fluctuates on
             sub-daily timescales in the absence of a host and that
             microbiota can follow replicable trajectories in the
             presence of environmental driving forces. Finally, multiple
             aspects of our approach are generalizable and could
             ultimately be used to facilitate the design and analysis of
             longitudinal microbiota studies in vivo.},
   Doi = {10.1186/s40168-018-0584-3},
   Key = {fds339843}

   Author = {Barish, S and Nuss, S and Strunilin, I and Bao, S and Mukherjee, S and Jones, CD and Volkan, PC},
   Title = {Combinations of DIPs and Dprs control organization of
             olfactory receptor neuron terminals in Drosophila.},
   Journal = {Plos Genetics},
   Volume = {14},
   Number = {8},
   Pages = {e1007560},
   Year = {2018},
   Month = {August},
   url = {},
   Abstract = {In Drosophila, 50 classes of olfactory receptor neurons
             (ORNs) connect to 50 class-specific and uniquely positioned
             glomeruli in the antennal lobe. Despite the identification
             of cell surface receptors regulating axon guidance, how ORN
             axons sort to form 50 stereotypical glomeruli remains
             unclear. Here we show that the heterophilic cell adhesion
             proteins, DIPs and Dprs, are expressed in ORNs during
             glomerular formation. Many ORN classes express a unique
             combination of DIPs/dprs, with neurons of the same class
             expressing interacting partners, suggesting a role in
             class-specific self-adhesion between ORN axons. Analysis of
             DIP/Dpr expression revealed that ORNs that target
             neighboring glomeruli have different combinations, and ORNs
             with very similar DIP/Dpr combinations can project to
             distant glomeruli in the antennal lobe. DIP/Dpr profiles are
             dynamic during development and correlate with sensilla type
             lineage for some ORN classes. Perturbations of DIP/dpr gene
             function result in local projection defects of ORN axons and
             glomerular positioning, without altering correct matching of
             ORNs with their target neurons. Our results suggest that
             context-dependent differential adhesion through DIP/Dpr
             combinations regulate self-adhesion and sort ORN axons into
             uniquely positioned glomeruli.},
   Doi = {10.1371/journal.pgen.1007560},
   Key = {fds338059}

   Author = {Tan, Z and Roche, K and Zhou, X and Mukherjee, S},
   Title = {Scalable algorithms for learning high-dimensional linear
             mixed models},
   Journal = {34th Conference on Uncertainty in Artificial Intelligence
             2018, Uai 2018},
   Volume = {1},
   Pages = {259-268},
   Year = {2018},
   Month = {January},
   ISBN = {9781510871601},
   Abstract = {© 2018 by Association For Uncertainty in Artificial
             Intelligence (AUAI) All rights reserved. Linear mixed models
             (LMMs) are used extensively to model observations that are
             not independent. Parameter estimation for LMMs can be
             computationally prohibitive on big data. State-of-the-art
             learning algorithms require computational complexity which
             depends at least linearly on the dimension p of the
             covariates, and often use heuristics that do not offer
             theoretical guarantees. We present scalable algorithms for
             learning high-dimensional LMMs with sublinear computational
             complexity dependence on p. Key to our approach are novel
             dual estimators which use only kernel functions of the data,
             and fast computational techniques based on the subsampled
             randomized Hadamard transform. We provide theoretical
             guarantees for our learning algorithms, demonstrating the
             robustness of parameter estimation. Finally, we complement
             the theory with experiments on large synthetic and real
   Key = {fds341908}

   Author = {Singleton, KR and Crawford, L and Tsui, E and Manchester, HE and Maertens, O and Liu, X and Liberti, MV and Magpusao, AN and Stein, EM and Tingley, JP and Frederick, DT and Boland, GM and Flaherty, KT and McCall, SJ and Krepler, C and Sproesser, K and Herlyn, M and Adams, DJ and Locasale, JW and Cichowski, K and Mukherjee, S and Wood,
   Title = {Melanoma Therapeutic Strategies that Select against
             Resistance by Exploiting MYC-Driven Evolutionary
   Journal = {Cell Reports},
   Volume = {21},
   Number = {10},
   Pages = {2796-2812},
   Year = {2017},
   Month = {December},
   url = {},
   Abstract = {Diverse pathways drive resistance to BRAF/MEK inhibitors in
             BRAF-mutant melanoma, suggesting that durable control of
             resistance will be a challenge. By combining statistical
             modeling of genomic data from matched pre-treatment and
             post-relapse patient tumors with functional interrogation of
             >20 in vitro and in vivo resistance models, we discovered
             that major pathways of resistance converge to activate the
             transcription factor, c-MYC (MYC). MYC expression and
             pathway gene signatures were suppressed following drug
             treatment, and then rebounded during progression.
             Critically, MYC activation was necessary and sufficient for
             resistance, and suppression of MYC activity using genetic
             approaches or BET bromodomain inhibition was sufficient to
             resensitize cells and delay BRAFi resistance. Finally,
             MYC-driven, BRAFi-resistant cells are hypersensitive to the
             inhibition of MYC synthetic lethal partners, including SRC
             family and c-KIT tyrosine kinases, as well as glucose,
             glutamine, and serine metabolic pathways. These insights
             enable the design of combination therapies that select
             against resistance evolution.},
   Doi = {10.1016/j.celrep.2017.11.022},
   Key = {fds330900}

   Author = {Darnell, G and Georgiev, S and Mukherjee, S and Engelhardt,
   Title = {Adaptive randomized dimension reduction on massive
   Journal = {Journal of machine learning research : JMLR},
   Volume = {18},
   Year = {2017},
   Month = {November},
   Abstract = {© 2017 Gregory Darnell, Stoyan Georgiev, Sayan Mukherjee,
             Barbara E Engelhardt. The scalability of statistical
             estimators is of increasing importance in modern
             applications. One approach to implementing scalable
             algorithms is to compress data into a low dimensional latent
             space using dimension reduction methods. In this paper, we
             develop an approach for dimension reduction that exploits
             the assumption of low rank structure in high dimensional
             data to gain both computational and statistical advantages.
             We adapt recent randomized low-rank approximation algorithms
             to provide an efficient solution to principal component
             analysis (PCA), and we use this efficient solver to improve
             estimation in large-scale linear mixed models (LMM) for
             association mapping in statistical genomics. A key
             observation in this paper is that randomization serves a
             dual role, improving both computational and statistical
             performance by implicitly regularizing the covariance matrix
             estimate of the random effect in an LMM. These statistical
             and computational advantages are highlighted in our
             experiments on simulated data and large-scale genomic
   Key = {fds332761}

   Author = {Gao, T and Yapuncich, GS and Daubechies, I and Mukherjee, S and Boyer,
   Title = {Development and Assessment of Fully Automated and Globally
             Transitive Geometric Morphometric Methods, With Application
             to a Biological Comparative Dataset With High Interspecific
   Journal = {The Anatomical Record : Advances in Integrative Anatomy and
             Evolutionary Biology},
   Year = {2017},
   Month = {October},
   url = {},
   Abstract = {Automated geometric morphometric methods are promising tools
             for shape analysis in comparative biology, improving
             researchers' abilities to quantify variation extensively (by
             permitting more specimens to be analyzed) and intensively
             (by characterizing shapes with greater fidelity). Although
             use of these methods has increased, published automated
             methods have some notable limitations: pairwise
             correspondences are frequently inaccurate and pairwise
             mappings are not globally consistent (i.e., they lack
             transitivity across the full sample). Here, we reassess the
             accuracy of published automated methods-cPDist (Boyer et al.
             Proc Nat Acad Sci 108 (2011) 18221-18226) and auto3Dgm
             (Boyer et al.: Anat Rec 298 (2015a) 249-276)-and evaluate
             several modifications to these methods. We show that a
             substantial percentage of alignments and pairwise maps
             between specimens of dissimilar geometries were inaccurate
             in the study of Boyer et al. (Proc Nat Acad Sci 108 (2011)
             18221-18226), despite a taxonomically partitioned variance
             structure of continuous Procrustes distances. We show these
             inaccuracies are remedied using a globally informed
             methodology within a collection of shapes, rather than
             relying on pairwise comparisons (c.f. Boyer et al.: Anat Rec
             298 (2015a) 249-276). Unfortunately, while global
             information generally enhances maps between dissimilar
             objects, it can degrade the quality of correspondences
             between similar objects due to the accumulation of numerical
             error. We explore a number of approaches to mitigate this
             degradation, quantify their performance, and compare the
             generated pairwise maps (and the shape space characterized
             by these maps) to a "ground truth" obtained from landmarks
             manually collected by geometric morphometricians. Novel
             methods both improve the quality of the pairwise
             correspondences relative to cPDist and achieve a taxonomic
             distinctiveness comparable to auto3Dgm. Anat Rec, 2017. ©
             2017 Wiley Periodicals, Inc.},
   Doi = {10.1002/ar.23700},
   Key = {fds330010}

   Author = {Crawford, L and Wood, KC and Zhou, X and Mukherjee,
   Title = {Bayesian Approximate Kernel Regression With Variable
   Journal = {Journal of the American Statistical Association},
   Pages = {1-12},
   Year = {2017},
   Month = {August},
   url = {},
   Doi = {10.1080/01621459.2017.1361830},
   Key = {fds335806}

   Author = {Bobrowski, O and Mukherjee, S and Taylor, JE},
   Title = {Topological consistency via kernel estimation},
   Journal = {Bernoulli : official journal of the Bernoulli Society for
             Mathematical Statistics and Probability},
   Volume = {23},
   Number = {1},
   Pages = {288-328},
   Year = {2017},
   Month = {February},
   url = {},
   Doi = {10.3150/15-BEJ744},
   Key = {fds323270}

   Author = {Tan, Z and Mukherjee, S},
   Title = {Partitioned tensor factorizations for learning mixed
             membership models},
   Journal = {34th International Conference on Machine Learning, Icml
   Volume = {7},
   Pages = {5156-5165},
   Year = {2017},
   Month = {January},
   ISBN = {9781510855144},
   Abstract = {Copyright © 2017 by the authors. We present an efficient
             algorithm for learning mixed membership models when the
             number of variables p is much larger than the number of
             hidden components k. This algorithm reduces the
             computational complexity of state-of-the-art tensor methods,
             which require decomposing an $O(p^3)$ tensor, to factorizing
             $O(p/k)$ sub-tensors each of size $O(k^3)$. In addition, we
             address the issue of negative entries in the empirical
             method of moments based estimators. We provide sufficient
             conditions under which our approach has provable guarantees.
             Our approach obtains competitive empirical results on both
             simulated and real data.},
   Key = {fds335807}

   Author = {Snyder-Mackler, N and Majoros, WH and Yuan, ML and Shaver, AO and Gordon, JB and Kopp, GH and Schlebusch, SA and Wall, JD and Alberts, SC and Mukherjee, S and Zhou, X and Tung, J},
   Title = {Efficient Genome-Wide Sequencing and Low-Coverage Pedigree
             Analysis from Noninvasively Collected Samples.},
   Journal = {Genetics},
   Volume = {203},
   Number = {2},
   Pages = {699-714},
   Year = {2016},
   Month = {June},
   url = {},
   Abstract = {Research on the genetics of natural populations was
             revolutionized in the 1990s by methods for genotyping
             noninvasively collected samples. However, these methods have
             remained largely unchanged for the past 20 years and lag far
             behind the genomics era. To close this gap, here we report
             an optimized laboratory protocol for genome-wide capture of
             endogenous DNA from noninvasively collected samples, coupled
             with a novel computational approach to reconstruct pedigree
             links from the resulting low-coverage data. We validated
             both methods using fecal samples from 62 wild baboons,
             including 48 from an independently constructed extended
             pedigree. We enriched fecal-derived DNA samples up to
             40-fold for endogenous baboon DNA and reconstructed
             near-perfect pedigree relationships even with extremely
             low-coverage sequencing. We anticipate that these methods
             will be broadly applicable to the many research systems for
             which only noninvasive samples are available. The lab
             protocol and software ("WHODAD") are freely available at
   , respectively.},
   Doi = {10.1534/genetics.116.187492},
   Key = {fds322049}

   Author = {Zhao, S and Gao, C and Mukherjee, S and Engelhardt,
   Title = {Bayesian group factor analysis with structured
   Journal = {Journal of machine learning research : JMLR},
   Volume = {17},
   Pages = {1-47},
   Year = {2016},
   Month = {April},
   Abstract = {© 2016 Shiwen Zhao, Chuan Gao, Sayan Mukherjee, and Barbara
             E. Engelhardt. Latent factor models are the canonical
             statistical tool for exploratory analyses of low-dimensional
             linear structure for a matrix of p features across n
             samples. We develop a structured Bayesian group factor
             analysis model that extends the factor model to multiple
             coupled observation matrices; in the case of two
             observations, this reduces to a Bayesian model of canonical
             correlation analysis. Here, we carefully define a structured
             Bayesian prior that encourages both element-wise and
             column-wise shrinkage and leads to desirable behavior on
             high-dimensional data. In particular, our model puts a
             structured prior on the joint factor loading matrix,
             regularizing at three levels, which enables element-wise
             sparsity and unsupervised recovery of latent factors
             corresponding to structured variance across arbitrary
             subsets of the observations. In addition, our structured
             prior allows for both dense and sparse latent factors so
             that covariation among either all features or only a subset
             of features can be recovered. We use fast parameter-expanded
             expectation-maximization for parameter estimation in this
             model. We validate our method on simulated data with
             substantial structure. We show results of our method applied
             to three high-dimensional data sets, comparing results
             against a number of state-of-the-art approaches. These
             results illustrate useful properties of our model, including
             i) recovering sparse signal in the presence of dense
             effects; ii) the ability to scale naturally to large numbers
             of observations; iii) flexible observation- and factor-specific
             regularization to recover factors with a wide variety of
             sparsity levels and percentage of variance explained; and
             iv) tractable inference that scales to modern genomic and
             text data sizes.},
   Key = {fds323271}

   Author = {Galinsky, KJ and Bhatia, G and Loh, P-R and Georgiev, S and Mukherjee,
             S and Patterson, NJ and Price, AL},
   Title = {Fast Principal-Component Analysis Reveals Convergent
             Evolution of ADH1B in Europe and East Asia.},
   Journal = {The American Journal of Human Genetics},
   Volume = {98},
   Number = {3},
   Pages = {456-472},
   Year = {2016},
   Month = {March},
   url = {},
   Abstract = {Searching for genetic variants with unusual differentiation
             between subpopulations is an established approach for
             identifying signals of natural selection. However, existing
             methods generally require discrete subpopulations. We
             introduce a method that infers selection using principal
             components (PCs) by identifying variants whose
             differentiation along top PCs is significantly greater than
             the null distribution of genetic drift. To enable the
             application of this method to large datasets, we developed
             the FastPCA software, which employs recent advances in
             random matrix theory to accurately approximate top PCs while
             reducing time and memory cost from quadratic to linear in
             the number of individuals, a computational improvement of
             many orders of magnitude. We apply FastPCA to a cohort of
             54,734 European Americans, identifying 5 distinct
             subpopulations spanning the top 4 PCs. Using the PC-based
             test for natural selection, we replicate previously known
             selected loci and identify three new genome-wide significant
             signals of selection, including selection in Europeans at
             ADH1B. The coding variant rs1229984(∗)T has previously
             been associated to a decreased risk of alcoholism and shown
             to be under selection in East Asians; we show that it is a
             rare example of independent evolution on two continents. We
             also detect selection signals at IGFBP3 and IGH, which have
             also previously been associated to human
   Doi = {10.1016/j.ajhg.2015.12.022},
   Key = {fds323272}

   Author = {Munch, E and Turner, K and Bendich, P and Mukherjee, S and Mattingly, J and Harer, J},
   Title = {Probabilistic Fréchet means for time varying persistence
             diagrams},
   Journal = {Electronic Journal of Statistics},
   Volume = {9},
   Number = {1},
   Pages = {1173-1204},
   Year = {2015},
   Month = {January},
   url = { Duke open
   Abstract = {© 2015, Institute of Mathematical Statistics. All rights
             reserved. In order to use persistence diagrams as a true
             statistical tool, it would be very useful to have a good
             notion of mean and variance for a set of diagrams. In [23],
             Mileyko and his collaborators made the first study of the
             properties of the Fréchet mean in $(D_p,
             W_p)$, the space of persistence diagrams equipped
             with the p-th Wasserstein metric. In particular, they showed
             that the Fréchet mean of a finite set of diagrams always
             exists, but is not necessarily unique. The means of a
             continuously-varying set of diagrams do not themselves
             (necessarily) vary continuously, which presents obvious
             problems when trying to extend the Fréchet mean definition
             to the realm of time-varying persistence diagrams, better
             known as vineyards. We fix this problem by altering the
             original definition of Fréchet mean so that it now becomes
             a probability measure on the set of persistence diagrams; in
             a nutshell, the mean of a set of diagrams will be a weighted
             sum of atomic measures, where each atom is itself a
             persistence diagram determined using a perturbation of the
             input diagrams. This definition gives for each N a map
             $(D_p)^N \to \mathbb{P}(D_p)$. We show
             that this map is Hölder continuous on finite diagrams and
             thus can be used to build a useful statistic on
   Doi = {10.1214/15-EJS1030},
   Key = {fds258517}

   Author = {Raskutti, G and Mukherjee, S},
   Title = {The information geometry of mirror descent},
   Journal = {Lecture notes in computer science},
   Volume = {9389},
   Pages = {359-368},
   Year = {2015},
   Month = {January},
   ISBN = {9783319250397},
   url = {},
   Abstract = {© Springer International Publishing Switzerland 2015. We
             prove the equivalence of two online learning algorithms,
             mirror descent and natural gradient descent. Both mirror
             descent and natural gradient descent are generalizations of
             online gradient descent when the parameter of interest lies
             on a non-Euclidean manifold. Natural gradient descent
             selects the steepest descent direction along a Riemannian
             manifold by multiplying the standard gradient by the inverse
             of the metric tensor. Mirror descent induces non-Euclidean
             structure by solving iterative optimization problems using
             different proximity functions. In this paper, we prove that
             mirror descent induced by a Bregman divergence proximity
             functions is equivalent to the natural gradient descent
             algorithm on the Riemannian manifold in the dual coordinate
             system. We use techniques from convex analysis and
             connections between Riemannian manifolds, Bregman
             divergences and convexity to prove this result. This
             equivalence between natural gradient descent and mirror
             descent, implies that (1) mirror descent is the steepest
             descent direction along the Riemannian manifold
             corresponding to the choice of Bregman divergence and (2)
             mirror descent with log-likelihood loss applied to parameter
             estimation in exponential families asymptotically achieves
             the classical Cramér-Rao lower bound.},
   Doi = {10.1007/978-3-319-25040-3_39},
   Key = {fds323274}

   Author = {Stewart, L and MacLean, EL and Ivy, D and Woods, V and Cohen, E and Rodriguez, K and McIntyre, M and Mukherjee, S and Call, J and Kaminski,
             J and Miklósi, Á and Wrangham, RW and Hare, B},
   Title = {Citizen Science as a New Tool in Dog Cognition
             Research},
   Journal = {PloS one},
   Volume = {10},
   Number = {9},
   Pages = {e0135176},
   Year = {2015},
   Month = {January},
   url = {},
   Abstract = {Family dogs and dog owners offer a potentially powerful way
             to conduct citizen science to answer questions about animal
             behavior that are difficult to answer with more conventional
             approaches. Here we evaluate the quality of the first data
             on dog cognition collected by citizen scientists using the
             Dognition.com website. We conducted analyses to understand
             if data generated by over 500 citizen scientists replicates
             internally and in comparison to previously published
             findings. Half of participants participated for free while
             the other half paid for access. The website provided each
             participant a temperament questionnaire and instructions on
             how to conduct a series of ten cognitive tests.
             Participation required internet access, a dog and some
             common household items. Participants could record their
             responses on any PC, tablet or smartphone from anywhere in
             the world and data were retained on servers. Results from
             citizen scientists and their dogs replicated a number of
             previously described phenomena from conventional lab-based
             research. There was little evidence that citizen scientists
             manipulated their results. To illustrate the potential uses
             of relatively large samples of citizen science data, we then
             used factor analysis to examine individual differences
             across the cognitive tasks. The data were best explained by
             multiple factors in support of the hypothesis that
             nonhumans, including dogs, can evolve multiple cognitive
             domains that vary independently. This analysis suggests that
             in the future, citizen scientists will generate useful
             datasets that test hypotheses and answer questions as a
             complement to conventional laboratory techniques used to
             study dog psychology.},
   Doi = {10.1371/journal.pone.0135176},
   Key = {fds322050}

   Author = {Turner, K and Mukherjee, S and Boyer, DM},
   Title = {Persistent homology transform for modeling shapes and
             surfaces},
   Journal = {Information and Inference},
   Volume = {3},
   Number = {4},
   Pages = {310-344},
   Year = {2014},
   Month = {January},
   url = {},
   Abstract = {© The authors 2014. Published by Oxford University Press on
             behalf of the Institute of Mathematics and its Applications.
             All rights reserved. We introduce a statistic, the
             persistent homology transform (PHT), to model surfaces in R3
             and shapes in R2. This statistic is a collection of
             persistence diagrams—multiscale topological summaries used
             extensively in topological data analysis. We use the PHT to
             represent shapes and execute operations such as computing
             distances between shapes or classifying shapes. We provide a
             constructive proof that the map from the space of simplicial
             complexes in R3 into the space spanned by this statistic is
             injective. This implies that we can use it to determine a
             metric on the space of piecewise linear shapes. Stability
             results justify that we can approximate this metric using
             finitely many persistence diagrams. We illustrate the
             utility of this statistic on simulated and real
   Doi = {10.1093/imaiai/iau011},
   Key = {fds346294}

   Author = {Bonnefoi, H and Potti, A and Delorenzi, M and Mauriac, L and Campone, M and Tubiana-Hulin, M and Petit, T and Rouanet, P and Jassem, J and Blot, E and Becette, V and Farmer, P and André, S and Acharya, CR and Mukherjee, S and Cameron, D and Bergh, J and Nevins, JR and Iggo, RD},
   Title = {Validation of gene signatures that predict the response of
             breast cancer to neoadjuvant chemotherapy: a substudy of the
             EORTC 10994/BIG 00-01 clinical trial.},
   Journal = {Lancet Oncology},
   Volume = {8},
   Number = {12},
   Pages = {1071-1078},
   Year = {2007},
   Month = {December},
   url = {},
   Key = {fds139739}

   Author = {F. Liang and S. Mukherjee and M. West},
   Title = {Understanding the use of unlabelled data in predictive
             modelling},
   Journal = {Statistical Science},
   Volume = {22},
   Number = {2},
   Pages = {189-205},
   Year = {2007},
   Month = {Fall},
   Key = {fds139736}

   Author = {Jen-Tsan Chi and Edwin H. Rodriguez and Zhen Wang and Dimitry S. A.
             Nuyten and Sayan Mukherjee and Matt van de Rijn and Marc J. van de
             Vijver and Trevor Hastie and Patrick O. Brown},
   Title = {Gene Expression Programs of Human Smooth Muscle Cells:
             Tissue-Specific Differentiation and Prognostic Significance
             in Breast Cancers},
   Journal = {PLoS Genet},
   Volume = {3},
   Number = {9},
   Pages = {1770-1784},
   Year = {2007},
   Month = {September},
   url = {},
   Key = {fds139734}

   Author = {Natesh Pillai and Qiang Wu and Feng Liang and Sayan Mukherjee and Robert
             L. Wolpert},
   Title = {Characterizing the function space for Bayesian kernel
             models},
   Journal = {Journal of Machine Learning Research},
   Volume = {8},
   Pages = {1769--1797},
   Year = {2007},
   Month = {August},
   url = {},
   Abstract = {},
   Key = {fds70467}

   Author = {Liang Goh and Susan K. Murphy and Sayan Mukherjee and Terrence S.
             Furey},
   Title = {Genomic sweeping for hypermethylated genes},
   Journal = {Bioinformatics},
   Volume = {23},
   Number = {3},
   Pages = {281-288},
   Year = {2007},
   Month = {February},
   url = {},
   Key = {fds51090}

   Author = {Zhong Wang and Huntington F. Willard and Sayan Mukherjee and Terrence
             S. Furey},
   Title = {Evidence of Influence of Genomic DNA Sequence on Human X
             Chromosome Inactivation},
   Journal = {Public Library of Science Computational Biology},
   Volume = {2},
   Number = {9},
   Pages = {979-988},
   Year = {2006},
   Month = {Winter},
   url = {},
   Key = {fds51089}

   Author = {S. Mukherjee and Q. Wu},
   Title = {Estimation of Gradients and Coordinate Covariation in
             Classification},
   Journal = {Journal of Machine Learning Research},
   Volume = {7},
   Pages = {2481--2514},
   Year = {2006},
   Month = {November},
   url = {},
   Key = {fds51092}

   Author = {S. Mukherjee and DX. Zhou},
   Title = {Learning Coordinate Covariances via Gradients},
   Journal = {Journal of Machine Learning Research},
   Volume = {7},
   Pages = {519-549},
   Year = {2006},
   Month = {March},
   url = {},
   Key = {fds46889}

   Author = {Elena Edelman and Alessandro Porrello and Justin Guinney and Bala Balakumaran and Andrea Bild and Phillip G. Febbo and Sayan
             Mukherjee},
   Title = {Analysis of Sample Set Enrichment Scores: assaying the
             enrichment of sets of genes for individual samples in
             genome-wide expression profiles},
   Journal = {Bioinformatics},
   Volume = {22},
   Number = {14},
   Pages = {e101-e116},
   Year = {2006},
   url = {},
   Key = {fds51008}

   Author = {Daniela Tropea and Gabriel Kreiman and Alvin Lyckman and Sayan
             Mukherjee and Hongbo Yu and Sam Horng and Mriganka
             Sur},
   Title = {Gene expression changes and molecular pathways mediating
             activity-dependent plasticity in visual cortex},
   Journal = {Nature Neuroscience},
   Volume = {9},
   Pages = {660-668},
   Year = {2006},
   url = {},
   Key = {fds48249}

   Author = {A. Potti and S. Mukherjee and R. Petersen and HK. Dressman and A. Bild and J. Koontz and R. Kratzke and MA. Watson and M. Kelley},
   Title = {A Genomic Strategy to Refine Prognosis in Early Stage
             Non-Small Cell Lung Carcinoma},
   Journal = {New England Journal of Medicine},
   Volume = {355},
   Number = {6},
   Pages = {570-580},
   Year = {2006},
   url = {},
   Key = {fds51088}

   Author = {A. Subramanian and P. Tamayo and VK. Mootha and S. Mukherjee and BL.
             Ebert and MA. Gillette and A. Paulovich and SL. Pomeroy and TR. Golub and ES. Lander and JP. Mesirov},
   Title = {Gene set enrichment analysis: a knowledge-based approach for
             interpreting genome-wide expression profiles},
   Journal = {PNAS},
   Volume = {102},
   Number = {43},
   Pages = {15278-9},
   Year = {2005},
   Month = {October},
   url = {},
   Key = {fds46899}

   Author = {A. Rakhlin and D. Panchenko and S. Mukherjee},
   Title = {Risk Bounds for Mixture Density Estimation},
   Journal = {ESAIM: Probability and Statistics},
   Volume = {9},
   Pages = {220-229},
   Year = {2005},
   Month = {June},
   Key = {fds46886}

   Author = {Sweet-Cordero, A. and Mukherjee, S. and You, H. and Subramanian, S. and Ladd, C. and Roix, J. and Mesirov, J.P. and Golub, T.R. and Jacks, T},
   Title = {An oncogenic KRAS2 expression signature identified by
             cross-species gene-expression analysis},
   Journal = {Nature Genetics},
   Volume = {37},
   Number = {1},
   Pages = {48-55},
   Year = {2005},
   Month = {January},
   url = {},
   Key = {fds32735}

   Author = {A. Rakhlin and S. Mukherjee and T. Poggio},
   Title = {Stability Results In Learning Theory},
   Journal = {Analysis and Applications},
   Volume = {3},
   Number = {4},
   Pages = {397–417},
   Year = {2005},
   url = {},
   Key = {fds46888}

   Author = {P. Golland and F. Liang and S. Mukherjee and D. Panchenko},
   Title = {Permutation Tests for Classification},
   Pages = {501-515},
   Booktitle = {Proceedings of Computational Learning Theory
   Publisher = {Springer-Verlag},
   Editor = {P. Auer and R. Meir},
   Year = {2005},
   Key = {fds46887}

   Author = {S. Mukherjee and P. Niyogi and T. Poggio and R. Rifkin},
   Title = {Statistical Learning: Stability is Sufficient for
             Generalization and Necessary and Sufficient for Consistency
             of Empirical Risk Minimization},
   Journal = {Advances in Computational Mathematics},
   Volume = {25},
   Number = {1-3},
   Pages = {161 - 193},
   Year = {2005},
   url = {,10,17;journal,2,57;linkingpublicationresults,1:101738,1},
   Key = {fds46890}

   Author = {R. Berger and PG. Febbo and PK. Majumder and JJ. Zhao and S. Mukherjee and T Campbell and WR. Sellers and TM. Roberts and M. Loda and TR. Golub and WC. Hahn},
   Title = {Androgen-Induced Differentiation and Tumorigenicity of Human
             Prostate Epithelial Cells},
   Journal = {Cancer Research},
   Volume = {64},
   Pages = {8867-8875},
   Year = {2004},
   Month = {December},
   url = {},
   Key = {fds32736}

   Author = {T. Poggio and R. Rifkin and S. Mukherjee and P. Niyogi},
   Title = {Learning Theory: general conditions for predictivity},
   Journal = {Nature},
   Volume = {428},
   Pages = {419-422},
   Year = {2004},
   Month = {March},
   url = {},
   Key = {fds32740}

   Author = {R. Rifkin and S. Mukherjee and P. Tamayo and S. Ramaswamy and CH. Yeang and M. Reich and T. Poggio and ES. Lander and TR. Golub and JP.
             Mesirov},
   Title = {An Analytical Method for Multi-Class Cancer
             Classification},
   Journal = {SIAM Review},
   Volume = {45},
   Number = {4},
   Pages = {706-723},
   Year = {2003},
   Month = {Winter},
   url = {},
   Key = {fds32748}

   Author = {S. Mukherjee and P. Tamayo and S. Rogers and R. Rifkin and A. Engle and C.
             Campbell and TR. Golub and JP. Mesirov},
   Title = {Estimating Dataset Size Requirements for Classifying DNA
             Microarray Data},
   Journal = {Journal of Computational Biology},
   Volume = {10},
   Number = {2},
   Pages = {119-142},
   Year = {2003},
   Month = {April},
   url = {},
   Key = {fds32741}

   Author = {LD. Miller and PM. Long and L. Wong and S. Mukherjee and LM. McShane and ET.
             Liu},
   Title = {Optimal gene expression analysis by microarrays},
   Journal = {Cancer Cell},
   Volume = {2},
   Pages = {353-361},
   Year = {2002},
   Month = {November},
   url = {},
   Key = {fds32749}

   Author = {S. Pomeroy and P. Tamayo and M. Gaasenbeek and L. Sturla and M. Angelo and M. McLaughlin and J. Kim and L. Goumnerova and P. Black and C. Lau and J.
             Allen and D. Zagzag and J. Olson and T. Curran and C. Wetmore and J.
             Biegel and T. Poggio and S. Mukherjee and R. Rifkin and A. Califano and G.
             Stolovitzky and D. Louis},
   Title = {Prediction of central nervous system embryonal tumour
             outcome based on gene expression},
   Journal = {Nature},
   Volume = {415},
   Number = {24},
   Pages = {436-442},
   Year = {2002},
   Month = {January},
   url = {},
   Key = {fds46897}

   Author = {Mukherjee, N and Mukherjee, S},
   Title = {Predicting signal peptides with support vector
             machines},
   Journal = {Lecture notes in computer science},
   Volume = {2388},
   Pages = {1-7},
   Year = {2002},
   Month = {January},
   ISBN = {354044016X},
   url = {},
   Abstract = {© Springer-Verlag Berlin Heidelberg 2002. We examine using a
             Support Vector Machine to predict secretory signal peptides.
             We predict signal peptides for both prokaryotic and
             eukaryotic signal organisms. Signalling peptides versus
             non-signaling peptides as well as cleavage sites were
             predicted from a sequence of amino acids. Two types of
             kernels (each corresponding to different metrics) were used:
             hamming distance, a distance based upon the percent accepted
             mutation (PAM) score trained on the same signal peptide
   Doi = {10.1007/3-540-45665-1_1},
   Key = {fds323275}

   Author = {S. Ramaswamy and P. Tamayo and R. Rifkin and S. Mukherjee and CH Yeang and M. Angelo and C. Ladd and M. Reich and E. Latulippe and JP. Mesirov and T.
             Poggio and W. Gerald and M. Loda and ES. Lander and TR.
             Golub},
   Title = {Multiclass cancer diagnosis using tumor gene expression
             signatures},
   Journal = {PNAS},
   Volume = {98},
   Number = {26},
   Pages = {15149-15154},
   Year = {2001},
   Month = {December},
   url = {},
   Key = {fds46898}

   Author = {O. Chapelle and V. Vapnik and O. Bousquet and S. Mukherjee},
   Title = {Choosing Multiple Parameters for Support Vector
             Machines},
   Journal = {Machine Learning},
   Volume = {46},
   Number = {1-3},
   Pages = {131-159},
   Year = {2001},
   Month = {March},
   url = {,7,19;journal,36,205;linkingpublicationresults,1:100309,1},
   Key = {fds46893}

   Author = {Peshkin, L and Mukherjee, S},
   Title = {Bounds on sample size for policy evaluation in Markov
             environments},
   Journal = {Lecture notes in computer science},
   Volume = {2111},
   Pages = {616-629},
   Year = {2001},
   Month = {January},
   ISSN = {0302-9743},
   Abstract = {© Springer-Verlag Berlin Heidelberg 2001. Reinforcement
             learning means finding the optimal course of action in
             Markovian environments without knowledge of the
             environment’s dynamics. Stochastic optimization algorithms
             used in the field rely on estimates of the value of a
             policy. Typically, the value of a policy is estimated from
             results of simulating that very policy in the environment.
             This approach requires a large amount of simulation as
             different points in the policy space are considered. In this
             paper, we develop value estimators that utilize data
             gathered when using one policy to estimate the value of
             using another policy, resulting in much more data-efficient
             algorithms. We consider the question of accumulating a
             sufficient experience and give PAC-style
   Key = {fds258516}

   Author = {J. Weston and S. Mukherjee and O. Chapelle and M. Pontil and T. Poggio and V. Vapnik},
   Title = {Feature Selection for SVMs},
   Volume = {14},
   Pages = {668-674},
   Booktitle = {Proceedings of Advances in Neural Information Processing
             Systems},
   Year = {2001},
   Key = {fds46894}

   Author = {CH Yeang and S. Ramaswamy and P. Tamayo and S. Mukherjee and R. Rifkin and M. Angelo and M. Reich and E. Lander and J. Mesirov and T.
             Golub},
   Title = {Molecular classification of multiple tumor
             types},
   Journal = {Bioinformatics},
   Volume = {1},
   Number = {1},
   Pages = {1-7},
   Year = {2001},
   url = {},
   Key = {fds46896}

   Author = {Pontil, M and Mukherjee, S and Girosi, F},
   Title = {On the noise model of support vector machines
             regression},
   Journal = {Lecture notes in computer science},
   Volume = {1968},
   Pages = {316-324},
   Booktitle = {Proceedings of Algorithmic Learning Theory 11th
   Publisher = {Springer, Berlin},
   Year = {2000},
   Month = {January},
   ISBN = {9783540412373},
   Abstract = {© Springer-Verlag Berlin Heidelberg 2000. Support Vector
             Machines Regression (SVMR) is a learning technique where the
             goodness of fit is measured not by the usual quadratic loss
             function (the mean square error), but by a different loss
             function called the ε-Insensitive Loss Function (ILF), which
             is similar to loss functions used in the field of robust
             statistics. The quadratic loss function is well justified
             under the assumption of Gaussian additive noise. However,
             the noise model underlying the choice of the ILF is not
             clear. In this paper the use of the ILF is justified under
             the assumption that the noise is additive and Gaussian,
             where the variance and mean of the Gaussian are random
             variables. The probability distributions for the variance
             and mean will be stated explicitly. While this work is
             presented in the framework of SVMR, it can be extended to
             justify non-quadratic loss functions in any Maximum
             Likelihood or Maximum A Posteriori approach. It applies not
             only to the ILF, but to a much broader class of loss
   Key = {fds323276}

   Author = {V. Vapnik and S. Mukherjee},
   Title = {Support vector method for multivariate density
             estimation},
   Volume = {12},
   Pages = {659--665},
   Booktitle = {Proceedings of Advances in Neural Information Processing
             Systems},
   Editor = {S. A. Solla and T. K. Leen and K.R. Muller},
   Year = {2000},
   Key = {fds46895}

%% Papers Accepted   
   Author = {E. Edelman and J. Guinney and J-T. Chi and P.G. Febbo and S.
             Mukherjee},
   Title = {Modeling Cancer Progression via Pathway Dependencies},
   Journal = {Public Library of Science Computational Biology},
   Year = {2007},
   url = {},
   Key = {fds139738}

%% Papers Submitted   
   Author = {Q. Wu and S. Mukherjee and F. Liang},
   Title = {Regularized sliced inverse regression for kernel
             models},
   Journal = {Biometrika},
   Year = {2007},
   url = {},
   Key = {fds139740}

   Author = {F. Liang and K. Mao and M. Liao and S. Mukherjee and M.
             West},
   Title = {Non-parametric Bayesian kernel models},
   Journal = {Biometrika},
   Year = {2007},
   url = {},
   Key = {fds70469}

   Author = {J. Guinney and Q. Wu and S. Mukherjee},
   Title = {Estimating variable structure and dependence in Multi-task
             learning via gradients},
   Journal = {Journal of Machine Learning Research},
   Year = {2007},
   url = {},
   Key = {fds70472}

   Author = {Q. Wu and J. Guinney and M. Maggioni and S. Mukherjee},
   Title = {Learning gradients: predictive models that infer geometry
             and dependence},
   Journal = {Journal of Machine Learning Research},
   Year = {2007},
   url = {},
   Key = {fds139737}

   Author = {S. Mukherjee and Q. Wu and D-X. Zhou},
   Title = {Learning Gradients and Feature Selection on
             Manifolds},
   Journal = {Annals of Statistics},
   Year = {2007},
   url = {},
   Key = {fds51095}

%% Chapters   
   Author = {Huang, B and Jarrett, NWD and Babu, S and Mukherjee, S and Yang,
             J},
   Title = {Cümülön: Matrix-Based data analytics in the cloud with
             spot instances},
   Volume = {9},
   Pages = {156-167},
   Booktitle = {Proceedings of the VLDB Endowment},
   Year = {2016},
   Month = {January},
   Abstract = {We describe Cümülön, a system aimed at helping users
             develop and deploy matrix-based data analysis programs in a
             public cloud. A key feature of Cümülön is its end-to-end
             support for the so-called spot instances—machines whose
             market price fluctuates over time but is usually much lower
             than the regular fixed price. A user sets a bid price when
             acquiring spot instances, and loses them as soon as the
             market price exceeds the bid price. While spot instances can
             potentially save cost, they are difficult to use
             effectively, and run the risk of not finishing work while
             costing more. Cümülön provides a highly elastic
             computation and storage engine on top of spot instances, and
             offers automatic cost-based optimization of execution,
             deployment, and bidding strategies. Cümülön further
             quantifies how the uncertainty in the market price
             translates into the cost uncertainty of its recommendations,
             and allows users to specify their risk tolerance as an
             optimization constraint.},
   Key = {fds323273}
ph: 919.660.2800
fax: 919.660.2821

Mathematics Department
Duke University, Box 90320
Durham, NC 27708-0320