%% Books
@book{fds330642,
Author = {Rudin, C},
Title = {Turning prediction tools into decision tools},
Volume = {9356},
Year = {2015},
Month = {January},
ISBN = {9783319242811},
Key = {fds330642}
}
%% Papers Published
@article{fds374391,
Author = {Falcinelli, SD and Cooper-Volkheimer, AD and Semenova, L and Wu, E and Richardson, A and Ashokkumar, M and Margolis, DM and Archin, NM and Rudin, CD and Murdoch, D and Browne, EP},
Title = {Impact of Cannabis Use on Immune Cell Populations and the
Viral Reservoir in People With HIV on Suppressive
Antiretroviral Therapy.},
Journal = {J Infect Dis},
Volume = {228},
Number = {11},
Pages = {1600-1609},
Year = {2023},
Month = {November},
url = {http://dx.doi.org/10.1093/infdis/jiad364},
Abstract = {BACKGROUND: Human immunodeficiency virus (HIV) infection
remains incurable due to the persistence of a viral
reservoir despite antiretroviral therapy (ART). Cannabis
(CB) use is prevalent amongst people with HIV (PWH), but the
impact of CB on the latent HIV reservoir has not been
investigated. METHODS: Peripheral blood cells from a cohort
of PWH who use CB and a matched cohort of PWH who do not use
CB on ART were evaluated for expression of
maturation/activation markers, HIV-specific T-cell
responses, and intact proviral DNA. RESULTS: CB use was
associated with increased abundance of naive T cells,
reduced effector T cells, and reduced expression of
activation markers. CB use was also associated with reduced
levels of exhausted and senescent T cells compared to
nonusing controls. HIV-specific T-cell responses were
unaffected by CB use. CB use was not associated with intact
or total HIV DNA frequency in CD4 T cells. CONCLUSIONS: This
analysis is consistent with the hypothesis that CB use
reduces activation, exhaustion, and senescence in the T
cells of PWH, and does not impair HIV-specific CD8 T-cell
responses. Longitudinal and interventional studies with
evaluation of CB exposure are needed to fully evaluate the
impact of CB use on the HIV reservoir.},
Doi = {10.1093/infdis/jiad364},
Key = {fds374391}
}
@article{fds373363,
Author = {Garrett, BL and Rudin, C},
Title = {Interpretable algorithmic forensics.},
Journal = {Proceedings of the National Academy of Sciences of the
United States of America},
Volume = {120},
Number = {41},
Pages = {e2301842120},
Year = {2023},
Month = {October},
url = {http://dx.doi.org/10.1073/pnas.2301842120},
Abstract = {One of the most troubling trends in criminal investigations
   is the growing use of "black box" technology, in which law
   enforcement rely on artificial intelligence (AI) models or
   algorithms that are either too complex for people to
   understand or that simply conceal how they function. In
   criminal cases, black box systems have proliferated in
   forensic areas such as DNA mixture interpretation, facial
   recognition, and recidivism risk assessments. The champions
   and critics of AI argue, mistakenly, that we face a
   catch-22: while black box AI is not understandable by
   people, they assume that it produces more accurate forensic
   evidence. In
this Article, we question this assertion, which has so
powerfully affected judges, policymakers, and academics. We
describe a mature body of computer science research showing
how "glass box" AI-designed to be interpretable-can be more
accurate than black box alternatives. Indeed, black box AI
performs predictably \emph{worse} in settings like the
criminal system. Debunking the black box performance myth
has implications for forensic evidence, constitutional
criminal procedure rights, and legislative policy. Absent
some compelling-or even credible-government interest in
keeping AI as a black box, and given the constitutional
rights and public safety interests at stake, we argue that a
substantial burden rests on the government to justify black
box AI in criminal cases. We conclude by calling for
judicial rulings and legislation to safeguard a right to
interpretable forensic AI.},
Doi = {10.1073/pnas.2301842120},
Key = {fds373363}
}
@article{fds372809,
Author = {Hahn, S and Zhu, R and Mak, S and Rudin, C and Jiang,
Y},
Title = {An Interpretable, Flexible, and Interactive Probabilistic
Framework for Melody Generation},
Journal = {Proceedings of the ACM SIGKDD International Conference on
Knowledge Discovery and Data Mining},
Pages = {4089-4099},
Year = {2023},
Month = {August},
ISBN = {9798400701030},
url = {http://dx.doi.org/10.1145/3580305.3599772},
Abstract = {Demand for algorithmic music generation is growing rapidly
   throughout entertainment, art, and education.
Unfortunately, most recent models are practically impossible
to interpret or musically fine-tune, as they use deep neural
networks with thousands of parameters. We introduce an
interpretable, flexible, and interactive model,
SchenkComposer, for melody generation that empowers users to
be creative in all aspects of the music generation pipeline
and allows them to learn from the process. We divide the
task of melody generation into steps based on the process
that a human composer using music-theoretical domain
knowledge might use. First, the model determines phrase
structure based on form analysis and identifies an
appropriate number of measures. Using concepts from
Schenkerian analysis, the model then finds a fitting
harmonic rhythm, middleground harmonic progression,
foreground rhythm, and melody in a hierarchical, scaffolded
approach using a probabilistic context-free grammar based on
musical contours. By incorporating theories of musical form
and harmonic structure, our model produces music with
long-term structural coherence. In extensive human
   experiments, we find that music generated with our approach
   successfully passes a Turing test while current
   state-of-the-art approaches fail, and we further
demonstrate superior performance and preference for our
melodies compared to existing melody generation methods.
Additionally, we developed and deployed a public website for
SchenkComposer, and conducted preliminary user surveys.
Through analysis, we show the strong viability and
enjoyability of SchenkComposer.},
Doi = {10.1145/3580305.3599772},
Key = {fds372809}
}
@article{fds371244,
Author = {Parikh, H and Hoffman, K and Sun, H and Zafar, SF and Ge, W and Jing, J and Liu, L and Sun, J and Struck, A and Volfovsky, A and Rudin, C and Westover,
MB},
Title = {Effects of epileptiform activity on discharge outcome in
critically ill patients in the USA: a retrospective
cross-sectional study.},
Journal = {The Lancet. Digital health},
Volume = {5},
Number = {8},
Pages = {e495-e502},
Year = {2023},
Month = {August},
url = {http://dx.doi.org/10.1016/s2589-7500(23)00088-2},
Abstract = {BACKGROUND: Epileptiform activity is associated with worse
   patient outcomes, including increased risk of disability
   and death. However, the effect of epileptiform activity on
   neurological outcome is confounded by the feedback between
   treatment with antiseizure medications and epileptiform
   activity burden. We aimed to quantify the heterogeneous
   effects of epileptiform activity with an
   interpretability-centred approach. METHODS: We did a
   retrospective, cross-sectional study of patients in the
   intensive care unit who were admitted to Massachusetts
   General Hospital (Boston, MA, USA). Participants were aged
   18 years or older and had electrographic epileptiform
   activity identified by a clinical neurophysiologist or
   epileptologist. The outcome was the dichotomised modified
   Rankin Scale (mRS) at discharge and the exposure was
   epileptiform activity burden defined as mean or maximum
   proportion of time spent with epileptiform activity in 6 h
   windows in the first 24 h of electroencephalography. We
   estimated the change in discharge mRS if everyone in the
   dataset had experienced a specific epileptiform activity
   burden and were untreated. We combined pharmacological
   modelling with an interpretable matching method to account
   for confounding and epileptiform activity-antiseizure
   medication feedback. The quality of the matched groups was
   validated by the neurologists. FINDINGS: Between Dec 1,
   2011, and Oct 14, 2017, 1514 patients were admitted to the
   Massachusetts General Hospital intensive care unit, 995
   (66%) of whom were included in the analysis. Compared with
   patients with a maximum epileptiform activity of 0 to less
   than 25%, patients with a maximum epileptiform activity
   burden of 75% or more when untreated had a mean 22·27% (SD
   0·92) increased chance of a poor outcome (severe disability
   or death). Moderate but long-lasting epileptiform activity
   (mean epileptiform activity burden 2% to <10%) increased
   the risk of a poor outcome by a mean of 13·52% (SD 1·93).
   The effect sizes were heterogeneous depending on
   preadmission profile; eg, patients with hypoxic-ischaemic
   encephalopathy or acquired brain injury were more adversely
   affected compared with patients without these conditions.
   INTERPRETATION: Our results suggest that interventions
   should put a higher priority on patients with an average
   epileptiform activity burden of 10% or greater, and
   treatment should be more conservative when maximum
   epileptiform activity burden is low. Treatment should also
   be tailored to individual preadmission profiles because the
   potential for epileptiform activity to cause harm depends
   on age, medical history, and reason for
   admission. FUNDING: National Institutes of Health and
   National Science Foundation.},
Doi = {10.1016/s2589-7500(23)00088-2},
Key = {fds371244}
}
@article{fds371608,
Author = {Peloquin, J and Kirillova, A and Rudin, C and Brinson, LC and Gall,
K},
Title = {Prediction of tensile performance for 3D printed
photopolymer gyroid lattices using structural porosity, base
material properties, and machine learning},
Journal = {Materials and Design},
Volume = {232},
Year = {2023},
Month = {August},
url = {http://dx.doi.org/10.1016/j.matdes.2023.112126},
Abstract = {Advancements in additive manufacturing (AM) technology and
three-dimensional (3D) modeling software have enabled the
fabrication of parts with combinations of properties that
were impossible to achieve with traditional manufacturing
techniques. Porous designs such as truss-based and
sheet-based lattices have gained much attention in recent
years due to their versatility. The multitude of lattice
design possibilities, coupled with a growing list of
available 3D printing materials, has provided a vast range
of 3D printable structures that can be used to achieve
desired performance. However, the process of computationally
or experimentally evaluating many combinations of base
material and lattice design for a given application is
impractical. This research proposes a framework for quickly
predicting key mechanical properties of 3D printed gyroid
lattices using information about the base material and
porosity of the structure. Experimental data was gathered to
train a simple, interpretable, and accurate kernel ridge
regression machine learning model. The performance of the
model was then compared to numerical simulation data and
demonstrated similar accuracy at a fraction of the
computation time. Ultimately, this model development serves
   as an advance in ML-driven mechanical property prediction
   that can guide the extension of current and future
   models.},
Doi = {10.1016/j.matdes.2023.112126},
Key = {fds371608}
}
@article{fds371888,
Author = {Peloquin, J and Kirillova, A and Mathey, E and Rudin, C and Brinson, LC and Gall, K},
Title = {Tensile performance data of 3D printed photopolymer gyroid
lattices.},
Journal = {Data in brief},
Volume = {49},
Pages = {109396},
Year = {2023},
Month = {August},
url = {http://dx.doi.org/10.1016/j.dib.2023.109396},
Abstract = {Additive manufacturing has provided the ability to
manufacture complex structures using a wide variety of
materials and geometries. Structures such as triply periodic
minimal surface (TPMS) lattices have been incorporated into
products across many fields due to their unique combinations
of mechanical, geometric, and physical properties. Yet, the
near limitless possibility of combining geometry and
material into these lattices leaves much to be discovered.
This article provides a dataset of experimentally gathered
tensile stress-strain curves and measured porosity values
for 389 unique gyroid lattice structures manufactured using
vat photopolymerization 3D printing. The lattice samples
were printed from one of twenty different photopolymer
materials available from either Formlabs, LOCTITE AM, or
ETEC that range from strong and brittle to elastic and
ductile and were printed on commercially available 3D
printers, specifically the Formlabs Form2, Prusa SL1, and
ETEC Envision One cDLM Mechanical. The stress-strain curves
were recorded with an MTS Criterion C43.504 mechanical
testing apparatus and following ASTM standards, and the void
fraction or "porosity" of each lattice was measured using a
calibrated scale. This data serves as a valuable resource
for use in the development of novel printing materials and
lattice geometries and provides insight into the influence
of photopolymer material properties on the printability,
geometric accuracy, and mechanical performance of 3D printed
lattice structures. The data described in this article was
used to train a machine learning model capable of predicting
mechanical properties of 3D printed gyroid lattices based on
the base mechanical properties of the printing material and
porosity of the lattice in the research article
[1].},
Doi = {10.1016/j.dib.2023.109396},
Key = {fds371888}
}
@article{fds373056,
Author = {McDonald, SM and Augustine, EK and Lanners, Q and Rudin, C and Brinson,
   LC and Becker, ML},
Title = {Applied machine learning as a driver for polymeric
biomaterials design.},
Journal = {Nature communications},
Volume = {14},
Number = {1},
Pages = {4838},
Year = {2023},
Month = {August},
url = {http://dx.doi.org/10.1038/s41467-023-40459-8},
Abstract = {Polymers are ubiquitous to almost every aspect of modern
society and their use in medical products is similarly
pervasive. Despite this, the diversity in commercial
polymers used in medicine is stunningly low. Considerable
time and resources have been extended over the years towards
the development of new polymeric biomaterials which address
unmet needs left by the current generation of medical-grade
polymers. Machine learning (ML) presents an unprecedented
opportunity in this field to bypass the need for
trial-and-error synthesis, thus reducing the time and
resources invested into new discoveries critical for
advancing medical treatments. Current efforts pioneering
applied ML in polymer design have employed combinatorial and
high throughput experimental design to address data
availability concerns. However, the lack of available and
standardized characterization of parameters relevant to
medicine, including degradation time and biocompatibility,
represents a nearly insurmountable obstacle to ML-aided
design of biomaterials. Herein, we identify a gap at the
intersection of applied ML and biomedical polymer design,
highlight current works at this junction more broadly and
provide an outlook on challenges and future
directions.},
Doi = {10.1038/s41467-023-40459-8},
Key = {fds373056}
}
@article{fds372436,
Author = {Zhang, R and Xin, R and Seltzer, M and Rudin, C},
Title = {Optimal Sparse Regression Trees},
Journal = {Proceedings of the 37th AAAI Conference on Artificial
Intelligence, AAAI 2023},
Volume = {37},
Pages = {11270-11279},
Year = {2023},
Month = {June},
ISBN = {9781577358800},
Abstract = {Regression trees are one of the oldest forms of AI models,
and their predictions can be made without a calculator,
which makes them broadly useful, particularly for
high-stakes applications. Within the large literature on
regression trees, there has been little effort towards full
provable optimization, mainly due to the computational
hardness of the problem. This work proposes a
dynamic-programming-with-bounds approach to the construction
of provably-optimal sparse regression trees. We leverage a
novel lower bound based on an optimal solution to the
k-Means clustering problem on one-dimensional data. We are
often able to find optimal sparse trees in seconds, even for
challenging datasets that involve large numbers of samples
and highly-correlated features.},
Key = {fds372436}
}
@article{fds362970,
Author = {Wang, C and Han, B and Patel, B and Rudin, C},
Title = {In Pursuit of Interpretable, Fair and Accurate Machine
Learning for Criminal Recidivism Prediction},
Journal = {Journal of Quantitative Criminology},
Volume = {39},
Number = {2},
Pages = {519-581},
Year = {2023},
Month = {June},
url = {http://dx.doi.org/10.1007/s10940-022-09545-w},
Abstract = {Objectives: We study interpretable recidivism prediction
using machine learning (ML) models and analyze performance
in terms of prediction ability, sparsity, and fairness.
Unlike previous works, this study trains interpretable
models that output probabilities rather than binary
predictions, and uses quantitative fairness definitions to
assess the models. This study also examines whether models
can generalize across geographic locations. Methods: We
generated black-box and interpretable ML models on two
different criminal recidivism datasets from Florida and
Kentucky. We compared predictive performance and fairness of
these models against two methods that are currently used in
the justice system to predict pretrial recidivism: the
Arnold PSA and COMPAS. We evaluated predictive performance
of all models on predicting six different types of crime
over two time spans. Results: Several interpretable ML
models can predict recidivism as well as black-box ML models
and are more accurate than COMPAS or the Arnold PSA. These
models are potentially useful in practice. Similar to the
Arnold PSA, some of these interpretable models can be
written down as a simple table. Others can be displayed
using a set of visualizations. Our geographic analysis
indicates that ML models should be trained separately for
separate locations and updated over time. We also present a
fairness analysis for the interpretable models. Conclusions:
Interpretable ML models can perform just as well as
non-interpretable methods and currently-used risk assessment
scales, in terms of both prediction accuracy and fairness.
ML models might be more accurate when trained separately for
distinct locations and kept up-to-date.},
Doi = {10.1007/s10940-022-09545-w},
Key = {fds362970}
}
@article{fds371245,
Author = {Ou, YJ and Barnett, AJ and Mitra, A and Schwartz, FR and Chen, C and Grimm,
L and Lo, JY and Rudin, C},
Title = {A user interface to communicate interpretable AI decisions
to radiologists},
Journal = {Progress in Biomedical Optics and Imaging - Proceedings of
SPIE},
Volume = {12467},
Year = {2023},
Month = {January},
ISBN = {9781510660397},
url = {http://dx.doi.org/10.1117/12.2654068},
Abstract = {Tools for computer-aided diagnosis based on deep learning
have become increasingly important in the medical field.
Such tools can be useful, but require effective
communication of their decision-making process in order to
safely and meaningfully guide clinical decisions. Inherently
interpretable models provide an explanation for each
decision that matches their internal decision-making
process. We present a user interface that incorporates the
Interpretable AI Algorithm for Breast Lesions (IAIA-BL)
model, which interpretably predicts both mass margin and
malignancy for breast lesions. The user interface displays
the most relevant aspects of the model's explanation
including the predicted margin value, the AI confidence in
the prediction, and the two most highly activated prototypes
for each case. In addition, this user interface includes
full-field and cropped images of the region of interest, as
well as a questionnaire suitable for a reader study. Our
preliminary results indicate that the model increases the
readers' confidence and accuracy in their decisions on
margin and malignancy.},
Doi = {10.1117/12.2654068},
Key = {fds371245}
}
@article{fds372810,
Author = {Lanners, Q and Parikh, H and Volfovsky, A and Rudin, C and Page,
D},
Title = {Variable Importance Matching for Causal Inference},
Journal = {Proceedings of Machine Learning Research},
Volume = {216},
Pages = {1174-1184},
Year = {2023},
Month = {January},
Abstract = {Our goal is to produce methods for observational causal
inference that are auditable, easy to troubleshoot, accurate
for treatment effect estimation, and scalable to
high-dimensional data. We describe a general framework
called Model-to-Match that achieves these goals by (i)
learning a distance metric via outcome modeling, (ii)
creating matched groups using the distance metric, and (iii)
using the matched groups to estimate treatment effects.
Model-to-Match uses variable importance measurements to
construct a distance metric, making it a flexible framework
that can be adapted to various applications. Concentrating
on the scalability of the problem in the number of potential
confounders, we operationalize the Model-to-Match framework
with LASSO. We derive performance guarantees for settings
where LASSO outcome modeling consistently identifies all
confounders (importantly without requiring the linear model
to be correctly specified). We also provide experimental
results demonstrating the method's auditability, accuracy,
and scalability as well as extensions to more general
nonparametric outcome modeling.},
Key = {fds372810}
}
@article{fds373003,
Author = {Agnew, E and Qiu, M and Zhu, L and Wiseman, S and Rudin,
C},
Title = {The Mechanical Bard: An Interpretable Machine Learning
Approach to Shakespearean Sonnet Generation},
Journal = {Proceedings of the Annual Meeting of the Association for
Computational Linguistics},
Volume = {2},
Pages = {1627-1638},
Year = {2023},
Month = {January},
ISBN = {9781959429715},
Abstract = {We consider the automated generation of sonnets, a poetic
form constrained according to meter, rhyme scheme, and
length. Sonnets generally also use rhetorical figures,
expressive language, and a consistent theme or narrative.
Our constrained decoding approach allows for the generation
of sonnets within preset poetic constraints, while using a
relatively modest neural backbone. Human evaluation confirms
that our approach produces Shakespearean sonnets that
resemble human-authored sonnets, and which adhere to the
genre’s defined constraints and contain lyrical language
and literary devices.},
Key = {fds373003}
}
@article{fds373531,
Author = {Chen, Z and Tan, S and Chajewska, U and Rudin, C and Caruana,
R},
Title = {Missing Values and Imputation in Healthcare Data: Can
Interpretable Machine Learning Help?},
Journal = {Proceedings of Machine Learning Research},
Volume = {209},
Pages = {86-99},
Year = {2023},
Month = {January},
Abstract = {Missing values are a fundamental problem in data science.
Many datasets have missing values that must be properly
handled because the way missing values are treated can have
a large impact on the resulting machine learning model. In
medical applications, the consequences may affect healthcare
decisions. There are many methods in the literature for
dealing with missing values, including state-of-the-art
methods which often depend on black-box models for
imputation. In this work, we show how recent advances in
interpretable machine learning provide a new perspective for
understanding and tackling the missing value problem. We
propose methods based on high-accuracy glass-box Explainable
Boosting Machines (EBMs) that can help users (1) gain new
insights on missingness mechanisms and better understand the
causes of missingness, and (2) detect – or even alleviate
– potential risks introduced by imputation algorithms.
Experiments on real-world medical datasets illustrate the
effectiveness of the proposed methods.},
Key = {fds373531}
}
@article{fds368049,
Author = {Garrett, BL and Rudin, C},
Title = {The Right to a Glass Box: Rethinking the Use of Artificial
Intelligence in Criminal Justice},
Journal = {Cornell Law Review},
Year = {2023},
Key = {fds368049}
}
@article{fds367642,
Author = {Rudin, C},
Title = {Why black box machine learning should be avoided for
high-stakes decisions, in brief},
Journal = {Nature Reviews Methods Primers},
Volume = {2},
Number = {1},
Year = {2022},
Month = {December},
url = {http://dx.doi.org/10.1038/s43586-022-00172-0},
Doi = {10.1038/s43586-022-00172-0},
Key = {fds367642}
}
@article{fds368343,
Author = {Chen, Z and Ogren, A and Daraio, C and Brinson, LC and Rudin,
C},
Title = {How to see hidden patterns in metamaterials with
interpretable machine learning},
Journal = {Extreme Mechanics Letters},
Volume = {57},
Year = {2022},
Month = {November},
url = {http://dx.doi.org/10.1016/j.eml.2022.101895},
Abstract = {Machine learning models can assist with metamaterials design
by approximating computationally expensive simulators or
solving inverse design problems. However, past work has
usually relied on black box deep neural networks, whose
reasoning processes are opaque and require enormous datasets
that are expensive to obtain. In this work, we develop two
novel machine learning approaches to metamaterials discovery
that have neither of these disadvantages. These approaches,
called shape-frequency features and unit-cell templates, can
discover 2D metamaterials with user-specified frequency band
gaps. Our approaches provide logical rule-based conditions
on metamaterial unit-cells that allow for interpretable
reasoning processes, and generalize well across design
spaces of different resolutions. The templates also provide
design flexibility where users can almost freely design the
   fine-resolution features of a unit-cell without affecting
   their desired band gap.},
Doi = {10.1016/j.eml.2022.101895},
Key = {fds368343}
}
@article{fds369058,
Author = {Behrouz, A and Lécuyer, M and Rudin, C and Seltzer,
M},
Title = {Fast Optimization of Weighted Sparse Decision Trees for use
in Optimal Treatment Regimes and Optimal Policy
Design.},
Journal = {CEUR workshop proceedings},
Volume = {3318},
Pages = {26},
Year = {2022},
Month = {October},
Abstract = {Sparse decision trees are one of the most common forms of
interpretable models. While recent advances have produced
algorithms that fully optimize sparse decision trees for
\emph{prediction}, that work does not address \emph{policy
   design}, because the algorithms cannot handle weighted
data samples. Specifically, they rely on the discreteness of
the loss function, which means that real-valued weights
cannot be directly used. For example, none of the existing
techniques produce policies that incorporate inverse
propensity weighting on individual data points. We present
three algorithms for efficient sparse weighted decision tree
optimization. The first approach directly optimizes the
weighted loss function; however, it tends to be
computationally inefficient for large datasets. Our second
approach, which scales more efficiently, transforms weights
to integer values and uses data duplication to transform the
weighted decision tree optimization problem into an
unweighted (but larger) counterpart. Our third algorithm,
which scales to much larger datasets, uses a randomized
procedure that samples each data point with a probability
proportional to its weight. We present theoretical bounds on
the error of the two fast methods and show experimentally
that these methods can be two orders of magnitude faster
than the direct optimization of the weighted loss, without
losing significant accuracy.},
Key = {fds369058}
}
@article{fds369818,
Author = {Parikh, H and Rudin, C and Volfovsky, A},
Title = {MALTS: Matching After Learning to Stretch},
Journal = {Journal of Machine Learning Research},
Volume = {23},
Year = {2022},
Month = {August},
Abstract = {We introduce a flexible framework that produces high-quality
almost-exact matches for causal inference. Most prior work
in matching uses ad-hoc distance metrics, often leading to
poor quality matches, particularly when there are irrelevant
covariates. In this work, we learn an interpretable distance
metric for matching, which leads to substantially higher
quality matches. The learned distance metric stretches the
covariate space according to each covariate’s contribution
to outcome prediction: this stretching means that mismatches
on important covariates carry a larger penalty than
mismatches on irrelevant covariates. Our ability to learn
flexible distance metrics leads to matches that are
interpretable and useful for the estimation of conditional
average treatment effects.},
Key = {fds369818}
}
@article{fds363364,
Author = {Afnan, M and Afnan, MAM and Liu, Y and Savulescu, J and Mishra, A and Conitzer, V and Rudin, C},
Title = {Data solidarity for machine learning for embryo selection: a
call for the creation of an open access repository of embryo
data.},
Journal = {Reproductive biomedicine online},
Volume = {45},
Number = {1},
Pages = {10-13},
Year = {2022},
Month = {July},
url = {http://dx.doi.org/10.1016/j.rbmo.2022.03.015},
Abstract = {The last decade has seen an explosion of machine learning
applications in healthcare, with mixed and sometimes harmful
results despite much promise and associated hype. A
significant reason for the reversal in the reported benefit
of these applications is the premature implementation of
machine learning algorithms in clinical practice. This paper
argues the critical need for 'data solidarity' for machine
learning for embryo selection. A recent Lancet and Financial
Times commission defined data solidarity as 'an approach to
the collection, use, and sharing of health data and data for
health that safeguards individual human rights while
building a culture of data justice and equity, and ensuring
that the value of data is harnessed for public good'
(Kickbusch et al., 2021).},
Doi = {10.1016/j.rbmo.2022.03.015},
Key = {fds363364}
}
@article{fds364197,
Author = {Huang, H and Wang, Y and Rudin, C and Browne, EP},
Title = {Towards a comprehensive evaluation of dimension reduction
methods for transcriptomic data visualization.},
Journal = {Communications biology},
Volume = {5},
Number = {1},
Pages = {719},
Year = {2022},
Month = {July},
url = {http://dx.doi.org/10.1038/s42003-022-03628-x},
Abstract = {Dimension reduction (DR) algorithms project data from high
dimensions to lower dimensions to enable visualization of
interesting high-dimensional structure. DR algorithms are
widely used for analysis of single-cell transcriptomic data.
Despite widespread use of DR algorithms such as t-SNE and
UMAP, these algorithms have characteristics that lead to
lack of trust: they do not preserve important aspects of
high-dimensional structure and are sensitive to arbitrary
user choices. Given the importance of gaining insights from
DR, DR methods should be evaluated carefully before trusting
their results. In this paper, we introduce and perform a
systematic evaluation of popular DR methods, including
t-SNE, art-SNE, UMAP, PaCMAP, TriMap and ForceAtlas2. Our
evaluation considers five components: preservation of local
structure, preservation of global structure, sensitivity to
parameter choices, sensitivity to preprocessing choices, and
computational efficiency. This evaluation can help us to
choose DR tools that align with the scientific goals of the
user.},
Doi = {10.1038/s42003-022-03628-x},
Key = {fds364197}
}
@article{fds364039,
Author = {Semenova, L and Rudin, C and Parr, R},
Title = {On the Existence of Simpler Machine Learning
Models},
Journal = {ACM International Conference Proceeding Series},
Pages = {1827-1858},
Year = {2022},
Month = {June},
ISBN = {9781450393522},
url = {http://dx.doi.org/10.1145/3531146.3533232},
Abstract = {It is almost always easier to find an accurate-but-complex
model than an accurate-yet-simple model. Finding optimal,
sparse, accurate models of various forms (linear models with
integer coefficients, decision sets, rule lists, decision
trees) is generally NP-hard. We often do not know whether
the search for a simpler model will be worthwhile, and thus
we do not go to the trouble of searching for one. In this
work, we ask an important practical question: can
accurate-yet-simple models be proven to exist, or shown
likely to exist, before explicitly searching for them? We
hypothesize that there is an important reason that
simple-yet-accurate models often do exist. This hypothesis
is that the size of the Rashomon set is often large, where
the Rashomon set is the set of almost-equally-accurate
models from a function class. If the Rashomon set is large,
it contains numerous accurate models, and perhaps at least
one of them is the simple model we desire. In this work, we
formally present the Rashomon ratio as a new gauge of
simplicity for a learning problem, depending on a function
class and a data set. The Rashomon ratio is the ratio of the
volume of the set of accurate models to the volume of the
hypothesis space, and it is different from standard
complexity measures from statistical learning theory.
Insight from studying the Rashomon ratio provides an easy
way to check whether a simpler model might exist for a
problem before finding it, namely whether several different
machine learning methods achieve similar performance on the
data. In that sense, the Rashomon ratio is a powerful tool
for understanding why and when an accurate-yet-simple model
might exist. If, as we hypothesize in this work, many
real-world data sets admit large Rashomon sets, the
implications are vast: it means that simple or interpretable
models may often be used for high-stakes decisions without
losing accuracy.},
Doi = {10.1145/3531146.3533232},
Key = {fds364039}
}
@article{fds364272,
Author = {Wang, T and Rudin, C},
Title = {Causal Rule Sets for Identifying Subgroups with Enhanced
Treatment Effects},
Journal = {INFORMS Journal on Computing},
Volume = {34},
Number = {3},
Pages = {1626-1643},
Year = {2022},
Month = {May},
url = {http://dx.doi.org/10.1287/ijoc.2021.1143},
Abstract = {A key question in causal inference analyses is how to find
subgroups with elevated treatment effects. This paper takes
a machine learning approach and introduces a generative
model, causal rule sets (CRS), for interpretable subgroup
discovery. A CRS model uses a small set of short decision
rules to capture a subgroup in which the average treatment
effect is elevated. We present a Bayesian framework for
learning a causal rule set. The Bayesian model consists of a
prior that favors simple models for better interpretability
as well as avoiding overfitting and a Bayesian logistic
regression that captures the likelihood of data,
characterizing the relation between outcomes, attributes,
and subgroup membership. The Bayesian model has tunable
parameters that can characterize subgroups with various
sizes, providing users with more flexible choices of models
from the treatment-efficient frontier. We find maximum a
   posteriori models using iterative discrete Monte Carlo
   steps in the joint solution space of rule sets and
   parameters. To improve search efficiency, we provide
   theoretically grounded heuristics and bounding strategies
   to prune and confine the search space. Experiments show
   that the search algorithm can efficiently recover true
   underlying subgroups. We apply CRS on public and real-world
   data sets from domains in which interpretability is
   indispensable. We compare CRS with state-of-the-art
   rule-based subgroup discovery models. Results show that CRS
   achieves consistently competitive performance on data sets
   from various domains, represented by high
   treatment-efficient frontiers. Summary of Contribution:
   This paper is motivated by the large heterogeneity of
   treatment effect in many applications and the need to
   accurately locate subgroups for enhanced treatment effect.
   Existing methods either rely on prior hypotheses to
   discover subgroups or on greedy methods, such as tree-based
   recursive partitioning. Our method adopts a machine
   learning approach to find an optimal subgroup learned with
   a carefully designed global objective. Our model is more
   flexible in capturing subgroups by using a set of short
   decision rules compared with tree-based baselines. We
   evaluate our model using a novel metric, the
   treatment-efficient frontier, which characterizes the
   trade-off between the subgroup size and achievable
   treatment effect, and our model demonstrates better
   performance than baseline models.},
Doi = {10.1287/ijoc.2021.1143},
Key = {fds364272}
}
@article{fds371609,
Author = {Liu, J and Zhong, C and Seltzer, M and Rudin, C},
Title = {Fast Sparse Classification for Generalized Linear and
Additive Models.},
Journal = {Proceedings of machine learning research},
Volume = {151},
Pages = {9304-9333},
Year = {2022},
Month = {March},
Abstract = {We present fast classification techniques for sparse
generalized linear and additive models. These techniques can
handle thousands of features and thousands of observations
in minutes, even in the presence of many highly correlated
features. For fast sparse logistic regression, our
computational speed-up over other best-subset search
techniques owes to linear and quadratic surrogate cuts for
the logistic loss that allow us to efficiently screen
features for elimination, as well as use of a priority queue
that favors a more uniform exploration of features. As an
alternative to the logistic loss, we propose the exponential
loss, which permits an analytical solution to the line
search at each iteration. Our algorithms are generally 2 to
5 times faster than previous approaches. They produce
interpretable models that have accuracy comparable to black
box models on challenging datasets.},
Key = {fds371609}
}
@article{fds358741,
Author = {Chen, C and Lin, K and Rudin, C and Shaposhnik, Y and Wang, S and Wang,
T},
Title = {A holistic approach to interpretability in financial
lending: Models, visualizations, and summary-explanations},
Journal = {Decision Support Systems},
Volume = {152},
Year = {2022},
Month = {January},
url = {http://dx.doi.org/10.1016/j.dss.2021.113647},
Abstract = {Lending decisions are usually made with proprietary models
that provide minimally acceptable explanations to users. In
a future world without such secrecy, what decision support
tools would one want to use for justified lending decisions?
This question is timely, since the economy has dramatically
shifted due to a pandemic, and a massive number of new loans
will be necessary in the short term. We propose a framework
for such decisions, including a globally interpretable
machine learning model, an interactive visualization of it,
and several types of summaries and explanations for any
given decision. The machine learning model is a two-layer
additive risk model, which resembles a two-layer neural
network, but is decomposable into subscales. In this model,
each node in the first (hidden) layer represents a
meaningful subscale model, and all of the nonlinearities are
transparent. Our online visualization tool allows
exploration of this model, showing precisely how it came to
its conclusion. We provide three types of explanations that
are simpler than, but consistent with, the global model:
case-based reasoning explanations that use neighboring past
cases, a set of features that were the most important for
the model's prediction, and summary-explanations that
provide a customized sparse explanation for any particular
lending decision made by the model. Our framework earned the
FICO recognition award for the Explainable Machine Learning
Challenge, which was the first public challenge in the
domain of explainable machine learning.},
Doi = {10.1016/j.dss.2021.113647},
Key = {fds358741}
}
@article{fds362508,
Author = {Rudin, C and Chen, C and Chen, Z and Huang, H and Semenova, L and Zhong,
C},
Title = {Interpretable machine learning: Fundamental principles and
10 grand challenges},
Journal = {Statistics Surveys},
Volume = {16},
Pages = {1-85},
Year = {2022},
Month = {January},
url = {http://dx.doi.org/10.1214/21-SS133},
Abstract = {Interpretability in machine learning (ML) is crucial for
high stakes decisions and troubleshooting. In this work, we
provide fundamental principles for interpretable ML, and
dispel common misunderstandings that dilute the importance
of this crucial topic. We also identify 10 technical
challenge areas in interpretable machine learning and
provide history and background on each problem. Some of
these problems are classically important, and some are
recent problems that have arisen in the last few years.
These problems are: (1) Optimizing sparse logical models
such as decision trees; (2) Optimization of scoring systems;
(3) Placing constraints into generalized additive models to
encourage sparsity and better interpretability; (4) Modern
case-based reasoning, including neural networks and matching
for causal inference; (5) Complete supervised
disentanglement of neural networks; (6) Complete or even
partial unsupervised disentanglement of neural networks; (7)
Dimensionality reduction for data visualization; (8) Machine
learning models that can incorporate physics and other
generative or causal constraints; (9) Characterization of
the “Rashomon set” of good models; and (10)
Interpretable reinforcement learning. This survey is
suitable as a starting point for statisticians and computer
scientists interested in working in interpretable machine
learning.},
Doi = {10.1214/21-SS133},
Key = {fds362508}
}
@article{fds363411,
Author = {Li, C and Rudin, C and McCormick, TH},
Title = {Rethinking Nonlinear Instrumental Variable Models through
Prediction Validity},
Journal = {Journal of Machine Learning Research},
Volume = {23},
Year = {2022},
Month = {January},
Abstract = {Instrumental variables (IV) are widely used in the social
and health sciences in situations where a researcher would
like to measure a causal effect but cannot perform an
experiment. For valid causal inference in an IV model, there
must be external (exogenous) variation that (i) has a
sufficiently large impact on the variable of interest (called
   the relevance assumption) and where (ii) the only pathway
through which the external variation impacts the outcome is
via the variable of interest (called the exclusion
restriction). For statistical inference, researchers must
also make assumptions about the functional form of the
relationship between the three variables. Current practice
assumes (i) and (ii) are met, then postulates a functional
form with limited input from the data. In this paper, we
describe a framework that leverages machine learning to
validate these typically unchecked but consequential
assumptions in the IV framework, providing the researcher
empirical evidence about the quality of the instrument given
the data at hand. Central to the proposed approach is the
idea of prediction validity. Prediction validity checks that
   error terms, which should be independent of the instrument,
   cannot be modeled with machine learning any better than a
   model that is identically zero. We use
prediction validity to develop both one-stage and two-stage
approaches for IV, and demonstrate their performance on an
example relevant to climate change policy.},
Key = {fds363411}
}
@article{fds369682,
Author = {McTavish, H and Zhong, C and Achermann, R and Karimalis, I and Chen, J and Rudin, C and Seltzer, M},
Title = {Fast Sparse Decision Tree Optimization via Reference
Ensembles.},
Journal = {Proceedings of the ... AAAI Conference on Artificial
Intelligence. AAAI Conference on Artificial
Intelligence},
Volume = {36},
Number = {9},
Pages = {9604-9613},
Year = {2022},
Month = {January},
url = {http://dx.doi.org/10.1609/aaai.v36i9.21194},
Abstract = {Sparse decision tree optimization has been one of the most
fundamental problems in AI since its inception and is a
challenge at the core of interpretable machine learning.
Sparse decision tree optimization is computationally hard,
and despite steady effort since the 1960s, breakthroughs
have been made on the problem only within the past few
years, primarily on the problem of finding optimal sparse
decision trees. However, current state-of-the-art algorithms
often require impractical amounts of computation time and
memory to find optimal or near-optimal trees for some
real-world datasets, particularly those having several
continuous-valued features. Given that the search spaces of
these decision tree optimization problems are massive, can
we practically hope to find a sparse decision tree that
competes in accuracy with a black box machine learning
model? We address this problem via smart guessing strategies
that can be applied to any optimal branch-and-bound-based
decision tree algorithm. The guesses come from knowledge
gleaned from black box models. We show that by using these
guesses, we can reduce the run time by multiple orders of
magnitude while providing bounds on how far the resulting
trees can deviate from the black box's accuracy and
expressive power. Our approach enables guesses about how to
bin continuous features, the size of the tree, and lower
bounds on the error for the optimal decision tree. Our
experiments show that in many cases we can rapidly construct
sparse decision trees that match the accuracy of black box
models. To summarize: \emph{when you are having trouble
   optimizing, just guess}.},
Doi = {10.1609/aaai.v36i9.21194},
Key = {fds369682}
}
@article{fds363833,
Author = {Barnett, AJ and Sharma, V and Gajjar, N and Fang, J and Schwartz, FR and Chen, C and Lo, JY and Rudin, C},
Title = {Interpretable Deep Learning Models for Better Clinician-AI
Communication in Clinical Mammography},
Journal = {Progress in Biomedical Optics and Imaging - Proceedings of
SPIE},
Volume = {12035},
Year = {2022},
Month = {January},
ISBN = {9781510649453},
url = {http://dx.doi.org/10.1117/12.2612372},
Abstract = {There is increasing interest in using deep learning and
computer vision to help guide clinical decisions, such as
whether to order a biopsy based on a mammogram. Existing
networks are typically black box, unable to explain how they
make their predictions. We present an interpretable
deep-learning network which explains its predictions in
terms of BI-RADS features mass shape and mass margin. Our
model predicts mass margin and mass shape, then uses the
logits from those interpretable models to predict
malignancy, also using an interpretable model. The
interpretable mass margin model explains its predictions
using a prototypical parts model. The interpretable mass
shape model predicts segmentations, fits an ellipse, then
determines shape based on the goodness of fit and
eccentricity of the fitted ellipse. While including mass
shape logits in the malignancy prediction model did not
improve performance, we present this technique as part of a
   framework for better clinician-AI communication.},
Doi = {10.1117/12.2612372},
Key = {fds363833}
}
@article{fds368882,
Author = {Wang, ZJ and Zhong, C and Xin, R and Takagi, T and Chen, Z and Chau, DH and Rudin, C and Seltzer, M},
Title = {TimberTrek: Exploring and Curating Sparse Decision Trees
with Interactive Visualization},
Journal = {Proceedings - 2022 IEEE Visualization Conference - Short
Papers, VIS 2022},
Pages = {60-64},
Year = {2022},
Month = {January},
ISBN = {9781665488129},
url = {http://dx.doi.org/10.1109/VIS54862.2022.00021},
Abstract = {Given thousands of equally accurate machine learning (ML)
models, how can users choose among them? A recent ML
technique enables domain experts and data scientists to
generate a complete Rashomon set for sparse decision trees:
   a huge set of almost-optimal interpretable ML models. To
   help ML practitioners identify models with desirable
   properties from this Rashomon set, we develop TimberTrek,
   the first interactive visualization system that summarizes
   thousands of sparse decision trees at scale. Two usage
   scenarios highlight how TimberTrek can empower users to
   easily explore, compare, and curate models that align with
   their domain knowledge and values. Our open-source tool
   runs directly in users' computational notebooks and web
   browsers, lowering the barrier to creating more responsible
   ML models. TimberTrek is available at the following public
   demo link: https://poloclub.github.io/timbertrek.},
Doi = {10.1109/VIS54862.2022.00021},
Key = {fds368882}
}
@article{fds371610,
Author = {Liu, J and Zhong, C and Li, B and Seltzer, M and Rudin,
C},
Title = {FasterRisk: Fast and Accurate Interpretable Risk
Scores},
Journal = {Advances in Neural Information Processing
Systems},
Volume = {35},
Year = {2022},
Month = {January},
ISBN = {9781713871088},
Abstract = {Over the last century, risk scores have been the most
popular form of predictive model used in healthcare and
criminal justice. Risk scores are sparse linear models with
integer coefficients; often these models can be memorized or
placed on an index card. Typically, risk scores have been
created either without data or by rounding logistic
regression coefficients, but these methods do not reliably
produce high-quality risk scores. Recent work used
mathematical programming, which is computationally slow. We
introduce an approach for efficiently producing a collection
of high-quality risk scores learned from data. Specifically,
our approach produces a pool of almost-optimal sparse
continuous solutions, each with a different support set,
using a beam-search algorithm. Each of these continuous
solutions is transformed into a separate risk score through
a “star ray” search, where a range of multipliers are
considered before rounding the coefficients sequentially to
maintain low logistic loss. Our algorithm returns all of
these high-quality risk scores for the user to consider.
This method completes within minutes and can be valuable in
a broad variety of applications.},
Key = {fds371610}
}
@article{fds371611,
Author = {Lobo, E and Singh, H and Petrik, M and Rudin, C and Lakkaraju,
H},
Title = {Data Poisoning Attacks on Off-Policy Policy Evaluation
Methods},
Journal = {Proceedings of the 38th Conference on Uncertainty in
Artificial Intelligence, UAI 2022},
Pages = {1264-1274},
Year = {2022},
Month = {January},
ISBN = {9781713863298},
Abstract = {Off-policy Evaluation (OPE) methods are a crucial tool for
evaluating policies in high-stakes domains such as
healthcare, where exploration is often infeasible,
unethical, or expensive. However, the extent to which such
methods can be trusted under adversarial threats to data
quality is largely unexplored. In this work, we make the
first attempt at investigating the sensitivity of OPE
methods to marginal adversarial perturbations to the data.
We design a generic data poisoning attack framework
leveraging influence functions from robust statistics to
carefully construct perturbations that maximize error in the
policy value estimates. We carry out extensive
experimentation with multiple healthcare and control
datasets. Our results demonstrate that many existing OPE
methods are highly prone to generating value estimates with
large errors when subject to data poisoning attacks, even
for small adversarial perturbations. These findings question
the reliability of policy values derived using OPE methods
and motivate the need for developing OPE methods that are
statistically robust to train-time data poisoning
attacks.},
Key = {fds371611}
}
@article{fds371612,
Author = {Xin, R and Zhong, C and Chen, Z and Takagi, T and Seltzer, M and Rudin,
C},
Title = {Exploring the Whole Rashomon Set of Sparse Decision
Trees.},
Journal = {Advances in neural information processing
systems},
Volume = {35},
Pages = {14071-14084},
Year = {2022},
Month = {January},
ISBN = {9781713871088},
Abstract = {In any given machine learning problem, there might be many
models that explain the data almost equally well. However,
most learning algorithms return only one of these models,
leaving practitioners with no practical way to explore
alternative models that might have desirable properties
beyond what could be expressed by a loss function. The
\emph{Rashomon set} is the set of all these almost-optimal
models. Rashomon sets can be large in size and complicated
in structure, particularly for highly nonlinear function
classes that allow complex interaction terms, such as
decision trees. We provide the first technique for
completely enumerating the Rashomon set for sparse decision
trees; in fact, our work provides the first complete
enumeration of any Rashomon set for a non-trivial problem
with a highly nonlinear discrete function class. This allows
the user an unprecedented level of control over model choice
among all models that are approximately equally good. We
represent the Rashomon set in a specialized data structure
that supports efficient querying and sampling. We show three
applications of the Rashomon set: 1) it can be used to study
variable importance for the set of almost-optimal trees (as
opposed to a single tree), 2) the Rashomon set for accuracy
enables enumeration of the Rashomon sets for balanced
accuracy and F1-score, and 3) the Rashomon set for a full
dataset can be used to produce Rashomon sets constructed
with only subsets of the data set. Thus, we are able to
examine Rashomon sets across problems with a new lens,
enabling users to choose models rather than be at the mercy
of an algorithm that produces only a single
model.},
Key = {fds371612}
}
@article{fds359953,
Author = {Guo, Z and Ding, C and Hu, X and Rudin, C},
Title = {A supervised machine learning semantic segmentation approach
for detecting artifacts in plethysmography signals from
wearables.},
Journal = {Physiological measurement},
Volume = {42},
Number = {12},
Year = {2021},
Month = {December},
url = {http://dx.doi.org/10.1088/1361-6579/ac3b3d},
Abstract = {OBJECTIVE: Wearable devices equipped with plethysmography
   (PPG) sensors provide a low-cost, long-term solution to
   early diagnosis and continuous screening of heart
   conditions. However, PPG signals collected from such
   devices often suffer from corruption caused by artifacts.
   The objective of this study is to develop an effective
   supervised algorithm to locate the regions of artifacts
   within PPG signals. APPROACH: We treat artifact detection
   as a 1D segmentation problem. We solve it via a novel
   combination of an active-contour-based loss and an adapted
   U-Net architecture. The proposed algorithm was trained on
   the PPG DaLiA training set, and further evaluated on the
   PPG DaLiA testing set, the WESAD dataset, and the TROIKA
   dataset. MAIN RESULTS: We evaluated with the DICE score, a
   well-established metric for segmentation accuracy
   evaluation in the field of computer vision. The proposed
   method outperforms baseline methods on all three datasets
   by a large margin (≈7 percentage points above the next
   best method). On the PPG DaLiA testing set, WESAD dataset,
   and TROIKA dataset, the proposed method achieved 0.8734 ±
   0.0018, 0.9114 ± 0.0033, and 0.8050 ± 0.0116,
   respectively. The next best method achieved only 0.8068 ±
   0.0014, 0.8446 ± 0.0013, and 0.7247 ±
   0.0050. SIGNIFICANCE: The proposed method is able to
   pinpoint exact locations of artifacts with high precision;
   in the past, we had only a binary classification of whether
   a PPG signal has good or poor quality. This more nuanced
   information will be critical to further inform the design
   of algorithms to detect cardiac arrhythmia.},
Doi = {10.1088/1361-6579/ac3b3d},
Key = {fds359953}
}
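%% The DICE score used for evaluation above has a one-line
%% definition; a minimal numpy sketch for binary 1D artifact
%% masks (hypothetical helper, not the authors' code):

import numpy as np

def dice_score(pred, truth):
    # pred, truth: binary 1D arrays marking artifact regions in a PPG signal.
    pred = np.asarray(pred, dtype=bool)
    truth = np.asarray(truth, dtype=bool)
    overlap = np.logical_and(pred, truth).sum()
    denom = pred.sum() + truth.sum()
    # DICE = 2|A ∩ B| / (|A| + |B|); treat empty-vs-empty as perfect agreement.
    return 1.0 if denom == 0 else 2.0 * overlap / denom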
@article{fds361182,
Author = {Barnett, AJ and Schwartz, FR and Tao, C and Chen, C and Ren, Y and Lo, JY and Rudin, C},
Title = {A case-based interpretable deep learning model for
classification of mass lesions in digital
mammography},
Journal = {Nature Machine Intelligence},
Volume = {3},
Number = {12},
Pages = {1061-1070},
Year = {2021},
Month = {December},
url = {http://dx.doi.org/10.1038/s42256-021-00423-x},
Abstract = {Interpretability in machine learning models is important in
high-stakes decisions such as whether to order a biopsy
based on a mammographic exam. Mammography poses important
challenges that are not present in other computer vision
tasks: datasets are small, confounding information is
present and it can be difficult even for a radiologist to
decide between watchful waiting and biopsy based on a
mammogram alone. In this work we present a framework for
interpretable machine learning-based mammography. In
addition to predicting whether a lesion is malignant or
benign, our work aims to follow the reasoning processes of
radiologists in detecting clinically relevant semantic
features of each image, such as the characteristics of the
mass margins. The framework includes a novel interpretable
neural network algorithm that uses case-based reasoning for
mammography. Our algorithm can incorporate a combination of
data with whole image labelling and data with pixel-wise
annotations, leading to better accuracy and interpretability
even with a small number of images. Our interpretable models
are able to highlight the classification-relevant parts of
the image, whereas other methods highlight healthy tissue
and confounding information. Our models are decision
aids—rather than decision makers—and aim for better
overall human–machine collaboration. We do not observe a
loss in mass margin classification accuracy over a black box
neural network trained on the same data.},
Doi = {10.1038/s42256-021-00423-x},
Key = {fds361182}
}
@article{fds359613,
Author = {Coker, B and Rudin, C and King, G},
Title = {A Theory of Statistical Inference for Ensuring the
Robustness of Scientific Results},
Journal = {Management Science},
Volume = {67},
Number = {10},
Pages = {6174-6197},
Year = {2021},
Month = {October},
url = {http://dx.doi.org/10.1287/mnsc.2020.3818},
Abstract = {Inference is the process of using facts we know to learn
about facts we do not know. A theory of inference gives
assumptions necessary to get from the former to the latter,
along with a definition for and summary of the resulting
uncertainty. Any one theory of inference is neither right
nor wrong but merely an axiom that may or may not be useful.
Each of the many diverse theories of inference can be
valuable for certain applications. However, no existing
theory of inference addresses the tendency to choose, from
the range of plausible data analysis specifications
consistent with prior evidence, those that inadvertently
favor one’s own hypotheses. Because the biases from these
choices are a growing concern across scientific fields, and
in a sense the reason the scientific community was invented
in the first place, we introduce a new theory of inference
designed to address this critical problem. We introduce
hacking intervals, which are the range of a summary
statistic one may obtain given a class of possible
endogenous manipulations of the data. Hacking intervals
require no appeal to hypothetical data sets drawn from
imaginary superpopulations. A scientific result with a small
hacking interval is more robust to researcher manipulation
than one with a larger interval and is often easier to
interpret than a classical confidence interval. Some
versions of hacking intervals turn out to be equivalent to
classical confidence intervals, which means they may also
provide a more intuitive and potentially more useful
interpretation of classical confidence intervals.},
Doi = {10.1287/mnsc.2020.3818},
Key = {fds359613}
}
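%% A hacking interval is the range of a summary statistic over
%% a class of allowed manipulations of the data. A toy sketch
%% under that definition (names hypothetical; the paper's
%% constrained and tethered variants are more involved):

import numpy as np

def hacking_interval(statistic, data, manipulations):
    # statistic: maps a dataset to a scalar summary (e.g., a coefficient).
    # manipulations: candidate analysis choices, each returning modified data.
    results = [statistic(m(data)) for m in manipulations]
    return min(results), max(results)

# Example: how far can dropping any single observation move a sample mean?
data = np.array([2.0, 2.1, 1.9, 8.0])
drop_one = [lambda d, i=i: np.delete(d, i) for i in range(len(data))]
lo, hi = hacking_interval(np.mean, data, drop_one)  # a wide interval signals fragility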
@article{fds358740,
Author = {Afnan, MAM and Rudin, C and Conitzer, V and Savulescu, J and Mishra, A and Liu, Y and Afnan, M},
Title = {Ethical Implementation of Artificial Intelligence to Select
Embryos in in Vitro Fertilization},
Journal = {AIES 2021 - Proceedings of the 2021 AAAI/ACM Conference on
AI, Ethics, and Society},
Pages = {316-326},
Year = {2021},
Month = {July},
ISBN = {9781450384735},
url = {http://dx.doi.org/10.1145/3461702.3462589},
Abstract = {AI has the potential to revolutionize many areas of
healthcare. Radiology, dermatology, and ophthalmology are
some of the areas most likely to be impacted in the near
future, and they have received significant attention from
the broader research community. But AI techniques are now
also starting to be used in in vitro fertilization (IVF), in
particular for selecting which embryos to transfer to the
woman. The contribution of AI to IVF is potentially
significant, but must be done carefully and transparently,
as the ethical issues are significant, in part because this
field involves creating new people. We first give a brief
introduction to IVF and review the use of AI for embryo
selection. We discuss concerns with the interpretation of
the reported results from scientific and practical
perspectives. We then consider the broader ethical issues
involved. We discuss in detail the problems that result from
the use of black-box methods in this context and advocate
strongly for the use of interpretable models. Importantly,
there have been no published trials of clinical
effectiveness, a problem in both the AI and IVF communities,
and we therefore argue that clinical implementation at this
point would be premature. Finally, we discuss ways for the
broader AI community to become involved to ensure
scientifically sound and ethically responsible development
of AI in IVF.},
Doi = {10.1145/3461702.3462589},
Key = {fds358740}
}
@article{fds359994,
Author = {Wang, J and Zhang, X and Zhou, Y and Suh, C and Rudin,
C},
Title = {There once was a really bad poet, it was automated but you
didn’t know it},
Journal = {Transactions of the Association for Computational
Linguistics},
Volume = {9},
Pages = {605-620},
Year = {2021},
Month = {July},
url = {http://dx.doi.org/10.1162/tacl_a_00387},
Abstract = {Limerick generation exemplifies some of the most difficult
challenges faced in poetry generation, as the poems must
tell a story in only five lines, with constraints on rhyme,
stress, and meter. To address these challenges, we introduce
LimGen, a novel and fully automated system for limerick
generation that outperforms state-of-the-art neural
network-based poetry models, as well as prior rule-based
poetry models. LimGen consists of three important pieces:
the Adaptive Multi-Templated Constraint algorithm that
constrains our search to the space of realistic poems, the
Multi-Templated Beam Search algorithm which searches
efficiently through the space, and the probabilistic
Storyline algorithm that provides coherent storylines
related to a user-provided prompt word. The resulting
limericks satisfy poetic constraints and have thematically
coherent storylines, which are sometimes even funny (when we
are lucky).},
Doi = {10.1162/tacl_a_00387},
Key = {fds359994}
}
@article{fds361664,
Author = {Barnett, AJ and Schwartz, FR and Tao, C and Chen, C and Ren, Y and Lo, JY and Rudin, C},
Title = {IAIA-BL: A Case-based Interpretable Deep Learning Model for
Classification of Mass Lesions in Digital
Mammography},
Year = {2021},
Month = {March},
Abstract = {Interpretability in machine learning models is important in
high-stakes decisions, such as whether to order a biopsy
based on a mammographic exam. Mammography poses important
challenges that are not present in other computer vision
tasks: datasets are small, confounding information is
present, and it can be difficult even for a radiologist to
decide between watchful waiting and biopsy based on a
mammogram alone. In this work, we present a framework for
interpretable machine learning-based mammography. In
addition to predicting whether a lesion is malignant or
benign, our work aims to follow the reasoning processes of
radiologists in detecting clinically relevant semantic
features of each image, such as the characteristics of the
mass margins. The framework includes a novel interpretable
neural network algorithm that uses case-based reasoning for
mammography. Our algorithm can incorporate a combination of
data with whole image labelling and data with pixel-wise
annotations, leading to better accuracy and interpretability
even with a small number of images. Our interpretable models
are able to highlight the classification-relevant parts of
the image, whereas other methods highlight healthy tissue
and confounding information. Our models are decision aids,
rather than decision makers, aimed at better overall
human-machine collaboration. We do not observe a loss in
mass margin classification accuracy over a black box neural
network trained on the same data.},
Key = {fds361664}
}
@article{fds359954,
Author = {Gupta, NR and Orlandi, V and Chang, C-R and Wang, T and Morucci, M and Dey,
P and Howell, TJ and Sun, X and Ghosal, A and Roy, S and Rudin, C and Volfovsky, A},
Title = {dame-flame: A Python Library Providing Fast Interpretable
Matching for Causal Inference},
Volume = {abs/2101.01867},
Year = {2021},
Month = {January},
Abstract = {dame-flame is a Python package for performing matching for
observational causal inference on datasets containing
discrete covariates. This package implements the Dynamic
Almost Matching Exactly (DAME) and Fast Large-Scale Almost
Matching Exactly (FLAME) algorithms, which match treatment
and control units on subsets of the covariates. The
resulting matched groups are interpretable, because the
matches are made on covariates, and high-quality, because
machine learning is used to determine which covariates are
important to match on. DAME solves an optimization problem
that matches units on as many covariates as possible,
prioritizing matches on important covariates. FLAME
approximates the solution found by DAME via a much faster
backward feature selection procedure. The package provides
several adjustable parameters to adapt the algorithms to
specific applications, and can calculate treatment effect
estimates after matching. Descriptions of these parameters,
details on estimating treatment effects, and further
examples, can be found in the documentation at
https://almost-matching-exactly.github.io/DAME-FLAME-Python-Package/},
Key = {fds359954}
}
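%% A usage sketch following the package documentation linked
%% above; treat the exact signatures and default column names
%% as assumptions to verify against the docs for your version.

import pandas as pd
import dame_flame

# df: discrete covariates plus binary treatment and outcome columns.
df = pd.read_csv("matching_data.csv")   # hypothetical input file
model = dame_flame.matching.FLAME()     # or dame_flame.matching.DAME()
model.fit(df)                           # learns which covariates matter to match on
result = model.predict(df)              # matched groups on subsets of the covariates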
@article{fds359568,
Author = {Wang, Y and Huang, H and Rudin, C and Shaposhnik,
Y},
Title = {Understanding how dimension reduction tools work: An
empirical approach to deciphering t-SNE, UMAP, TriMap, and
PaCMAP for data visualization},
Journal = {Journal of Machine Learning Research},
Volume = {22},
Year = {2021},
Month = {January},
Abstract = {Dimension reduction (DR) techniques such as t-SNE, UMAP, and
TriMap have demonstrated impressive visualization
performance on many real-world datasets. One tension that
has always faced these methods is the trade-off between
preservation of global structure and preservation of local
structure: these methods can either handle one or the other,
but not both. In this work, our main goal is to understand
what aspects of DR methods are important for preserving both
local and global structure: it is difficult to design a
better method without a true understanding of the choices we
make in our algorithms and their empirical impact on the
low-dimensional embeddings they produce. Towards the goal of
local structure preservation, we provide several useful
design principles for DR loss functions based on our new
understanding of the mechanisms behind successful DR
methods. Towards the goal of global structure preservation,
our analysis illuminates that the choice of which components
to preserve is important. We leverage these insights to
design a new algorithm for DR, called Pairwise Controlled
Manifold Approximation Projection (PaCMAP), which preserves
both local and global structure. Our work provides several
unexpected insights into what design choices both to make
and avoid when constructing DR algorithms.},
Key = {fds359568}
}
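%% PaCMAP is available as a Python package; a minimal usage
%% sketch (parameter values illustrative; see the package
%% documentation for defaults):

import numpy as np
import pacmap

X = np.random.rand(1000, 50)              # any high-dimensional data
embedder = pacmap.PaCMAP(n_components=2)  # preserves local and global structure
X_2d = embedder.fit_transform(X)          # (1000, 2) embedding for plotting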
@article{fds346814,
Author = {Wang, T and Morucci, M and Awan, MU and Liu, Y and Roy, S and Rudin, C and Volfovsky, A},
Title = {FLAME: A fast large-scale almost matching exactly approach
to causal inference},
Journal = {Journal of Machine Learning Research},
Volume = {22},
Pages = {31:1-31:41},
Year = {2021},
Month = {January},
Abstract = {A classical problem in causal inference is that of matching,
where treatment units need to be matched to control units
based on covariate information. In this work, we propose a
method that computes high quality almost-exact matches for
high-dimensional categorical datasets. This method, called
FLAME (Fast Large-scale Almost Matching Exactly), learns a
distance metric for matching using a hold-out training data
set. In order to perform matching efficiently for large
datasets, FLAME leverages techniques that are natural for
query processing in the area of database management, and two
implementations of FLAME are provided: The first uses SQL
queries and the second uses bit-vector techniques. The
algorithm starts by constructing matches of the highest
quality (exact matches on all covariates), and successively
eliminates variables in order to match exactly on as many
variables as possible, while still maintaining interpretable
high-quality matches and balance between treatment and
control groups. We leverage these high quality matches to
estimate conditional average treatment effects (CATEs). Our
experiments show that FLAME scales to huge datasets with
millions of observations where existing state-of-the-art
methods fail, and that it achieves significantly better
performance than other matching methods.},
Key = {fds346814}
}
@article{fds356437,
Author = {Tracà, S and Rudin, C and Yan, W},
Title = {Regulating greed over time in multi-armed
bandits},
Journal = {Journal of Machine Learning Research},
Volume = {22},
Year = {2021},
Month = {January},
Abstract = {In retail, there are predictable yet dramatic time-dependent
patterns in customer behavior, such as periodic changes in
the number of visitors, or increases in customers just
before major holidays. The current paradigm of multi-armed
bandit analysis does not take these known patterns into
account. This means that for applications in retail, where
prices are fixed for periods of time, current bandit
algorithms will not suffice. This work provides a remedy
that takes the time-dependent patterns into account, and we
show how this remedy is implemented for the UCB,
ε-greedy, and UCB-L algorithms, and also through a new
policy called the variable arm pool algorithm. In the
corrected methods, exploitation (greed) is regulated over
time, so that more exploitation occurs during higher reward
periods, and more exploration occurs in periods of low
reward. In order to understand why regret is reduced with
the corrected methods, we present a set of bounds that
provide insight into why we would want to exploit during
periods of high reward, and discuss the impact on regret.
Our proposed methods perform well in experiments, and were
inspired by a high-scoring entry in the Exploration and
Exploitation 3 contest using data from Yahoo! Front Page.
That entry heavily used time-series methods to regulate
greed over time, which was substantially more effective than
other contextual bandit methods.},
Key = {fds356437}
}
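%% The core idea above, regulating greed by a known
%% time-dependent reward pattern, in a toy ε-greedy variant.
%% This illustrates the idea only; it is not the paper's
%% corrected UCB, ε-greedy, or variable arm pool algorithms.

import numpy as np

def regulated_epsilon_greedy(true_means, horizon, base_eps, reward_level, seed=0):
    # reward_level(t) in [0, 1] encodes the known reward pattern over time;
    # exploitation rises in high-reward periods, exploration in low ones.
    rng = np.random.default_rng(seed)
    k = len(true_means)
    counts, estimates = np.zeros(k), np.zeros(k)
    total_reward = 0.0
    for t in range(horizon):
        eps = base_eps * (1.0 - reward_level(t))   # regulate greed over time
        arm = rng.integers(k) if rng.random() < eps else int(np.argmax(estimates))
        reward = true_means[arm] * (0.5 + reward_level(t)) + rng.normal(scale=0.1)
        counts[arm] += 1
        estimates[arm] += (reward - estimates[arm]) / counts[arm]
        total_reward += reward
    return total_reward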
@article{fds357549,
Author = {Koyyalagunta, D and Sun, A and Draelos, RL and Rudin,
C},
Title = {Playing codenames with language graphs and word
embeddings},
Journal = {Journal of Artificial Intelligence Research},
Volume = {71},
Pages = {319-346},
Year = {2021},
Month = {January},
url = {http://dx.doi.org/10.1613/jair.1.12665},
Abstract = {Although board games and video games have been studied for
decades in artificial intelligence research, challenging
word games remain relatively unexplored. Word games are not
as constrained as games like chess or poker. Instead, word
game strategy is defined by the players' understanding of
the way words relate to each other. The word game Codenames
provides a unique opportunity to investigate common sense
understanding of relationships between words, an important
open challenge. We propose an algorithm that can generate
Codenames clues from the language graph BabelNet or from any
of several embedding methods - word2vec, GloVe, fastText or
BERT. We introduce a new scoring function that measures the
quality of clues, and we propose a weighting term called
DETECT that incorporates dictionary-based word
representations and document frequency to improve clue
selection. We develop BabelNet-Word Selection Framework
(BabelNet-WSF) to improve BabelNet clue quality and overcome
the computational barriers that previously prevented
leveraging language graphs for Codenames. Extensive
experiments with human evaluators demonstrate that our
proposed innovations yield state-of-the-art performance,
with up to 102.8% improvement in precision@2 in some cases.
Overall, this work advances the formal study of word games
and approaches for common sense language
understanding.},
Doi = {10.1613/jair.1.12665},
Key = {fds357549}
}
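%% The embedding-based clue scoring can be sketched with cosine
%% similarity; a toy score (the paper's scoring function and
%% DETECT weighting are more elaborate, and the vectors here
%% are placeholders for word2vec/GloVe/fastText/BERT vectors):

import numpy as np

def cosine(u, v):
    return float(u @ v / (np.linalg.norm(u) * np.linalg.norm(v)))

def clue_score(clue_vec, team_vecs, opponent_vecs):
    # Reward closeness to the team's words; penalize the riskiest opponent word.
    team = np.mean([cosine(clue_vec, w) for w in team_vecs])
    risk = max(cosine(clue_vec, w) for w in opponent_vecs)
    return team - risk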
@article{fds373532,
Author = {Afnan, MAM and Liu, Y and Conitzer, V and Rudin, C and Mishra, A and Savulescu, J and Afnan, M},
Title = {Interpretable, not black-box, artificial intelligence should
be used for embryo selection.},
Journal = {Human reproduction open},
Volume = {2021},
Number = {4},
Pages = {hoab040},
Year = {2021},
Month = {January},
url = {http://dx.doi.org/10.1093/hropen/hoab040},
Abstract = {Artificial intelligence (AI) techniques are starting to be
used in IVF, in particular for selecting which embryos to
transfer to the woman. AI has the potential to process
complex data sets, to be better at identifying subtle but
important patterns, and to be more objective than humans
when evaluating embryos. However, a current review of the
literature shows much work is still needed before AI can be
ethically implemented for this purpose. No randomized
controlled trials (RCTs) have been published, and the
efficacy studies which exist demonstrate that algorithms can
broadly differentiate well between 'good-' and 'poor-'
quality embryos but not necessarily between embryos of
similar quality, which is the actual clinical need. Almost
universally, the AI models were opaque ('black-box') in that
at least some part of the process was uninterpretable. This
gives rise to a number of epistemic and ethical concerns,
including problems with trust, the possibility of using
algorithms that generalize poorly to different populations,
adverse economic implications for IVF clinics, potential
misrepresentation of patient values, broader societal
implications, a responsibility gap in the case of poor
selection choices and introduction of a more paternalistic
decision-making process. Use of interpretable models, which
are constrained so that a human can easily understand and
explain them, could overcome these concerns. The
contribution of AI to IVF is potentially significant, but we
recommend that AI models used in this field should be
interpretable, and rigorously evaluated with RCTs before
implementation. We also recommend long-term follow-up of
children born after AI for embryo selection, regulatory
oversight for implementation, and public availability of
data and code to enable research teams to independently
reproduce and validate existing models.},
Doi = {10.1093/hropen/hoab040},
Key = {fds373532}
}
@article{fds354204,
Author = {Chen, Z and Bei, Y and Rudin, C},
Title = {Concept whitening for interpretable image
recognition},
Journal = {Nature Machine Intelligence},
Volume = {2},
Number = {12},
Pages = {772-782},
Year = {2020},
Month = {December},
url = {http://dx.doi.org/10.1038/s42256-020-00265-z},
Abstract = {What does a neural network encode about a concept as we
traverse through the layers? Interpretability in machine
learning is undoubtedly important, but the calculations of
neural networks are very challenging to understand. Attempts
to see inside their hidden layers can be misleading,
unusable or rely on the latent space to possess properties
that it may not have. Here, rather than attempting to
analyse a neural network post hoc, we introduce a mechanism,
called concept whitening (CW), to alter a given layer of the
network to allow us to better understand the computation
leading up to that layer. When a concept whitening module is
added to a convolutional neural network, the latent space is
whitened (that is, decorrelated and normalized) and the axes
of the latent space are aligned with known concepts of
interest. By experiment, we show that CW can provide us with
a much clearer understanding of how the network gradually
learns concepts over layers. CW is an alternative to a batch
normalization layer in that it normalizes, and also
decorrelates (whitens), the latent space. CW can be used in
any layer of the network without hurting predictive
performance.},
Doi = {10.1038/s42256-020-00265-z},
Key = {fds354204}
}
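%% The whitening step that CW adds to a layer (zero-mean,
%% decorrelated, unit-variance latent axes) is standard; a
%% numpy ZCA sketch follows. CW additionally learns a rotation
%% aligning the whitened axes with labelled concepts, which
%% this sketch omits.

import numpy as np

def zca_whiten(Z, eps=1e-5):
    # Z: (n_samples, n_features) latent activations.
    Zc = Z - Z.mean(axis=0)                 # center
    cov = Zc.T @ Zc / (len(Zc) - 1)
    vals, vecs = np.linalg.eigh(cov)        # eigendecompose the covariance
    W = vecs @ np.diag(1.0 / np.sqrt(vals + eps)) @ vecs.T
    return Zc @ W                           # decorrelated, unit-variance axes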
@article{fds354205,
Author = {Dong, J and Rudin, C},
Title = {Exploring the cloud of variable importance for the set of
all good models},
Journal = {Nature Machine Intelligence},
Volume = {2},
Number = {12},
Pages = {810-824},
Year = {2020},
Month = {December},
url = {http://dx.doi.org/10.1038/s42256-020-00264-0},
Abstract = {Variable importance is central to scientific studies,
including the social sciences and causal inference,
healthcare and other domains. However, current notions of
variable importance are often tied to a specific predictive
model. This is problematic: what if there were multiple
well-performing predictive models, and a specific variable
is important to some of them but not to others? In that
case, we cannot tell from a single well-performing model if
a variable is always important, sometimes important, never
important or perhaps only important when another variable is
not important. Ideally, we would like to explore variable
importance for all approximately equally accurate predictive
models within the same model class. In this way, we can
understand the importance of a variable in the context of
other variables, and for many good models. This work
introduces the concept of a variable importance cloud, which
maps every variable to its importance for every good
predictive model. We show properties of the variable
importance cloud and draw connections to other areas of
statistics. We introduce variable importance diagrams as a
projection of the variable importance cloud into two
dimensions for visualization purposes. Experiments with
criminal justice, marketing data and image classification
tasks illustrate how variables can change dramatically in
importance for approximately equally accurate predictive
models.},
Doi = {10.1038/s42256-020-00264-0},
Key = {fds354205}
}
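%% Each point of the variable importance cloud is one model's
%% VI vector; a minimal permutation-importance sketch for a
%% single model and variable (hypothetical helper):

import numpy as np

def permutation_vi(model, X, y, loss, j, n_repeats=10, seed=0):
    # VI of feature j: average increase in loss when column j is shuffled.
    rng = np.random.default_rng(seed)
    base = loss(y, model.predict(X))
    deltas = []
    for _ in range(n_repeats):
        Xp = X.copy()
        rng.shuffle(Xp[:, j])               # break the feature's association
        deltas.append(loss(y, model.predict(Xp)) - base)
    return float(np.mean(deltas))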
@article{fds359959,
Author = {Huang, Q and Zhou, Y and Du, X and Chen, R and Wang, J and Rudin, C and Bartesaghi, A},
Title = {Cryo-ZSSR: multiple-image super-resolution based on deep
internal learning},
Volume = {abs/2011.11020},
Year = {2020},
Month = {November},
Abstract = {Single-particle cryo-electron microscopy (cryo-EM) is an
emerging imaging modality capable of visualizing proteins
and macro-molecular complexes at near-atomic resolution. The
low electron doses used to prevent sample radiation damage
result in images where the power of the noise is 100 times
greater than the power of the signal. To overcome the
low SNRs, hundreds of thousands of particle projections
acquired over several days of data collection are averaged
in 3D to determine the structure of interest. Meanwhile,
recent image super-resolution (SR) techniques based on
neural networks have shown state-of-the-art performance on
natural images. Building on these advances, we present a
multiple-image SR algorithm based on deep internal learning
designed specifically to work under low-SNR conditions. Our
approach leverages the internal image statistics of cryo-EM
movies and does not require training on ground-truth data.
When applied to a single-particle dataset of apoferritin, we
show that the resolution of 3D structures obtained from SR
micrographs can surpass the limits imposed by the imaging
system. Our results indicate that the combination of low
magnification imaging with image SR has the potential to
accelerate cryo-EM data collection without sacrificing
resolution.},
Key = {fds359959}
}
@article{fds353866,
Author = {Wang, T and Ye, W and Geng, D and Rudin, C},
Title = {Towards Practical Lipschitz Bandits},
Journal = {FODS 2020 - Proceedings of the 2020 ACM-IMS Foundations of
Data Science Conference},
Pages = {129-138},
Year = {2020},
Month = {October},
ISBN = {9781450381031},
url = {http://dx.doi.org/10.1145/3412815.3416885},
Abstract = {Stochastic Lipschitz bandit algorithms balance exploration
and exploitation, and have been used for a variety of
important task domains. In this paper, we present a
framework for Lipschitz bandit methods that adaptively
learns partitions of context-and arm-space. Due to this
flexibility, the algorithm is able to efficiently optimize
rewards and minimize regret, by focusing on the portions of
the space that are most relevant. In our analysis, we link
tree-based methods to Gaussian processes. In light of our
analysis, we design a novel hierarchical Bayesian model for
Lipschitz bandit problems. Our experiments show that our
algorithms can achieve state-of-the-art performance in
challenging real-world tasks such as neural network
hyperparameter tuning.},
Doi = {10.1145/3412815.3416885},
Key = {fds353866}
}
@article{fds359955,
Author = {Rich, AS and Rudin, C and Jacoby, DMP and Freeman, R and Wearn, OR and Shevlin, H and Dihal, K and ÓhÉigeartaigh, SS and Butcher, J and Lippi, M and Palka, P and Torroni, P and Wongvibulsin, S and Begoli, E and Schneider, G and Cave, S and Sloane, M and Moss, E and Rahwan, I and Goldberg, K and Howard, D and Floridi, L and Stilgoe,
J},
Title = {AI reflections in 2019},
Journal = {Nature Machine Intelligence},
Volume = {2},
Number = {1},
Pages = {2-9},
Publisher = {Springer Science and Business Media LLC},
Year = {2020},
Month = {January},
url = {http://dx.doi.org/10.1038/s42256-019-0141-1},
Doi = {10.1038/s42256-019-0141-1},
Key = {fds359955}
}
@article{fds359958,
Author = {Morucci, M and Orlandi, V and Rudin, C and Roy, S and Volfovsky,
A},
Title = {Adaptive Hyper-box Matching for Interpretable Individualized
Treatment Effect Estimation},
Journal = {Proceedings of Machine Learning Research},
Volume = {124},
Pages = {1089-1098},
Publisher = {AUAI Press},
Editor = {Adams, RP and Gogate, V},
Year = {2020},
Month = {January},
Abstract = {We propose a matching method for observational data that
matches units with others in unit-specific, hyper-box-shaped
regions of the covariate space. These regions are large
enough that many matches are created for each unit and small
enough that the treatment effect is roughly constant
throughout. The regions are found either as the solution to
a mixed integer program or via a (fast) approximation
algorithm. The result is an interpretable and tailored
estimate of the causal effect for each unit.},
Key = {fds359958}
}
@article{fds359960,
Author = {Gregory, H and Li, S and Mohammadi, P and Tarn, N and Draelos, R and Rudin,
C},
Title = {A transformer approach to contextual sarcasm detection in
twitter},
Journal = {Proceedings of the Annual Meeting of the Association for
Computational Linguistics},
Pages = {270-275},
Year = {2020},
Month = {January},
ISBN = {9781952148125},
url = {http://dx.doi.org/10.18653/v1/P17},
Abstract = {Understanding tone in Twitter posts will be increasingly
important as more and more communication moves online. One
of the most difficult, yet important tones to detect is
sarcasm. In the past, LSTM and transformer architecture
models have been used to tackle this problem. We attempt to
expand upon this research, implementing LSTM, GRU, and
transformer models, and exploring new methods to classify
sarcasm in Twitter posts. Among these, the most successful
were transformer models, most notably BERT. While we
attempted a few other models described in this paper, our
most successful model was an ensemble of transformer models
including BERT, RoBERTa, XLNet, RoBERTa-large, and ALBERT.
This research was performed in conjunction with the sarcasm
detection shared task section in the Second Workshop on
Figurative Language Processing, co-located with ACL
2020.},
Doi = {10.18653/v1/P17},
Key = {fds359960}
}
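%% Transformer classifiers of the kind ensembled above are
%% conveniently exercised via the Hugging Face pipeline API; a
%% minimal sketch with a placeholder checkpoint (it does not
%% reproduce the paper's fine-tuned BERT/RoBERTa/XLNet/ALBERT
%% ensemble):

from transformers import pipeline

# Substitute any checkpoint fine-tuned for sarcasm or sentiment here.
clf = pipeline("text-classification", model="bert-base-uncased")
print(clf("Oh great, another Monday morning."))  # label plus confidence score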
@article{fds359957,
Author = {Awan, MU and Morucci, M and Orlandi, V and Roy, S and Rudin, C and Volfovsky, A},
Title = {Almost-Matching-Exactly for Treatment Effect Estimation
under Network Interference},
Journal = {Proceedings of Machine Learning Research},
Volume = {108},
Pages = {3252-3262},
Publisher = {PMLR},
Editor = {Chiappa, S and Calandra, R},
Year = {2020},
Month = {January},
Abstract = {We propose a matching method that recovers direct treatment
effects from randomized experiments where units are
connected in an observed network, and units that share edges
can potentially influence each others' outcomes. Traditional
treatment effect estimators for randomized experiments are
biased and error prone in this setting. Our method matches
units almost exactly on counts of unique subgraphs within
their neighborhood graphs. The matches that we construct are
interpretable and high-quality. Our method can be extended
easily to accommodate additional unit-level covariate
information. We show empirically that our method performs
better than other existing methodologies for this problem,
while producing meaningful, interpretable
results.},
Key = {fds359957}
}
@article{fds353047,
Author = {Menon, S and Damian, A and Hu, S and Ravi, N and Rudin,
C},
Title = {PULSE: Self-Supervised Photo Upsampling via Latent Space
Exploration of Generative Models},
Journal = {Proceedings of the IEEE Computer Society Conference on
Computer Vision and Pattern Recognition},
Pages = {2434-2442},
Year = {2020},
Month = {January},
url = {http://dx.doi.org/10.1109/CVPR42600.2020.00251},
Abstract = {The primary aim of single-image super-resolution is to
construct a high-resolution (HR) image from a corresponding
low-resolution (LR) input. In previous approaches, which
have generally been supervised, the training objective
typically measures a pixel-wise average distance between the
super-resolved (SR) and HR images. Optimizing such metrics
often leads to blurring, especially in high variance
(detailed) regions. We propose an alternative formulation of
the super-resolution problem based on creating realistic SR
images that downscale correctly. We present a novel
super-resolution algorithm addressing this problem, PULSE
(Photo Upsampling via Latent Space Exploration), which
generates high-resolution, realistic images at resolutions
previously unseen in the literature. It accomplishes this in
an entirely self-supervised fashion and is not confined to a
specific degradation operator used during training, unlike
previous methods (which require training on databases of
LR-HR image pairs for supervised learning). Instead of
starting with the LR image and slowly adding detail, PULSE
traverses the high-resolution natural image manifold,
searching for images that downscale to the original LR
image. This is formalized through the 'downscaling loss,'
which guides exploration through the latent space of a
generative model. By leveraging properties of
high-dimensional Gaussians, we restrict the search space to
guarantee that our outputs are realistic. PULSE thereby
generates super-resolved images that both are realistic and
downscale correctly. We show extensive experimental results
demonstrating the efficacy of our approach in the domain of
face super-resolution (also known as face hallucination).
Our method outperforms state-of-the-art methods in
perceptual quality at higher resolutions and scale factors
than previously possible.},
Doi = {10.1109/CVPR42600.2020.00251},
Key = {fds353047}
}
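%% The 'downscaling loss' described above has a compact form;
%% a PyTorch sketch (the full method searches a generative
%% model's latent space under this loss, which is omitted
%% here):

import torch.nn.functional as F

def downscaling_loss(sr, lr, scale):
    # sr: (N, C, H, W) candidate super-resolved image; lr: its low-res input.
    ds = F.interpolate(sr, scale_factor=1.0 / scale, mode="bicubic",
                       align_corners=False)
    return F.mse_loss(ds, lr)   # penalize SR images that do not downscale to lr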
@article{fds356438,
Author = {Wang, T and Rudin, C},
Title = {Bandits for bmo functions},
Journal = {37th International Conference on Machine Learning, ICML
2020},
Volume = {PartF168147-13},
Pages = {9938-9948},
Year = {2020},
Month = {January},
ISBN = {9781713821120},
Abstract = {We study the bandit problem where the underlying expected
reward is a Bounded Mean Oscillation (BMO) function. BMO
functions are allowed to be discontinuous and unbounded, and
are useful in modeling signals with infinities in the
domain. We develop a toolset for BMO bandits, and provide an
algorithm that can achieve poly-log δ-regret, a regret
measured against an arm that is optimal after removing a
δ-sized portion of the arm space.},
Key = {fds356438}
}
@article{fds356439,
Author = {Lin, J and Zhong, C and Hu, D and Rudin, C and Seltzer,
M},
Title = {Generalized and scalable optimal sparse decision
trees},
Journal = {37th International Conference on Machine Learning, ICML
2020},
Volume = {PartF168147-8},
Pages = {6106-6116},
Year = {2020},
Month = {January},
ISBN = {9781713821120},
Abstract = {Decision tree optimization is notoriously difficult from a
computational perspective but essential for the field of
interpretable machine learning. Despite efforts over the
past 40 years, only recently have optimization breakthroughs
been made that have allowed practical algorithms to find
optimal decision trees. These new techniques have the
potential to trigger a paradigm shift where it is possible
to construct sparse decision trees to efficiently optimize a
variety of objective functions without relying on greedy
splitting and pruning heuristics that often lead to
suboptimal solutions. The contribution in this work is to
provide a general framework for decision tree optimization
that addresses the two significant open problems in the
area: treatment of imbalanced data and fully optimizing over
continuous variables. We present techniques that produce
optimal decision trees over a variety of objectives
including F-score, AUC, and partial area under the ROC
convex hull. We also introduce a scalable algorithm that
produces provably optimal results in the presence of
continuous variables and speeds up decision tree
construction by several orders of magnitude relative to the
state of the art.},
Key = {fds356439}
}
@article{fds359956,
Author = {Morucci, M and Orlandi, V and Rudin, C and Roy, S and Volfovsky,
A},
Title = {Adaptive Hyper-box Matching for Interpretable Individualized
Treatment Effect Estimation},
Journal = {CONFERENCE ON UNCERTAINTY IN ARTIFICIAL INTELLIGENCE (UAI
2020)},
Volume = {124},
Pages = {1089-1098},
Year = {2020},
Key = {fds359956}
}
@article{fds349988,
Author = {Awan, MU and Roy, S and Morucci, M and Rudin, C and Orlandi, V and Volfovsky, A},
Title = {Almost-Matching-Exactly for Treatment Effect Estimation
under Network Interference},
Journal = {INTERNATIONAL CONFERENCE ON ARTIFICIAL INTELLIGENCE AND
STATISTICS, VOL 108},
Volume = {108},
Pages = {3252-3261},
Year = {2020},
Key = {fds349988}
}
@article{fds359961,
Author = {Gregory, H and Li, S and Mohammadi, P and Tarn, N and Draelos, R and Rudin,
C},
Title = {A Transformer Approach to Contextual Sarcasm Detection in
Twitter},
Journal = {FIGURATIVE LANGUAGE PROCESSING},
Pages = {270-275},
Year = {2020},
Key = {fds359961}
}
@article{fds346696,
Author = {Wang, F and Rudin, C and Mccormick, TH and Gore, JL},
Title = {Modeling recovery curves with application to
prostatectomy.},
Journal = {Biostatistics (Oxford, England)},
Volume = {20},
Number = {4},
Pages = {549-564},
Year = {2019},
Month = {October},
url = {http://dx.doi.org/10.1093/biostatistics/kxy002},
Abstract = {In many clinical settings, a patient outcome takes the form
of a scalar time series with a recovery curve shape, which
is characterized by a sharp drop due to a disruptive event
(e.g., surgery) and subsequent monotonic smooth rise towards
an asymptotic level not exceeding the pre-event value. We
propose a Bayesian model that predicts recovery curves based
on information available before the disruptive event. A
recovery curve of interest is the quantified sexual function
of prostate cancer patients after prostatectomy surgery. We
illustrate the utility of our model as a pre-treatment
medical decision aid, producing personalized predictions
that are both interpretable and accurate. We uncover
covariate relationships that agree with and supplement those
in the existing medical literature.},
Doi = {10.1093/biostatistics/kxy002},
Key = {fds346696}
}
@article{fds346811,
Author = {Rudin, C},
Title = {Do Simpler Models Exist and How Can We Find
Them?},
Journal = {Proceedings of the 25th ACM SIGKDD International Conference
on Knowledge Discovery & Data Mining},
Publisher = {ACM},
Year = {2019},
Month = {July},
ISBN = {9781450362016},
url = {http://dx.doi.org/10.1145/3292500.3330823},
Doi = {10.1145/3292500.3330823},
Key = {fds346811}
}
@article{fds348375,
Author = {Ustun, B and Rudin, C},
Title = {Learning optimized risk scores},
Journal = {Journal of Machine Learning Research},
Volume = {20},
Year = {2019},
Month = {June},
Abstract = {Risk scores are simple classification models that let users
make quick risk predictions by adding and subtracting a few
small numbers. These models are widely used in medicine and
criminal justice, but are difficult to learn from data
because they need to be calibrated, sparse, use small
integer coefficients, and obey application-specific
constraints. In this paper, we introduce a machine learning
method to learn risk scores. We formulate the risk score
problem as a mixed integer nonlinear program, and present a
cutting plane algorithm to recover its optimal solution. We
improve our algorithm with specialized techniques that
generate feasible solutions, narrow the optimality gap, and
reduce data-related computation. Our algorithm can train
risk scores in a way that scales linearly in the number of
samples in a dataset, and that allows practitioners to
address application-specific constraints without parameter
tuning or post-processing. We benchmark the performance of
different methods to learn risk scores on publicly available
datasets, comparing risk scores produced by our method to
risk scores built using methods that are used in practice.
We also discuss the practical benefits of our method through
a real-world application where we build a customized risk
score for ICU seizure prediction in collaboration with the
Massachusetts General Hospital.},
Key = {fds348375}
}
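%% A risk score of the kind described above fits in a few
%% lines; a hypothetical three-feature example (point values
%% and intercept invented for illustration, not a validated
%% score):

import math

POINTS = {"age_over_60": 2, "prior_seizure": 3, "abnormal_eeg": 1}  # small integers
INTERCEPT = -4                                                      # calibration offset

def predicted_risk(patient):
    # Add points for each present risk factor, then map through a logistic link.
    score = INTERCEPT + sum(p for feat, p in POINTS.items() if patient.get(feat))
    return 1.0 / (1.0 + math.exp(-score))

print(predicted_risk({"age_over_60": True, "prior_seizure": True}))  # score 1 -> ~0.73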
@article{fds344810,
Author = {Rudin, C and Shaposhnik, Y},
Title = {Globally-Consistent Rule-Based Summary-Explanations for
Machine Learning Models: Application to Credit-Risk
Evaluation},
Journal = {Journal of Machine Learning Research},
Volume = {24},
Year = {2019},
Month = {May},
Key = {fds344810}
}
@article{fds343582,
Author = {Bravo, F and Rudin, C and Shaposhnik, Y and Yuan,
Y},
Title = {Simple Rules for Predicting Congestion Risk in Queueing
Systems: Application to ICUs},
Year = {2019},
Month = {May},
Key = {fds343582}
}
@article{fds351188,
Author = {Rudin, C},
Title = {Stop Explaining Black Box Machine Learning Models for High
Stakes Decisions and Use Interpretable Models
Instead.},
Journal = {Nature Machine Intelligence},
Volume = {1},
Number = {5},
Pages = {206-215},
Year = {2019},
Month = {May},
url = {http://dx.doi.org/10.1038/s42256-019-0048-x},
Abstract = {Black box machine learning models are currently being used
for high stakes decision-making throughout society, causing
problems throughout healthcare, criminal justice, and in
other domains. People have hoped that creating methods for
explaining these black box models will alleviate some of
these problems, but trying to <i>explain</i> black box
models, rather than creating models that are
<i>interpretable</i> in the first place, is likely to
perpetuate bad practices and can potentially cause
catastrophic harm to society. There is a way forward - it is
to design models that are inherently interpretable. This
manuscript clarifies the chasm between explaining black
boxes and using inherently interpretable models, outlines
several key reasons why explainable black boxes should be
avoided in high-stakes decisions, identifies challenges to
interpretable machine learning, and provides several example
applications where interpretable models could potentially
replace black box models in criminal justice, healthcare,
and computer vision.},
Doi = {10.1038/s42256-019-0048-x},
Key = {fds351188}
}
@article{fds346809,
Author = {Dieng, A and Liu, Y and Roy, S and Rudin, C and Volfovsky,
A},
Title = {Interpretable Almost-Exact Matching for Causal
Inference.},
Journal = {Proceedings of machine learning research},
Volume = {89},
Pages = {2445-2453},
Year = {2019},
Month = {April},
Abstract = {Matching methods are heavily used in the social and health
sciences due to their interpretability. We aim to create the
highest possible quality of treatment-control matches for
categorical data in the potential outcomes framework. The
method proposed in this work aims to match units on a
weighted Hamming distance, taking into account the relative
importance of the covariates; the algorithm aims to match
units on as many relevant variables as possible. To do this,
the algorithm creates a hierarchy of covariate combinations
on which to match (similar to downward closure), in the
process solving an optimization problem for each unit in
order to construct the optimal matches. The algorithm uses a
single dynamic program to solve all of the units'
optimization problems simultaneously. Notable advantages of
our method over existing matching procedures are its
high-quality interpretable matches, versatility in handling
different data distributions that may have irrelevant
variables, and ability to handle missing data by matching on
as many available covariates as possible.},
Key = {fds346809}
}
@article{fds330649,
Author = {Ban, GY and Rudin, C},
Title = {The big Data newsvendor: Practical insights from machine
learning},
Journal = {Operations Research},
Volume = {67},
Number = {1},
Pages = {90-108},
Year = {2019},
Month = {January},
url = {http://dx.doi.org/10.1287/opre.2018.1757},
Abstract = {We investigate the data-driven newsvendor problem when one
has n observations of p features related to the demand as
well as historical demand data. Rather than a two-step
process of first estimating a demand distribution then
optimizing for the optimal order quantity, we propose
solving the “big data” newsvendor problem via
single-step machine-learning algorithms. Specifically, we
propose algorithms based on the empirical risk minimization
(ERM) principle, with and without regularization, and an
algorithm based on kernel-weights optimization (KO). The ERM
approaches, equivalent to high-dimensional quantile
regression, can be solved by convex optimization problems
and the KO approach by a sorting algorithm. We analytically
justify the use of features by showing that their omission
yields inconsistent decisions. We then derive finite-sample
performance bounds on the out-of-sample costs of the
feature-based algorithms, which quantify the effects of
dimensionality and cost parameters. Our bounds, based on
algorithmic stability theory, generalize known analyses for
the newsvendor problem without feature information. Finally,
we apply the feature-based algorithms for nurse staffing in
a hospital emergency room using a data set from a large UK
teaching hospital and find that (1) the best ERM and KO
algorithms beat the best practice benchmark by 23% and 24%,
respectively, in the out-of-sample cost, and (2) the best KO
algorithm is faster than the best ERM algorithm by three
orders of magnitude and the best practice benchmark by two
orders of magnitude.},
Doi = {10.1287/opre.2018.1757},
Key = {fds330649}
}
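%% The ERM approach above amounts to quantile regression at
%% the newsvendor critical ratio b/(b+h); a scikit-learn
%% sketch on synthetic data (costs and features invented for
%% illustration):

import numpy as np
from sklearn.linear_model import QuantileRegressor

b, h = 9.0, 1.0          # underage (lost-sale) and overage (holding) costs
tau = b / (b + h)        # critical ratio: order the 0.9 demand quantile

rng = np.random.default_rng(0)
X = rng.normal(size=(500, 3))                      # demand-related features
demand = 50 + 10 * X[:, 0] + rng.normal(scale=5, size=500)

# Single-step ERM: fit the tau-th conditional quantile of demand directly,
# rather than estimating a demand distribution and then optimizing.
model = QuantileRegressor(quantile=tau, alpha=0.0, solver="highs")
model.fit(X, demand)
orders = model.predict(X[:5])                      # feature-based order quantities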
@article{fds346810,
Author = {Usaid Awan and M and Liu, Y and Morucci, M and Roy, S and Rudin, C and Volfovsky, A},
Title = {Interpretable almost-matching-exactly with instrumental
variables},
Journal = {35th Conference on Uncertainty in Artificial Intelligence,
UAI 2019},
Year = {2019},
Month = {January},
Abstract = {Uncertainty in the
estimation of the causal effect in observational studies is
often due to unmeasured confounding, i.e., the presence of
unobserved covariates linking treatments and outcomes.
Instrumental Variables (IV) are commonly used to reduce the
effects of unmeasured confounding. Existing methods for IV
estimation either require strong parametric assumptions, use
arbitrary distance metrics, or do not scale well to large
datasets. We propose a matching framework for IV in the
presence of observed categorical confounders that addresses
these weaknesses. Our method first matches units exactly,
and then consecutively drops variables to approximately
match the remaining units on as many variables as possible.
We show that our algorithm constructs better matches than
other existing methods on simulated datasets, and we produce
interesting results in an application to political
canvassing.},
Key = {fds346810}
}
@article{fds348374,
Author = {Fisher, A and Rudin, C and Dominici, F},
Title = {All Models are Wrong, but Many are Useful: Learning a
Variable's Importance by Studying an Entire Class of
Prediction Models Simultaneously.},
Journal = {Journal of machine learning research : JMLR},
Volume = {20},
Pages = {177},
Year = {2019},
Month = {January},
Abstract = {Variable importance (VI) tools describe how much covariates
contribute to a prediction model's accuracy. However,
important variables for one well-performing model (for
example, a linear model <i>f</i>(x) =
x<sup><i>T</i></sup><i>β</i> with a fixed coefficient vector
<i>β</i>) may be unimportant for another model. In this
paper, we propose model class reliance (MCR) as the range of
VI values across <i>all</i> well-performing models in a
prespecified class.
Thus, MCR gives a more comprehensive description of
importance by accounting for the fact that many prediction
models, possibly of different parametric forms, may fit the
data well. In the process of deriving MCR, we show several
informative results for permutation-based VI estimates,
based on the VI measures used in Random Forests.
Specifically, we derive connections between permutation
importance estimates for a <i>single</i> prediction model,
U-statistics, conditional variable importance, conditional
causal effects, and linear model coefficients. We then give
probabilistic bounds for MCR, using a novel, generalizable
technique. We apply MCR to a public data set of Broward
County criminal records to study the reliance of recidivism
prediction models on sex and race. In this application, MCR
can be used to help inform VI for unknown, proprietary
models.},
Key = {fds348374}
}
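%% MCR is, in essence, the range of a VI measure over all
%% near-best models in a class; a toy empirical sketch
%% (omitting the paper's probabilistic bounds and estimators):

import numpy as np

def perm_vi(model, X, y, loss, j, rng):
    Xp = X.copy()
    rng.shuffle(Xp[:, j])                    # permute feature j
    return loss(y, model.predict(Xp)) - loss(y, model.predict(X))

def mcr_range(models, X, y, loss, j, eps=0.01, seed=0):
    # Keep models whose loss is within (1 + eps) of the best, then take the
    # range of their permutation importances for feature j.
    rng = np.random.default_rng(seed)
    losses = [loss(y, m.predict(X)) for m in models]
    best = min(losses)
    good = [m for m, l in zip(models, losses) if l <= best * (1 + eps)]
    vis = [perm_vi(m, X, y, loss, j, rng) for m in good]
    return min(vis), max(vis)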
@article{fds372692,
Author = {Parikh, H and Rudin, C and Volfovsky, A},
Title = {An Application of Matching After Learning To Stretch (MALTS)
to the ACIC 2018 Causal Inference Challenge
Data},
Journal = {Observational Studies},
Volume = {5},
Number = {2},
Pages = {118-130},
Year = {2019},
Month = {January},
url = {http://dx.doi.org/10.1353/obs.2019.0006},
Abstract = {In the learning-to-match framework for causal inference, a
parameterized distance metric is trained on a holdout train
set so that the matching yields accurate estimated
conditional average treatment effects. This way, the
matching can be as accurate as other black box machine
learning techniques for causal inference. We use a new
learning-to-match algorithm called Matching-After-Learning-To-Stretch
(MALTS) (Parikh et al., 2018) to study an observational
dataset from the Atlantic Causal Inference Challenge. Other
than providing estimates for (conditional) average
treatment effects, the MALTS procedure allows practitioners
to evaluate matched groups directly, understand where more
data might need to be collected, and gain an understanding
of when estimates can be trusted.},
Doi = {10.1353/obs.2019.0006},
Key = {fds372692}
}
@article{fds346697,
Author = {Tracà, S and Rudin, C and Yan, W},
Title = {Reducing exploration of dying arms in mortal
bandits},
Journal = {35th Conference on Uncertainty in Artificial Intelligence,
UAI 2019},
Year = {2019},
Month = {January},
Abstract = {Mortal bandits have proven to be extremely useful
for providing news article recommendations, running
automated online advertising campaigns, and for other
applications where the set of available options changes over
time. Previous work on this problem showed how to regulate
exploration of new arms when they have recently appeared,
but they do not adapt when the arms are about to disappear.
Since in most applications we can determine either exactly
or approximately when arms will disappear, we can leverage
this information to improve performance: we should not be
exploring arms that are about to disappear. We provide
adaptations of algorithms, regret bounds, and experiments
for this study, showing a clear benefit from regulating
greed (exploration/exploitation) for arms that will soon
disappear. We illustrate numerical performance on the Yahoo!
Front Page Today Module User Click Log Dataset.},
Key = {fds346697}
}
@article{fds349522,
Author = {Usaid Awan and M and Liu, Y and Morucci, M and Roy, S and Rudin, C and Volfovsky, A},
Title = {Interpretable almost-matching-exactly with instrumental
variables},
Journal = {35th Conference on Uncertainty in Artificial Intelligence,
UAI 2019},
Year = {2019},
Month = {January},
Abstract = {Uncertainty in the estimation of the causal effect in
observational studies is often due to unmeasured
confounding, i.e., the presence of unobserved covariates
linking treatments and outcomes. Instrumental Variables (IV)
are commonly used to reduce the effects of unmeasured
confounding. Existing methods for IV estimation either
require strong parametric assumptions, use arbitrary
distance metrics, or do not scale well to large datasets. We
propose a matching framework for IV in the presence of
observed categorical confounders that addresses these
weaknesses. Our method first matches units exactly, and then
consecutively drops variables to approximately match the
remaining units on as many variables as possible. We show
that our algorithm constructs better matches than other
existing methods on simulated datasets, and we produce
interesting results in an application to political
canvassing.},
Key = {fds349522}
}
@article{fds349523,
Author = {Tracà, S and Rudin, C and Yan, W},
Title = {Reducing exploration of dying arms in mortal
bandits},
Journal = {35th Conference on Uncertainty in Artificial Intelligence,
UAI 2019},
Year = {2019},
Month = {January},
Abstract = {Mortal bandits have proven to be extremely useful for
providing news article recommendations, running automated
online advertising campaigns, and for other applications
where the set of available options changes over time.
Previous work on this problem showed how to regulate
exploration of new arms when they have recently appeared,
but they do not adapt when the arms are about to disappear.
Since in most applications we can determine either exactly
or approximately when arms will disappear, we can leverage
this information to improve performance: we should not be
exploring arms that are about to disappear. We provide
adaptations of algorithms, regret bounds, and experiments
for this study, showing a clear benefit from regulating
greed (exploration/exploitation) for arms that will soon
disappear. We illustrate numerical performance on the Yahoo!
Front Page Today Module User Click Log Dataset.},
Key = {fds349523}
}
@article{fds352221,
Author = {Chen, C and Li, O and Tao, C and Barnett, AJ and Su, J and Rudin,
C},
Title = {This looks like that: Deep learning for interpretable image
recognition},
Journal = {Advances in Neural Information Processing
Systems},
Volume = {32},
Year = {2019},
Month = {January},
Abstract = {When we are faced with challenging image classification
tasks, we often explain our reasoning by dissecting the
image, and pointing out prototypical aspects of one class or
another. The mounting evidence for each of the classes helps
us make our final decision. In this work, we introduce a
deep network architecture - prototypical part network
(ProtoPNet), that reasons in a similar way: the network
dissects the image by finding prototypical parts, and
combines evidence from the prototypes to make a final
classification. The model thus reasons in a way that is
qualitatively similar to the way ornithologists, physicians,
and others would explain to people how to solve
challenging image classification tasks. The network uses
only image-level labels for training without any annotations
for parts of images. We demonstrate our method on the
CUB-200-2011 dataset and the Stanford Cars dataset. Our
experiments show that ProtoPNet can achieve comparable
accuracy with its analogous non-interpretable counterpart,
and when several ProtoPNets are combined into a larger
network, it can achieve an accuracy that is on par with some
of the best-performing deep models. Moreover, ProtoPNet
provides a level of interpretability that is absent in other
interpretable deep models.},
Key = {fds352221}
}
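%% ProtoPNet's prototype comparison reduces to a
%% distance-to-similarity map over patches of a convolutional
%% feature map; a sketch of the paper's log similarity with
%% shapes simplified to one image and one prototype:

import torch

def prototype_similarity(feature_map, prototype, eps=1e-4):
    # feature_map: (C, H, W) conv activations; prototype: (C, 1, 1) learned part.
    dists = ((feature_map - prototype) ** 2).sum(dim=0)  # (H, W) squared L2 distances
    min_dist = dists.min()                               # best-matching image patch
    return torch.log((min_dist + 1) / (min_dist + eps))  # large when a patch is close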
@article{fds352220,
Author = {Hu, X and Rudin, C and Seltzer, M},
Title = {Optimal sparse decision trees},
Journal = {Advances in Neural Information Processing
Systems},
Volume = {32},
Year = {2019},
Month = {January},
Abstract = {Decision tree algorithms have been among the most popular
algorithms for interpretable (transparent) machine learning
since the early 1980's. The problem that has plagued
decision tree algorithms since their inception is their lack
of optimality, or lack of guarantees of closeness to
optimality: decision tree algorithms are often greedy or
myopic, and sometimes produce unquestionably suboptimal
models. Hardness of decision tree optimization is both a
theoretical and practical obstacle, and even careful
mathematical programming approaches have not been able to
solve these problems efficiently. This work introduces the
first practical algorithm for optimal decision trees for
binary variables. The algorithm is a co-design of analytical
bounds that reduce the search space and modern systems
techniques, including data structures and a custom
bit-vector library. Our experiments highlight advantages in
scalability, speed, and proof of optimality.},
Key = {fds352220}
}
@article{fds373971,
Author = {Hase, P and Chen, C and Li, O and Rudin, C},
Title = {Interpretable Image Recognition with Hierarchical
Prototypes},
Journal = {Proceedings of the AAAI Conference on Human Computation and
Crowdsourcing},
Volume = {7},
Pages = {32-40},
Year = {2019},
Month = {January},
ISBN = {9781577358206},
url = {http://dx.doi.org/10.1609/hcomp.v7i1.5265},
Abstract = {Vision models are interpretable when they classify objects
on the basis of features that a person can directly
understand. Recently, methods relying on visual feature
prototypes have been developed for this purpose. However, in
contrast to how humans categorize objects, these approaches
have not yet made use of any taxonomical organization of
class labels. With such an approach, for instance, we may
see why a chimpanzee is classified as a chimpanzee, but not
why it was considered to be a primate or even an animal. In
this work we introduce a model that uses hierarchically
organized prototypes to classify objects at every level in a
predefined taxonomy. Hence, we may find distinct
explanations for the prediction an image receives at each
level of the taxonomy. The hierarchical prototypes enable
the model to perform another important task: interpretably
classifying images from previously unseen classes at the
level of the taxonomy to which they correctlyrelate, e.g.
classifying a hand gun as a weapon, when the only weapons in
the training data are rifles. With a subset of Image Net, we
test our model against its counterpart black-box model on
two tasks: 1) classification of data from familiar classes,
and 2) classification of data from previously unseen classes
at the appropriate level in the taxonomy. We find that our
model performs approximately as well as its counterpart
black-box model while allowing for each classification to
be interpreted.},
Doi = {10.1609/hcomp.v7i1.5265},
Key = {fds373971}
}
@article{fds371425,
Author = {Awan, MU and Liu, Y and Morucci, M and Roy, S and Rudin, C and Volfovsky,
A},
Title = {Interpretable Almost-Matching-Exactly With Instrumental
Variables},
Journal = {Proceedings of Machine Learning Research},
Volume = {115},
Pages = {1116-1126},
Year = {2019},
Month = {January},
Abstract = {Uncertainty in the estimation of the causal effect in
observational studies is often due to unmeasured
confounding, i.e., the presence of unobserved covariates
linking treatments and outcomes. Instrumental Variables (IV)
are commonly used to reduce the effects of unmeasured
confounding. Existing methods for IV estimation either
require strong parametric assumptions, use arbitrary
distance metrics, or do not scale well to large datasets. We
propose a matching framework for IV in the presence of
observed categorical confounders that addresses these
weaknesses. Our method first matches units exactly, and then
consecutively drops variables to approximately match the
remaining units on as many variables as possible. We show
that our algorithm constructs better matches than other
existing methods on simulated datasets, and we produce
interesting results in an application to political
canvassing.},
Key = {fds371425}
}
@article{fds371426,
Author = {Tracà, S and Rudin, C and Yan, W},
Title = {Reducing Exploration of Dying Arms in Mortal
Bandits},
Journal = {Proceedings of Machine Learning Research},
Volume = {115},
Pages = {156-163},
Year = {2019},
Month = {January},
Abstract = {Mortal bandits have proven to be extremely useful for
providing news article recommendations, running automated
online advertising campaigns, and for other applications
where the set of available options changes over time.
Previous work on this problem showed how to regulate
exploration of new arms when they have recently appeared,
but those methods do not adapt when arms are about to
disappear.
Since in most applications we can determine either exactly
or approximately when arms will disappear, we can leverage
this information to improve performance: we should not be
exploring arms that are about to disappear. We provide
adaptations of algorithms, regret bounds, and experiments
for this study, showing a clear benefit from regulating
greed (exploration/exploitation) for arms that will soon
disappear. We illustrate numerical performance on the Yahoo!
Front Page Today Module User Click Log Dataset.},
Key = {fds371426}
}
@article{fds342581,
Author = {Bei, Y and Damian, A and Hu, S and Menon, S and Ravi, N and Rudin,
C},
Title = {New techniques for preserving global structure and denoising
with low information loss in single-image
super-resolution},
Journal = {IEEE Computer Society Conference on Computer Vision and
Pattern Recognition Workshops},
Volume = {2018-June},
Pages = {987-994},
Year = {2018},
Month = {December},
ISBN = {9781538661000},
url = {http://dx.doi.org/10.1109/CVPRW.2018.00132},
Abstract = {This work identifies and addresses two important technical
challenges in single-image super-resolution: (1) how to
upsample an image without magnifying noise and (2) how to
preserve large scale structure when upsampling. We summarize
the techniques we developed for our second place entry in
Track 1 (Bicubic Downsampling), seventh place entry in Track
2 (Realistic Adverse Conditions), and seventh place entry in
Track 3 (Realistic difficult) in the 2018 NTIRE
Super-Resolution Challenge. Furthermore, we present new
neural network architectures that specifically address the
two challenges listed above: denoising and preservation of
large-scale structure.},
Doi = {10.1109/CVPRW.2018.00132},
Key = {fds342581}
}
@article{fds346812,
Author = {Timofte, R and Gu, S and Wu, J and Van Gool, L and Zhang, L and Yang, MH and Haris, M and Shakhnarovich, G and Ukita, N and Hu, S and Bei, Y and Hui, Z and Jiang, X and Gu, Y and Liu, J and Wang, Y and Perazzi, F and McWilliams, B and Sorkine-Hornung, A and Sorkine-Hornung, O and Schroers, C and Yu, J and Fan, Y and Yang, J and Xu, N and Wang, Z and Wang, X and Huang, TS and Yu, K and Hui, TW and Dong, C and Lin, L and Loy, CC and Park, D and Kim, K and Chun,
SY and Zhang, K and Liu, P and Zuo, W and Guo, S and Xu, J and Liu, Y and Xiong,
F and Dong, Y and Bai, H and Damian, A and Ravi, N and Menon, S and Rudin, C and Seo, J and Jeon, T and Koo, J and Jeon, S and Kim, SY and Choi, JS and Ki, S and Seo, S and Sim, H and Kim, S and Kim, M and Chen, R and Zeng, K and Guo, J and Qu, Y and Li, C and Ahn, N and Kang, B and Sohn, KA and Yuan, Y and Zhang, J and Pang, J and Xu, X and Zhao, Y and Deng, W and Ul Hussain, S and Aadil, M and Rahim, R and Cai, X and Huang, F and Xu, Y and Michelini, PN and Zhu, D and Liu, H and Kim, JH and Lee, JS and Huang, Y and Qiu, M and Jing, L and Zeng,
J and Sharma, M and Mukhopadhyay, R and Upadhyay, A and Koundinya, S and Shukla, A and Chaudhury, S and Zhang, Z and Hu, YH},
Title = {NTIRE 2018 challenge on single image super-resolution:
Methods and results},
Journal = {IEEE Computer Society Conference on Computer Vision and
Pattern Recognition Workshops},
Volume = {2018-June},
Pages = {965-976},
Publisher = {IEEE},
Year = {2018},
Month = {December},
ISBN = {9781538661000},
url = {http://dx.doi.org/10.1109/CVPRW.2018.00130},
Abstract = {This paper reviews the 2nd NTIRE challenge on single image
super-resolution (restoration of rich details in a low
resolution image) with focus on proposed solutions and
results. The challenge had 4 tracks. Track 1 employed the
standard bicubic downscaling setup, while Tracks 2, 3 and 4
had realistic unknown downgrading operators simulating
camera image acquisition pipeline. The operators were
learnable through provided pairs of low and high resolution
train images. The tracks had 145, 114, 101, and 113
registered participants, respectively, and 31 teams competed
in the final testing phase. The proposed methods and their
results gauge the state-of-the-art in single image
super-resolution.},
Doi = {10.1109/CVPRW.2018.00130},
Key = {fds346812}
}
@article{fds339230,
Author = {Rudin, C and Ertekin, Ş},
Title = {Learning customized and optimized lists of rules with
mathematical programming},
Journal = {Mathematical Programming Computation},
Volume = {10},
Number = {4},
Pages = {659-702},
Publisher = {Springer Nature America, Inc},
Year = {2018},
Month = {December},
url = {http://dx.doi.org/10.1007/s12532-018-0143-8},
Abstract = {We introduce a mathematical programming approach to building
rule lists, which are a type of interpretable, nonlinear,
and logical machine learning classifier involving IF-THEN
rules. Unlike traditional decision tree algorithms like CART
and C5.0, this method does not use greedy splitting and
pruning. Instead, it aims to fully optimize a combination of
accuracy and sparsity, obeying user-defined constraints.
This method is useful for producing non-black-box predictive
models, and has the benefit of a clear user-defined tradeoff
between training accuracy and sparsity. The flexible
framework of mathematical programming allows users to create
customized models with a provable guarantee of optimality.
The software reviewed as part of this submission was given
the DOI (Digital Object Identifier) https://doi.org/10.5281/zenodo.1344142.},
Doi = {10.1007/s12532-018-0143-8},
Key = {fds339230}
}
@article{fds361531,
Author = {Parikh, H and Rudin, C and Volfovsky, A},
Title = {MALTS: Matching After Learning to Stretch},
Journal = {Journal of Machine Learning Research, 23(240):1-42,
2022},
Year = {2018},
Month = {November},
Abstract = {We introduce a flexible framework that produces high-quality
almost-exact matches for causal inference. Most prior work
in matching uses ad-hoc distance metrics, often leading to
poor quality matches, particularly when there are irrelevant
covariates. In this work, we learn an interpretable distance
metric for matching, which leads to substantially higher
quality matches. The learned distance metric stretches the
covariate space according to each covariate's contribution
to outcome prediction: this stretching means that mismatches
on important covariates carry a larger penalty than
mismatches on irrelevant covariates. Our ability to learn
flexible distance metrics leads to matches that are
interpretable and useful for the estimation of conditional
average treatment effects.},
Key = {fds361531}
}
@article{fds339633,
Author = {Rudin, C and Ustun, B},
Title = {Optimized scoring systems: Toward trust in machine learning
for healthcare and criminal justice},
Journal = {Interfaces},
Volume = {48},
Number = {5},
Pages = {449-466},
Publisher = {Institute for Operations Research and the Management
Sciences (INFORMS)},
Year = {2018},
Month = {September},
url = {http://dx.doi.org/10.1287/inte.2018.0957},
Abstract = {Questions of trust in machine-learning models are
becoming increasingly important as these tools are starting
to be used widely for high-stakes decisions in medicine and
criminal justice. Transparency of models is a key aspect
affecting trust. This paper reveals that there is new
technology to build transparent machine-learning models that
are often as accurate as black-box machine-learning models.
These methods have already had an impact in medicine and
criminal justice. This work calls into question the overall
need for black-box models in these applications.},
Doi = {10.1287/inte.2018.0957},
Key = {fds339633}
}
@article{fds346813,
Author = {Liu, Y and Dieng, A and Roy, S and Rudin, C and Volfovsky,
A},
Title = {Interpretable Almost Matching Exactly for Causal
Inference},
Volume = {abs/1806.06802},
Year = {2018},
Month = {June},
Abstract = {We aim to create the highest possible quality of
treatment-control matches for categorical data in the
potential outcomes framework. Matching methods are heavily
used in the social sciences due to their interpretability,
but most matching methods do not pass basic sanity checks:
they fail when irrelevant variables are introduced, and tend
to be either computationally slow or produce low-quality
matches. The method proposed in this work aims to match
units on a weighted Hamming distance, taking into account
the relative importance of the covariates; the algorithm
aims to match units on as many relevant variables as
possible. To do this, the algorithm creates a hierarchy of
covariate combinations on which to match (similar to
downward closure), in the process solving an optimization
problem for each unit in order to construct the optimal
matches. The algorithm uses a single dynamic program to
solve all of the optimization problems simultaneously.
Notable advantages of our method over existing matching
procedures are its high-quality matches, versatility in
handling different data distributions that may have
irrelevant variables, and ability to handle missing data by
matching on as many available covariates as
possible.},
Key = {fds346813}
}
@article{fds332384,
Author = {Vu, M-AT and Adalı, T and Ba, D and Buzsáki, G and Carlson, D and Heller,
K and Liston, C and Rudin, C and Sohal, VS and Widge, AS and Mayberg, HS and Sapiro, G and Dzirasa, K},
Title = {A Shared Vision for Machine Learning in Neuroscience.},
Journal = {J Neurosci},
Volume = {38},
Number = {7},
Pages = {1601-1607},
Publisher = {Society for Neuroscience},
Year = {2018},
Month = {February},
url = {http://dx.doi.org/10.1523/JNEUROSCI.0508-17.2018},
Abstract = {With ever-increasing advancements in technology,
neuroscientists are able to collect data in greater volumes
and with finer resolution. The bottleneck in understanding
how the brain works is consequently shifting away from the
amount and type of data we can collect and toward what we
actually do with the data. There has been a growing interest
in leveraging this vast volume of data across levels of
analysis, measurement techniques, and experimental paradigms
to gain more insight into brain function. Such efforts are
visible at an international scale, with the emergence of big
data neuroscience initiatives, such as the BRAIN initiative
(Bargmann et al., 2014), the Human Brain Project, the Human
Connectome Project, and the National Institute of Mental
Health's Research Domain Criteria initiative. With these
large-scale projects, much thought has been given to
data-sharing across groups (Poldrack and Gorgolewski, 2014;
Sejnowski et al., 2014); however, even with such
data-sharing initiatives, funding mechanisms, and
infrastructure, there still exists the challenge of how to
cohesively integrate all the data. At multiple stages and
levels of neuroscience investigation, machine learning holds
great promise as an addition to the arsenal of analysis
tools for discovering how the brain works.},
Doi = {10.1523/JNEUROSCI.0508-17.2018},
Key = {fds332384}
}
@article{fds338419,
Author = {Angelino, E and Larus-Stone, N and Alabi, D and Seltzer, M and Rudin,
C},
Title = {Learning certifiably optimal rule lists for categorical
data},
Journal = {Journal of Machine Learning Research},
Volume = {18},
Pages = {1-78},
Year = {2018},
Month = {January},
Abstract = {We present the design and implementation of a custom
discrete optimization technique for building rule lists over
a categorical feature space. Our algorithm produces rule
lists with optimal training performance, according to the
regularized empirical risk, with a certificate of
optimality. By leveraging algorithmic bounds, efficient data
structures, and computational reuse, we achieve several
orders of magnitude speedup in time and a massive reduction
of memory consumption. We demonstrate that our approach
produces optimal rule lists on practical problems in
seconds. Our results indicate that it is possible to
construct optimal sparse rule lists that are approximately
as accurate as the COMPAS proprietary risk prediction tool
on data from Broward County, Florida, but that are
completely interpretable. This framework is a novel
alternative to CART and other decision tree methods for
interpretable modeling.},
Key = {fds338419}
}
@article{fds341025,
Author = {Li, O and Liu, H and Chen, C and Rudin, C},
Title = {Deep learning for case-based reasoning through prototypes: A
neural network that explains its predictions},
Journal = {32nd AAAI Conference on Artificial Intelligence, AAAI
2018},
Pages = {3530-3537},
Year = {2018},
Month = {January},
ISBN = {9781577358008},
Abstract = {Deep neural networks are widely used for classification.
These deep models often suffer from a lack of
interpretability - they are particularly difficult to
understand because of their non-linear nature. As a result,
neural networks are often treated as “black box” models,
and in the past, have been trained purely to optimize the
accuracy of predictions. In this work, we create a novel
network architecture for deep learning that naturally
explains its own reasoning for each prediction. This
architecture contains an autoencoder and a special prototype
layer, where each unit of that layer stores a weight vector
that resembles an encoded training input. The encoder of the
autoencoder allows us to do comparisons within the latent
space, while the decoder allows us to visualize the learned
prototypes. The training objective has four terms: an
accuracy term, a term that encourages every prototype to be
similar to at least one encoded input, a term that
encourages every encoded input to be close to at least one
prototype, and a term that encourages faithful
reconstruction by the autoencoder. The distances computed in
the prototype layer are used as part of the classification
process. Since the prototypes are learned during training,
the learned network naturally comes with explanations for
each prediction, and the explanations are loyal to what the
network actually computes.},
Key = {fds341025}
}
@article{fds344609,
Author = {Chen, C and Rudin, C},
Title = {An optimization approach to learning falling rule
lists},
Journal = {International Conference on Artificial Intelligence and
Statistics, AISTATS 2018},
Pages = {604-612},
Year = {2018},
Month = {January},
Abstract = {A falling rule list is a probabilistic decision list for
binary classification, consisting of a series of if-then
rules with antecedents in the if clauses and probabilities
of the desired outcome (“1”) in the then clauses. Just
as in a regular decision list, the order of rules in a
falling rule list is important – each example is
classified by the first rule whose antecedent it satisfies.
Unlike a regular decision list, a falling rule list requires
the probabilities of the desired outcome (“1”) to be
monotonically decreasing down the list. We propose an
optimization approach to learning falling rule lists and
“softly” falling rule lists, along with Monte-Carlo
search algorithms that use bounds on the optimal solution to
prune the search space.},
Key = {fds344609}
}
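%% Below is a minimal sketch (ours, not the paper's code) of how a
%% falling rule list classifies, per the abstract above: each example is
%% handled by the first rule whose antecedent it satisfies, and the
%% success probabilities decrease monotonically down the list. The
%% rules, feature names, and probabilities here are hypothetical.

def predict_falling_rule_list(x, rules, default_prob):
    """Return P(y = 1) from the first rule whose antecedent x satisfies."""
    for antecedent, prob in rules:
        if antecedent(x):
            return prob
    return default_prob  # no antecedent matched

# Hypothetical list; note the monotonically decreasing probabilities.
rules = [
    (lambda x: x["prior_event"] and x["age"] > 60, 0.85),
    (lambda x: x["prior_event"], 0.40),
    (lambda x: x["age"] > 60, 0.12),
]
print(predict_falling_rule_list({"prior_event": False, "age": 70}, rules, 0.05))  # 0.12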
@article{fds344608,
Author = {Rudin, C and Wang, Y},
Title = {Direct learning to rank and rerank},
Journal = {International Conference on Artificial Intelligence and
Statistics, AISTATS 2018},
Pages = {775-783},
Year = {2018},
Month = {January},
Abstract = {Learning-to-rank techniques have proven to be extremely
useful for prioritization problems, where we rank items in
order of their estimated probabilities, and dedicate our
limited resources to the top-ranked items. This work exposes
a serious problem with the state of learning-to-rank
algorithms, which is that they are based on convex proxies
that lead to poor approximations. We then discuss the
possibility of “exact” reranking algorithms based on
mathematical programming. We prove that a relaxed version of
the “exact” problem has the same optimal solution, and
provide an empirical analysis.},
Key = {fds344608}
}
@article{fds331943,
Author = {Struck, AF and Ustun, B and Ruiz, AR and Lee, JW and LaRoche, SM and Hirsch, LJ and Gilmore, EJ and Vlachy, J and Haider, HA and Rudin, C and Westover, MB},
Title = {Association of an Electroencephalography-Based Risk Score
With Seizure Probability in Hospitalized
Patients.},
Journal = {JAMA neurology},
Volume = {74},
Number = {12},
Pages = {1419-1424},
Year = {2017},
Month = {December},
url = {http://dx.doi.org/10.1001/jamaneurol.2017.2459},
Abstract = {IMPORTANCE: Continuous electroencephalography (EEG)
use in critically ill patients is expanding. There is no
validated method to combine risk factors and guide
clinicians in assessing seizure risk. OBJECTIVE: To
use seizure risk factors from EEG and clinical history to
create a simple scoring system associated with the
probability of seizures in patients with acute
illness. DESIGN, SETTING, AND PARTICIPANTS: We used a
prospective multicenter (Emory University Hospital, Brigham
and Women's Hospital, and Yale University Hospital) database
containing clinical and electrographic variables on 5427
continuous EEG sessions from eligible patients if they had
continuous EEG for clinical indications, excluding epilepsy
monitoring unit admissions. We created a scoring system
model to estimate seizure risk in acutely ill patients
undergoing continuous EEG. The model was built using a new
machine learning method (RiskSLIM) that is designed to
produce accurate, risk-calibrated scoring systems with a
limited number of variables and small integer weights. We
validated the accuracy and risk calibration of our model
using cross-validation and compared its performance with
models built with state-of-the-art logistic regression
methods. The database was developed by the Critical Care EEG
Research Consortium and used data collected over 3 years.
The EEG variables were interpreted using standardized
terminology by certified reviewers. EXPOSURES: All
patients had more than 6 hours of uninterrupted EEG
recordings. MAIN OUTCOMES AND MEASURES: The main
outcome was the average risk calibration
error. RESULTS: There were 5427 continuous EEGs performed on
4772 participants (2868 men, 49.9%; median age, 61 years) at
3 institutions, without further demographic stratification.
Our final model, 2HELPS2B, had
an area under the curve of 0.819 and average calibration
error of 2.7% (95% CI, 2.0%-3.6%). It included 6 variables
with the following point assignments: (1) brief (ictal)
rhythmic discharges (B[I]RDs) (2 points); (2) presence of
lateralized periodic discharges, lateralized rhythmic delta
activity, or bilateral independent periodic discharges (1
point); (3) prior seizure (1 point); (4) sporadic
epileptiform discharges (1 point); (5) frequency greater
than 2.0 Hz for any periodic or rhythmic pattern (1 point);
and (6) presence of "plus" features (superimposed, rhythmic,
sharp, or fast activity) (1 point). The probable seizure
risk of each score was 5% for a score of 0, 12% for a score
of 1, 27% for a score of 2, 50% for a score of 3, 73% for a
score of 4, 88% for a score of 5, and greater than 95% for a
score of 6 or 7. CONCLUSIONS AND RELEVANCE: The 2HELPS2B
model is a quick, accurate tool to aid clinical
judgment of the risk of seizures in critically ill
patients.},
Doi = {10.1001/jamaneurol.2017.2459},
Key = {fds331943}
}
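%% Below is a minimal sketch (ours) that applies the 2HELPS2B point
%% assignments and the score-to-risk table exactly as tabulated in the
%% abstract above; the variable names are our own shorthand.

POINTS = {
    "birds": 2,                # brief (ictal) rhythmic discharges
    "lpd_lrda_bipd": 1,        # lateralized periodic discharges, lateralized
                               # rhythmic delta activity, or bilateral
                               # independent periodic discharges
    "prior_seizure": 1,
    "sporadic_discharges": 1,  # sporadic epileptiform discharges
    "freq_over_2hz": 1,        # any periodic/rhythmic pattern > 2.0 Hz
    "plus_features": 1,        # superimposed rhythmic, sharp, or fast activity
}
RISK = {0: "5%", 1: "12%", 2: "27%", 3: "50%", 4: "73%", 5: "88%", 6: ">95%", 7: ">95%"}

def helps2b(findings):
    """Sum the points for the findings present and map to seizure risk."""
    score = sum(POINTS[f] for f in findings)
    return score, RISK[score]

print(helps2b({"birds", "prior_seizure"}))  # (3, '50%')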
@article{fds330620,
Author = {Ustun, B and Rudin, C},
Title = {Optimized risk scores},
Journal = {Proceedings of the ACM SIGKDD International Conference on
Knowledge Discovery and Data Mining},
Volume = {Part F129685},
Pages = {1125-1134},
Year = {2017},
Month = {August},
ISBN = {9781450348874},
url = {http://dx.doi.org/10.1145/3097983.3098161},
Abstract = {Risk scores are simple classification models that let users
quickly assess risk by adding, subtracting, and multiplying
a few small numbers. Such models are widely used in
healthcare and criminal justice, but are often built ad hoc.
In this paper, we present a principled approach to learn
risk scores that are fully optimized for feature selection,
integer coefficients, and operational constraints. We
formulate the risk score problem as a mixed integer
nonlinear program, and present a new cutting plane algorithm
to efficiently recover its optimal solution. Our approach
can fit optimized risk scores in a way that scales linearly
with the sample size of a dataset, provides a proof of
optimality, and obeys complex constraints without parameter
tuning. We illustrate these benefits through an extensive
set of numerical experiments, and an application where we
build a customized risk score for ICU seizure
prediction.},
Doi = {10.1145/3097983.3098161},
Key = {fds330620}
}
@article{fds330621,
Author = {Angelino, E and Larus-Stone, N and Alabi, D and Seltzer, M and Rudin,
C},
Title = {Learning certifiably optimal rule lists},
Journal = {Proceedings of the ACM SIGKDD International Conference on
Knowledge Discovery and Data Mining},
Volume = {Part F129685},
Pages = {35-44},
Publisher = {ACM Press},
Year = {2017},
Month = {August},
ISBN = {9781450348874},
url = {http://dx.doi.org/10.1145/3097983.3098047},
Abstract = {We present the design and implementation of a custom
discrete optimization technique for building rule lists over
a categorical feature space. Our algorithm provides the
optimal solution, with a certificate of optimality. By
leveraging algorithmic bounds, efficient data structures,
and computational reuse, we achieve several orders of
magnitude speedup in time and a massive reduction of memory
consumption. We demonstrate that our approach produces
optimal rule lists on practical problems in seconds. This
framework is a novel alternative to CART and other decision
tree methods.},
Doi = {10.1145/3097983.3098047},
Key = {fds330621}
}
@article{fds330622,
Author = {Wang, T and Rudin, C and Doshi-Velez, F and Liu, Y and Klampfl, E and MacNeille, P},
Title = {A Bayesian framework for learning rule sets for
interpretable classification},
Journal = {Journal of Machine Learning Research},
Volume = {18},
Pages = {1-37},
Year = {2017},
Month = {August},
Abstract = {We present a machine learning algorithm for building
classifiers that are comprised of a small number of short
rules. These are restricted disjunctive normal form models.
An example of a classifier of this form is as follows: If X
satisfies (condition A AND condition B) OR (condition C) OR
..., then Y = 1. Models of this form have the
advantage of being interpretable to human experts since they
produce a set of rules that concisely describe a specific
class. We present two probabilistic models with prior
parameters that the user can set to encourage the model to
have a desired size and shape, to conform with a
domain-specific definition of interpretability. We provide a
scalable MAP inference approach and develop theoretical
bounds to reduce computation by iteratively pruning the
search space. We apply our method (Bayesian Rule Sets –
BRS) to characterize and predict user behavior with respect
to in-vehicle context-aware personalized recommender
systems. Our method has a major advantage over classical
associative classification methods and decision trees in
that it does not greedily grow the model.},
Key = {fds330622}
}
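%% Below is a minimal sketch (ours, not the BRS inference code) of how a
%% learned rule set classifies: Y = 1 iff the instance satisfies every
%% condition of at least one rule, i.e., the restricted disjunctive
%% normal form described in the abstract above. The example conditions
%% are hypothetical.

def predict_rule_set(x, rule_set):
    """Return 1 iff x satisfies all conditions of at least one rule."""
    return int(any(all(cond(x) for cond in rule) for rule in rule_set))

# (condition A AND condition B) OR (condition C), all hypothetical.
rule_set = [
    [lambda x: x["season"] == "winter", lambda x: x["has_passenger"]],
    [lambda x: x["distance_km"] < 2.0],
]
print(predict_rule_set({"season": "summer", "has_passenger": True, "distance_km": 1.5}, rule_set))  # 1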
@article{fds330623,
Author = {Letham, B and Letham, PA and Rudin, C and Browne,
EP},
Title = {Erratum: "Prediction uncertainty and optimal experimental
design for learning dynamical systems" [Chaos 26, 063110
(2016)].},
Journal = {Chaos (Woodbury, N.Y.)},
Volume = {27},
Number = {6},
Pages = {069901},
Year = {2017},
Month = {June},
url = {http://dx.doi.org/10.1063/1.4986799},
Doi = {10.1063/1.4986799},
Key = {fds330623}
}
@article{fds330624,
Author = {Zeng, J and Ustun, B and Rudin, C},
Title = {Interpretable classification models for recidivism
prediction},
Journal = {Journal of the Royal Statistical Society. Series A:
Statistics in Society},
Volume = {180},
Number = {3},
Pages = {689-722},
Publisher = {WILEY},
Year = {2017},
Month = {June},
url = {http://dx.doi.org/10.1111/rssa.12227},
Abstract = {We investigate a long-debated question, which is how to
create predictive models of recidivism that are sufficiently
accurate, transparent and interpretable to use for decision
making. This question is complicated as these models are
used to support different decisions, from sentencing, to
determining release on probation to allocating preventative
social services. Each case might have an objective other
than classification accuracy, such as a desired true
positive rate TPR or false positive rate FPR. Each (TPR,
FPR) pair is a point on the receiver operator characteristic
(ROC) curve. We use popular machine learning methods to
create models along the full ROC curve on a wide range of
recidivism prediction problems. We show that many methods
(support vector machines, stochastic gradient boosting and
ridge regression) produce equally accurate models along the
full ROC curve. However, methods that are designed for
interpretability (classification and regression trees and
C5.0) cannot be tuned to produce models that are accurate
and/or interpretable. To handle this shortcoming, we use a
recent method called supersparse linear integer models to
produce accurate, transparent and interpretable scoring
systems along the full ROC curve. These scoring systems can
be used for decision making for many different use cases,
since they are just as accurate as the most powerful black
box machine learning models for many applications, but
completely transparent, and highly interpretable.},
Doi = {10.1111/rssa.12227},
Key = {fds330624}
}
@article{fds330625,
Author = {Ustun, B and Adler, LA and Rudin, C and Faraone, SV and Spencer, TJ and Berglund, P and Gruber, MJ and Kessler, RC},
Title = {The World Health Organization Adult Attention-Deficit/Hyperactivity
Disorder Self-Report Screening Scale for
DSM-5.},
Journal = {JAMA psychiatry},
Volume = {74},
Number = {5},
Pages = {520-527},
Year = {2017},
Month = {May},
url = {http://dx.doi.org/10.1001/jamapsychiatry.2017.0298},
Abstract = {IMPORTANCE: Recognition that adult
attention-deficit/hyperactivity disorder (ADHD) is common,
seriously impairing, and usually undiagnosed has led to the
development of adult ADHD screening scales for use in
community, workplace, and primary care settings. However,
these scales are all calibrated to DSM-IV criteria, which
are narrower than the recently developed DSM-5
criteria. OBJECTIVES: To update for DSM-5 criteria and
improve the operating characteristics of the widely used
World Health Organization Adult ADHD Self-Report Scale
(ASRS) for screening. DESIGN, SETTING, AND PARTICIPANTS:
Probability subsamples of participants in 2
general population surveys (2001-2003 household survey
[n = 119] and 2004-2005 managed care subscriber survey
[n = 218]) who completed the full 29-question
self-report ASRS, with both subsamples over-sampling
ASRS-screened positives, were blindly administered a
semistructured research diagnostic interview for DSM-5 adult
ADHD. In 2016, the Risk-Calibrated Supersparse Linear
Integer Model, a novel machine-learning algorithm designed
to create screening scales with optimal integer weights and
limited numbers of screening questions, was applied to the
pooled data to create a DSM-5 version of the ASRS screening
scale. The accuracy of the new scale was then confirmed in
an independent 2011-2012 clinical sample of patients seeking
evaluation at the New York University Langone Medical Center
Adult ADHD Program (NYU Langone) and 2015-2016 primary care
controls (n = 300). Data analysis was conducted from
April 4, 2016, to September 22, 2016. MAIN OUTCOMES AND
MEASURES: The sensitivity, specificity, area under the
curve (AUC), and positive predictive value (PPV) of the
revised ASRS. RESULTS: Of the total 637 participants,
44 (37.0%) household survey respondents, 51 (23.4%) managed
care respondents, and 173 (57.7%) NYU Langone respondents
met DSM-5 criteria for adult ADHD in the semistructured
diagnostic interview. Of the respondents who met DSM-5
criteria for adult ADHD, 123 were male (45.9%); mean (SD)
age was 33.1 (11.4) years. A 6-question screening scale was
found to be optimal in distinguishing cases from noncases in
the first 2 samples. Operating characteristics were
excellent at the diagnostic threshold in the weighted (to
the 8.2% DSM-5/Adult ADHD Clinical Diagnostic Scale
population prevalence) data (sensitivity, 91.4%;
specificity, 96.0%; AUC, 0.94; PPV, 67.3%). Operating
characteristics were similar despite a much higher
prevalence (57.7%) when the scale was applied to the NYU
Langone clinical sample (sensitivity, 91.9%; specificity,
74.0%; AUC, 0.83; PPV, 82.8%). CONCLUSIONS AND RELEVANCE:
The new ADHD screening scale is short, easily
scored, detects the vast majority of general population
cases at a threshold that also has high specificity and PPV,
and could be used as a screening tool in specialty treatment
settings.},
Doi = {10.1001/jamapsychiatry.2017.0298},
Key = {fds330625}
}
@article{fds336348,
Author = {Yang, H and Rudin, C and Seltzer, M},
Title = {Scalable Bayesian rule lists},
Journal = {34th International Conference on Machine Learning, ICML
2017},
Volume = {8},
Pages = {5971-5980},
Year = {2017},
Month = {January},
ISBN = {9781510855144},
Abstract = {We present an algorithm for building probabilistic rule
lists that is two orders of magnitude faster than previous
work. Rule list algorithms are competitors for decision tree
algorithms. They are associative classifiers, in that they
are built from pre-mined association rules. They have a
logical structure that is a sequence of IF-THEN rules,
identical to a decision list or one-sided decision tree.
Instead of using greedy splitting and pruning like decision
tree algorithms, we aim to fully optimize over rule lists,
striking a practical balance between accuracy,
interpretability, and computational speed. The algorithm
presented here uses a mixture of theoretical bounds (tight
enough to have practical implications as a screening or
bounding procedure), computational reuse, and highly tuned
language libraries to achieve computational efficiency.
Currently, for many practical problems, this method achieves
better accuracy and sparsity than decision trees, with
practical running times. The predictions in each leaf are
probabilistic.},
Key = {fds336348}
}
@article{fds349524,
Author = {Lakkaraju, H and Rudin, C},
Title = {Learning cost-effective and interpretable treatment
regimes},
Journal = {Proceedings of the 20th International Conference on
Artificial Intelligence and Statistics, AISTATS
2017},
Year = {2017},
Month = {January},
Abstract = {Decision makers, such as doctors and judges, make crucial
decisions such as recommending treatments to patients, and
granting bail to defendants on a daily basis. Such decisions
typically involve weighing the potential benefits of taking
an action against the costs involved. In this work, we aim
to automate this task of learning cost-effective,
interpretable and actionable treatment regimes. We formulate
this as a problem of learning a decision list – a sequence
of if-then-else rules – that maps characteristics of
subjects (eg., diagnostic test results of patients) to
treatments. This yields an end-to-end individualized policy
for tests and treatments. We propose a novel objective to
construct a decision list which maximizes outcomes for the
population, and minimizes overall costs. Since we do not
observe the outcomes corresponding to counterfactual
scenarios, we use techniques from causal inference
literature to infer them. We model the problem of learning
the decision list as a Markov Decision Process (MDP) and
employ a variant of the Upper Confidence Bound for Trees
(UCT) strategy which leverages customized checks for pruning
the search space effectively. Experimental results on real
world observational data capturing judicial bail decisions
and treatment recommendations for asthma patients
demonstrate the effectiveness of our approach.},
Key = {fds349524}
}
@article{fds330627,
Author = {Letham, B and Letham, LM and Rudin, C},
Title = {Bayesian inference of arrival rate and substitution behavior
from sales transaction data with stockouts},
Journal = {Proceedings of the ACM SIGKDD International Conference on
Knowledge Discovery and Data Mining},
Volume = {13-17-August-2016},
Pages = {1695-1704},
Publisher = {ACM Press},
Year = {2016},
Month = {August},
ISBN = {9781450342322},
url = {http://dx.doi.org/10.1145/2939672.2939810},
Abstract = {When an item goes out of stock, sales transaction data no
longer reflect the original customer demand, since some
customers leave with no purchase while others substitute
alternative products for the one that was out of stock. Here
we develop a Bayesian hierarchical model for inferring the
underlying customer arrival rate and choice model from sales
transaction data and the corresponding stock levels. The
model uses a nonhomogeneous Poisson process to allow the
arrival rate to vary throughout the day, and allows for a
variety of choice models. Model parameters are inferred
using a stochastic gradient MCMC algorithm that can scale to
large transaction databases. We fit the model to data from a
local bakery and show that it is able to make accurate
out-of-sample predictions, and to provide actionable insight
into lost cookie sales.},
Doi = {10.1145/2939672.2939810},
Key = {fds330627}
}
@article{fds330626,
Author = {Wang, T and Rudin, C and Velez-Doshi, F and Liu, Y and Klampfl, E and Macneille, P},
Title = {Bayesian rule sets for interpretable classification},
Journal = {Proceedings - IEEE International Conference on Data Mining,
ICDM},
Pages = {1269-1274},
Year = {2016},
Month = {July},
ISBN = {9781509054725},
url = {http://dx.doi.org/10.1109/ICDM.2016.130},
Abstract = {A Rule Set model consists of a small number of short rules
for interpretable classification, where an instance is
classified as positive if it satisfies at least one of the
rules. The rule set provides reasons for predictions, and
also descriptions of a particular class. We present a
Bayesian framework for learning Rule Set models, with prior
parameters that the user can set to encourage the model to
have a desired size and shape in order to conform with a
domain-specific definition of interpretability. We use an
efficient inference approach for searching for the MAP
solution and provide theoretical bounds to reduce
computation. We apply Rule Set models to ten UCI data sets
and compare the performance with other interpretable and
non-interpretable models.},
Doi = {10.1109/ICDM.2016.130},
Key = {fds330626}
}
@article{fds330628,
Author = {Moghaddass, R and Rudin, C and Madigan, D},
Title = {The factorized self-controlled case series method: An
approach for estimating the effects of many drugs on many
outcomes},
Journal = {Journal of Machine Learning Research},
Volume = {17},
Year = {2016},
Month = {June},
Abstract = {We provide a hierarchical Bayesian model for estimating the
effects of transient drug exposures on a collection of
health outcomes, where the effects of all drugs on all
outcomes are estimated simultaneously. The method possesses
properties that allow it to handle important challenges of
dealing with large-scale longitudinal observational
databases. In particular, this model is a generalization of
the self-controlled case series (SCCS) method, meaning that
certain patient specific baseline rates never need to be
estimated. Further, this model is formulated with layers of
latent factors, which substantially reduces the number of
parameters and helps with interpretability by illuminating
latent classes of drugs and outcomes. We believe our work is
the first to consider multivariate SCCS (in the sense of
multiple outcomes) and is the first to couple latent factor
analysis with SCCS. We demonstrate the approach by
estimating the effects of various time-sensitive insulin
treatments for diabetes.},
Key = {fds330628}
}
@article{fds330629,
Author = {Letham, B and Letham, PA and Rudin, C and Browne,
EP},
Title = {Prediction uncertainty and optimal experimental design for
learning dynamical systems.},
Journal = {Chaos (Woodbury, N.Y.)},
Volume = {26},
Number = {6},
Pages = {063110},
Year = {2016},
Month = {June},
url = {http://dx.doi.org/10.1063/1.4953795},
Abstract = {Dynamical systems are frequently used to model biological
systems. When these models are fit to data, it is necessary
to ascertain the uncertainty in the model fit. Here, we
present prediction deviation, a metric of uncertainty that
determines the extent to which observed data have
constrained the model's predictions. This is accomplished by
solving an optimization problem that searches for a pair of
models that each provides a good fit for the observed data,
yet has maximally different predictions. We develop a method
for estimating a priori the impact that additional
experiments would have on the prediction deviation, allowing
the experimenter to design a set of experiments that would
most reduce uncertainty. We use prediction deviation to
assess uncertainty in a model of interferon-alpha inhibition
of viral infection, and to select a sequence of experiments
that reduces this uncertainty. Finally, we prove a
theoretical result which shows that prediction deviation
provides bounds on the trajectories of the underlying true
model. These results show that prediction deviation is a
meaningful metric of uncertainty that can be used for
optimal experimental design.},
Doi = {10.1063/1.4953795},
Key = {fds330629}
}
@article{fds330630,
Author = {Souillard-Mandar, W and Davis, R and Rudin, C and Au, R and Libon, DJ and Swenson, R and Price, CC and Lamar, M and Penney,
DL},
Title = {Learning classification models of cognitive conditions from
subtle behaviors in the digital Clock Drawing
Test},
Journal = {Machine Learning},
Volume = {102},
Number = {3},
Pages = {393-441},
Publisher = {Springer Nature},
Year = {2016},
Month = {March},
url = {http://dx.doi.org/10.1007/s10994-015-5529-5},
Abstract = {The Clock Drawing Test—a simple pencil and paper
test—has been used for more than 50 years as a screening
tool to differentiate normal individuals from those with
cognitive impairment, and has proven useful in helping to
diagnose cognitive dysfunction associated with neurological
disorders such as Alzheimer’s disease, Parkinson’s
disease, and other dementias and conditions. We have been
administering the test using a digitizing ballpoint pen that
reports its position with considerable spatial and temporal
precision, making available far more detailed data about the
subject’s performance. Using pen stroke data from these
drawings categorized by our software, we designed and
computed a large collection of features, then explored the
tradeoffs in performance and interpretability in classifiers
built using a number of different subsets of these features
and a variety of different machine learning techniques. We
used traditional machine learning methods to build
prediction models that achieve high accuracy. We
operationalized widely used manual scoring systems so that
we could use them as benchmarks for our models. We worked
with clinicians to define guidelines for model
interpretability, and constructed sparse linear models and
rule lists designed to be as easy to use as scoring systems
currently used by clinicians, but more accurate. While our
models will require additional testing for validation, they
offer the possibility of substantial improvement in
detecting cognitive impairment earlier than currently
possible, a development with considerable potential impact
in practice.},
Doi = {10.1007/s10994-015-5529-5},
Key = {fds330630}
}
@article{fds330631,
Author = {Ustun, B and Rudin, C},
Title = {Supersparse linear integer models for optimized medical
scoring systems},
Journal = {Machine Learning},
Volume = {102},
Number = {3},
Pages = {349-391},
Publisher = {Springer Nature},
Year = {2016},
Month = {March},
url = {http://dx.doi.org/10.1007/s10994-015-5528-6},
Abstract = {Scoring systems are linear classification models that only
require users to add, subtract and multiply a few small
numbers in order to make a prediction. These models are in
widespread use by the medical community, but are difficult
to learn from data because they need to be accurate and
sparse, have coprime integer coefficients, and satisfy
multiple operational constraints. We present a new method
for creating data-driven scoring systems called a
Supersparse Linear Integer Model (SLIM). SLIM scoring
systems are built by using an integer programming problem
that directly encodes measures of accuracy (the 0–1 loss)
and sparsity (the $\ell_0$-seminorm) while
restricting coefficients to coprime integers. SLIM can
seamlessly incorporate a wide range of operational
constraints related to accuracy and sparsity, and can
produce acceptable models without parameter tuning because
of the direct control provided over these quantities. We
provide bounds on the testing and training accuracy of SLIM
scoring systems, and present a new data reduction technique
that can improve scalability by eliminating a portion of the
training data beforehand. Our paper includes results from a
collaboration with the Massachusetts General Hospital Sleep
Laboratory, where SLIM is being used to create a highly
tailored scoring system for sleep apnea screening.},
Doi = {10.1007/s10994-015-5528-6},
Key = {fds330631}
}
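%% Below is a schematic LaTeX rendering (ours, not copied from the
%% paper) of the SLIM training problem described in the abstract above:
%% minimize the 0-1 loss plus an $\ell_0$ sparsity penalty over a set
%% $\mathcal{L}$ of small coprime integer coefficient vectors, where
%% $C_0$ trades off accuracy against sparsity.

\begin{equation*}
  \min_{\lambda \in \mathcal{L}} \;
  \frac{1}{n} \sum_{i=1}^{n} \mathbb{1}\!\left[ y_i \, \lambda^{\top} x_i \le 0 \right]
  \; + \; C_0 \, \lVert \lambda \rVert_0 ,
  \qquad \mathcal{L} \subset \mathbb{Z}^{d+1} \ \text{(coprime integer coefficients)} .
\end{equation*}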
@article{fds330632,
Author = {Ustun, B and Westover, MB and Rudin, C and Bianchi,
MT},
Title = {Clinical Prediction Models for Sleep Apnea: The Importance
of Medical History over Symptoms.},
Journal = {Journal of clinical sleep medicine : JCSM : official
publication of the American Academy of Sleep
Medicine},
Volume = {12},
Number = {2},
Pages = {161-168},
Year = {2016},
Month = {February},
url = {http://dx.doi.org/10.5664/jcsm.5476},
Abstract = {STUDY OBJECTIVE: Obstructive sleep apnea (OSA) is a
treatable contributor to morbidity and mortality. However,
most patients with OSA remain undiagnosed. We used a new
machine learning method known as SLIM (Supersparse Linear
Integer Models) to test the hypothesis that a diagnostic
screening tool based on routinely available medical
information would be superior to one based solely on
patient-reported sleep-related symptoms. METHODS: We
analyzed polysomnography (PSG) and self-reported clinical
information from 1,922 patients tested in our clinical sleep
laboratory. We used SLIM and 7 state-of-the-art
classification methods to produce predictive models for OSA
screening using features from: (i) self-reported symptoms;
(ii) self-reported medical information that could, in
principle, be extracted from electronic health records
(demographics, comorbidities), or (iii) both. RESULTS: For
diagnosing OSA, we found that model performance using only
medical history features was superior to model performance
using symptoms alone, and similar to model performance using
all features. Performance was similar to that reported for
other widely used tools: sensitivity 64.2% and specificity
77%. SLIM accuracy was similar to state-of-the-art
classification models applied to this dataset, but with the
benefit of full transparency, allowing for hands-on
prediction using yes/no answers to a small number of
clinical queries. CONCLUSION: To predict OSA, variables such
as age, sex, BMI, and medical history are superior to the
symptom variables we examined. SLIM produces an actionable
clinical tool that can be
applied to data that is routinely available in modern
electronic health records, which may facilitate automated,
rather than manual, OSA screening.},
Doi = {10.5664/jcsm.5476},
Key = {fds330632}
}
@article{fds330633,
Author = {Browne, EP and Letham, B and Rudin, C},
Title = {A Computational Model of Inhibition of HIV-1 by
Interferon-Alpha.},
Journal = {PloS one},
Volume = {11},
Number = {3},
Pages = {e0152316},
Year = {2016},
Month = {January},
url = {http://dx.doi.org/10.1371/journal.pone.0152316},
Abstract = {Type 1 interferons such as interferon-alpha (IFNα) inhibit
replication of Human immunodeficiency virus (HIV-1) by
upregulating the expression of genes that interfere with
specific steps in the viral life cycle. This pathway thus
represents a potential target for immune-based therapies
that can alter the dynamics of host-virus interactions to
benefit the host. To obtain a deeper mechanistic
understanding of how IFNα impacts spreading HIV-1
infection, we modeled the interaction of HIV-1 with CD4 T
cells and IFNα as a dynamical system. This model was then
tested using experimental data from a cell culture model of
spreading HIV-1 infection. We found that a model in which
IFNα induces reversible cellular states that block both
early and late stages of HIV-1 infection, combined with a
saturating rate of conversion to these states, was able to
successfully fit the experimental dataset. Sensitivity
analysis showed that the potency of inhibition by IFNα was
particularly dependent on specific network parameters and
rate constants. This model will be useful for designing new
therapies targeting the IFNα network in HIV-1-infected
individuals, as well as potentially serving as a template
for understanding the interaction of IFNα with other
viruses.},
Doi = {10.1371/journal.pone.0152316},
Key = {fds330633}
}
@article{fds344611,
Author = {Garg, VK and Rudin, C and Jaakkola, T},
Title = {CRAFT: ClusteR-specific Assorted Feature
selecTion},
Journal = {Proceedings of the 19th International Conference on
Artificial Intelligence and Statistics, AISTATS
2016},
Pages = {305-313},
Year = {2016},
Month = {January},
Abstract = {We present a hierarchical Bayesian framework for clustering
with cluster-specific feature selection. We derive a
simplified model, CRAFT, by analyzing the asymptotic
behavior of the log posterior formulations in a
nonparametric MAP-based clustering setting in this
framework. CRAFT handles assorted data, i.e., both numeric
and categorical data, and the underlying objective functions
are intuitively appealing. The resulting algorithm is simple
to implement and scales nicely, requires minimal parameter
tuning, obviates the need to specify the number of clusters
a priori, and compares favorably with other state-of-the-art
methods on several datasets. We provide empirical evidence
on carefully designed synthetic data sets to highlight the
robustness of the algorithm to recover the underlying
feature subspaces, even when the average dimensionality of
the features across clusters is misspecified. Moreover, the
framework seamlessly allows for multiple views of clustering
by interpolating between the two extremes of
cluster-specific feature selection and global selection, and
recovers the DP-means objective [14] under the degenerate
setting of clustering without feature selection.},
Key = {fds344611}
}
@article{fds330634,
Author = {Ertekin, Ş and Rudin, C},
Title = {A bayesian approach to learning scoring systems},
Journal = {Big Data},
Volume = {3},
Number = {4},
Pages = {267-276},
Publisher = {MARY ANN LIEBERT, INC},
Year = {2015},
Month = {December},
url = {http://dx.doi.org/10.1089/big.2015.0033},
Abstract = {We present a Bayesian method for building scoring systems,
which are linear models with coefficients that have very few
significant digits. Usually the construction of scoring
systems involve manual effort - humans invent the full
scoring system without using data, or they choose how
logistic regression coefficients should be scaled and
rounded to produce a scoring system. These kinds of
heuristics lead to suboptimal solutions. Our approach is
different in that humans need only specify the prior over
what the coefficients should look like, and the scoring
system is learned from data. For this approach, we provide a
Metropolis-Hastings sampler that tends to pull the
coefficient values toward their "natural scale."
Empirically, the proposed method achieves a high degree of
interpretability of the models while maintaining competitive
generalization performances.},
Doi = {10.1089/big.2015.0033},
Key = {fds330634}
}
@article{fds330635,
Author = {Moghaddass, R and Rudin, C},
Title = {The latent state hazard model, with application to wind
turbine reliability},
Journal = {Annals of Applied Statistics},
Volume = {9},
Number = {4},
Pages = {1823-1863},
Publisher = {Institute of Mathematical Statistics},
Year = {2015},
Month = {December},
url = {http://dx.doi.org/10.1214/15-AOAS859},
Abstract = {We present a new model for reliability analysis that is able
to distinguish the latent internal vulnerability state of
the equipment from the vulnerability caused by temporary
external sources. Consider a wind farm where each turbine is
running under the external effects of temperature, wind
speed and direction, etc. The turbine might fail because of
the external effects of a spike in temperature. If it does
not fail during the temperature spike, it could still fail
due to internal degradation, and the spike could cause (or
be an indication of) this degradation. The ability to
identify the underlying latent state can help better
understand the effects of external sources and thus lead to
more robust decision-making. We present an experimental
study using SCADA sensor measurements from wind turbines in
Italy.},
Doi = {10.1214/15-AOAS859},
Key = {fds330635}
}
@article{fds330637,
Author = {Tulabandhula, T and Rudin, C},
Title = {Generalization bounds for learning with linear, polygonal,
quadratic and conic side knowledge},
Journal = {Machine Learning},
Volume = {100},
Number = {2-3},
Pages = {183-216},
Publisher = {Springer Nature},
Year = {2015},
Month = {September},
url = {http://dx.doi.org/10.1007/s10994-014-5478-4},
Abstract = {In this paper, we consider a supervised learning setting
where side knowledge is provided about the labels of
unlabeled examples. The side knowledge has the effect of
reducing the hypothesis space, leading to tighter
generalization bounds, and thus possibly better
generalization. We consider several types of side knowledge,
the first leading to linear and polygonal constraints on the
hypothesis space, the second leading to quadratic
constraints, and the last leading to conic constraints. We
show how different types of domain knowledge can lead
directly to these kinds of side knowledge. We prove bounds
on complexity measures of the hypothesis space for quadratic
and conic side knowledge, and show that these bounds are
tight in a specific sense for the quadratic
case.},
Doi = {10.1007/s10994-014-5478-4},
Key = {fds330637}
}
@article{fds330636,
Author = {Letham, B and Rudin, C and McCormick, TH and Madigan,
D},
Title = {Interpretable classifiers using rules and bayesian analysis:
Building a better stroke prediction model},
Journal = {Annals of Applied Statistics},
Volume = {9},
Number = {3},
Pages = {1350-1371},
Publisher = {Institute of Mathematical Statistics},
Year = {2015},
Month = {September},
url = {http://dx.doi.org/10.1214/15-AOAS848},
Abstract = {We aim to produce predictive models that are not only
accurate, but are also interpretable to human experts. Our
models are decision lists, which consist of a series of
if...then... statements (e.g., if high blood pressure, then
stroke) that discretize a high-dimensional, multivariate
feature space into a series of simple, readily interpretable
decision statements. We introduce a generative model called
Bayesian Rule Lists that yields a posterior distribution
over possible decision lists. It employs a novel prior
structure to encourage sparsity. Our experiments show that
Bayesian Rule Lists has predictive accuracy on par with the
current top algorithms for prediction in machine learning.
Our method is motivated by recent developments in
personalized medicine, and can be used to produce highly
accurate and interpretable medical scoring systems. We
demonstrate this by producing an alternative to the CHADS2
score, actively used in clinical practice for estimating the
risk of stroke in patients that have atrial fibrillation.
Our model is as interpretable as CHADS2, but more
accurate.},
Doi = {10.1214/15-AOAS848},
Key = {fds330636}
}
@article{fds330638,
Author = {Wang, T and Rudin, C and Wagner, D and Sevieri, R},
Title = {Finding Patterns with a Rotten Core: Data Mining for Crime
Series with Cores},
Journal = {Big Data},
Volume = {3},
Number = {1},
Pages = {3-21},
Publisher = {MARY ANN LIEBERT, INC},
Year = {2015},
Month = {March},
url = {http://dx.doi.org/10.1089/big.2014.0021},
Abstract = {One of the most challenging problems facing crime analysts
is that of identifying crime series, which are sets of
crimes committed by the same individual or group. Detecting
crime series can be an important step in predictive
policing, as knowledge of a pattern can be of paramount
importance toward finding the offenders or stopping the
pattern. Currently, crime analysts detect crime series
manually; our goal is to assist them by providing automated
tools for discovering crime series from within a database of
crimes. Our approach relies on a key hypothesis that each
crime series possesses at least one core of crimes that are
very similar to each other, which can be used to
characterize the modus operandi (M.O.) of the criminal.
Based on this assumption, as long as we find all of the
cores in the database, we have found a piece of each crime
series. We propose a subspace clustering method, where the
subspace is the M.O. of the series. The method has three
steps: We first construct a similarity graph to link crimes
that are generally similar, second we find cores of crime
using an integer linear programming approach, and third we
construct the rest of the crime series by merging cores to
form the full crime series. To judge whether a set of crimes
is indeed a core, we consider both pattern-general
similarity, which can be learned from past crime series, and
pattern-specific similarity, which is specific to the M.O.
of the series and cannot be learned. Our method can be used
for general pattern detection beyond crime series detection,
as cores exist for patterns in many domains.},
Doi = {10.1089/big.2014.0021},
Key = {fds330638}
}
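%% Illustrative sketch (Python, not from the paper): the two-part
%% crime-to-crime similarity the abstract describes, mixing a
%% pattern-general component (weights learnable from past series)
%% with a pattern-specific component tied to a candidate core's
%% M.O. All features, weights, and the 50/50 mix are assumptions.
general_weights = {"geography": 0.5, "time_of_day": 0.3,
                   "means_of_entry": 0.2}

def pairwise_similarity(crime_a, crime_b, mo_features):
    # pattern-general: weighted feature agreement, weights shared
    # across all series
    general = sum(w * (crime_a[f] == crime_b[f])
                  for f, w in general_weights.items())
    # pattern-specific: agreement on the features defining this
    # particular series' modus operandi
    specific = sum(crime_a[f] == crime_b[f]
                   for f in mo_features) / len(mo_features)
    return 0.5 * general + 0.5 * specific

a = {"geography": "east", "time_of_day": "night",
     "means_of_entry": "window"}
b = {"geography": "east", "time_of_day": "night",
     "means_of_entry": "door"}
print(pairwise_similarity(a, b, mo_features=["means_of_entry"]))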
@article{fds330639,
Author = {Ertekin, Ş and Rudin, C and McCormick, TH},
Title = {Reactive point processes: A new approach to predicting power
failures in underground electrical systems},
Journal = {Annals of Applied Statistics},
Volume = {9},
Number = {1},
Pages = {122-144},
Publisher = {Institute of Mathematical Statistics},
Year = {2015},
Month = {January},
url = {http://dx.doi.org/10.1214/14-AOAS789},
Abstract = {Reactive point processes (RPPs) are a new statistical model
designed for predicting discrete events in time based on
past history. RPPs were developed to handle an important
problem within the domain of electrical grid reliability:
short-term prediction of electrical grid failures
(“manhole events”), including outages, fires, explosions
and smoking manholes, which can cause threats to public
safety and reliability of electrical service in cities. RPPs
incorporate self-exciting, self-regulating and saturating
components. The self-excitement occurs as a result of a past
event, which causes a temporary rise in vulnerability to
future events. The self-regulation occurs as a result of an
external inspection which temporarily lowers vulnerability
to future events. RPPs can saturate when too many events or
inspections occur close together, which ensures that the
probability of an event stays within a realistic range. Two
of the operational challenges for power companies are (i)
making continuous-time failure predictions, and (ii)
cost/benefit analysis for decision making and proactive
maintenance. RPPs are naturally suited for handling both of
these challenges. We use the model to predict power-grid
failures in Manhattan over a short-term horizon, and to
provide a cost/benefit analysis of different proactive
maintenance programs.},
Doi = {10.1214/14-AOAS789},
Key = {fds330639}
}
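%% Illustrative sketch (Python, not the paper's exact model): a
%% reactive-point-process-style conditional intensity in which past
%% events raise vulnerability, past inspections lower it, and a
%% saturating squash keeps the rate in a realistic range. The
%% exponential kernels, logistic saturation, and all parameter
%% values are assumptions made for this sketch.
import math

def intensity(t, event_times, inspection_times,
              base=0.1, excite=0.5, regulate=0.4, decay=1.0):
    # self-excitation: each past event adds a decaying bump
    rise = sum(excite * math.exp(-decay * (t - s))
               for s in event_times if s < t)
    # self-regulation: each past inspection subtracts a decaying bump
    drop = sum(regulate * math.exp(-decay * (t - s))
               for s in inspection_times if s < t)
    raw = base + rise - drop
    # saturation: output stays in (0, 2 * base) however many events
    # or inspections pile up close together
    return base * 2.0 / (1.0 + math.exp(-raw / base))

print(intensity(5.0, event_times=[3.0, 4.5], inspection_times=[4.0]))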
@article{fds330640,
Author = {Wang, F and Rudin, C},
Title = {Falling rule lists},
Journal = {Journal of Machine Learning Research},
Volume = {38},
Pages = {1013-1022},
Year = {2015},
Month = {January},
Abstract = {Falling rule lists are classification models consisting of
an ordered list of if-then rules, where (i) the order of
rules determines which example should be classified by each
rule, and (ii) the estimated probability of success
decreases monotonically down the list. These kinds of rule
lists are inspired by healthcare applications where patients
would be stratified into risk sets and the highest at-risk
patients should be considered first. We provide a Bayesian
framework for learning falling rule lists that does not rely
on traditional greedy decision tree learning
methods.},
Key = {fds330640}
}
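%% Illustrative sketch (Python, not from the paper): the defining
%% constraint of a falling rule list is that estimated risk
%% decreases monotonically down the ordered rules, so the most
%% at-risk stratum is read off first. Rules and numbers below are
%% hypothetical.
falling_rules = [
    ("prior_hospital_stays >= 2", 0.70),   # highest-risk stratum first
    ("age >= 75",                 0.40),
    ("diabetic",                  0.25),
    ("default",                   0.05),
]

probs = [p for _, p in falling_rules]
assert all(a >= b for a, b in zip(probs, probs[1:])), \
    "not a falling rule list: risks must be non-increasing"
print("monotone risk ordering verified")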
@article{fds330641,
Author = {Rudin, C},
Title = {Turning prediction tools into decision tools},
Journal = {Lecture Notes in Computer Science (including subseries
Lecture Notes in Artificial Intelligence and Lecture Notes
in Bioinformatics)},
Volume = {9355},
Year = {2015},
Month = {January},
ISBN = {9783319244853},
Abstract = {Arguably, the main stumbling block in getting machine
learning algorithms used in practice is the fact that people
do not trust them. There could be many reasons for this, for
instance, perhaps the models are not sparse or transparent,
or perhaps the models are not able to be customized to the
user’s specifications as to what a decision tool should
look like. I will discuss some recent work from the
Prediction Analysis Lab on how to build machine learning
models that have helpful decision-making properties. I will
show how these models are applied to problems in healthcare
and criminology.},
Key = {fds330641}
}
@article{fds330647,
Author = {Tulabandhula, T and Rudin, C},
Title = {Tire changes, fresh air, and yellow flags: Challenges in
predictive analytics for professional racing},
Journal = {Big Data},
Volume = {2},
Number = {2},
Pages = {97-112},
Publisher = {MARY ANN LIEBERT, INC},
Year = {2014},
Month = {June},
url = {http://dx.doi.org/10.1089/big.2014.0018},
Abstract = {Our goal is to design a prediction and decision system for
real-time use during a professional car race. In designing a
knowledge discovery process for racing, we faced several
challenges that were overcome only when domain knowledge of
racing was carefully infused within statistical modeling
techniques. In this article, we describe how we leveraged
expert knowledge of the domain to produce a real-time
decision system for tire changes within a race. Our
forecasts have the potential to impact how racing teams can
optimize strategy by making tire-change decisions to benefit
their rank position. Our work significantly expands previous
research on sports analytics, as it is the only work on
analytical methods for within-race prediction and decision
making for professional car racing.},
Doi = {10.1089/big.2014.0018},
Key = {fds330647}
}
@article{fds330644,
Author = {Ertekin, S and Rudin, C and Hirsh, H},
Title = {Approximating the crowd},
Journal = {Data Mining and Knowledge Discovery},
Volume = {28},
Number = {5-6},
Pages = {1189-1221},
Publisher = {Springer Nature},
Year = {2014},
Month = {January},
url = {http://dx.doi.org/10.1007/s10618-014-0354-1},
Abstract = {The problem of "approximating the crowd" is that of
estimating the crowd's majority opinion by querying only a
subset of it. Algorithms that approximate the crowd can
intelligently stretch a limited budget for a crowdsourcing
task. We present an algorithm, "CrowdSense," that works in
an online fashion where items come one at a time. CrowdSense
dynamically samples subsets of the crowd based on an
exploration/exploitation criterion. The algorithm produces a
weighted combination of the subset's votes that approximates
the crowd's opinion. We then introduce two variations of
CrowdSense that make various distributional approximations
to handle distinct crowd characteristics. In particular, the
first algorithm makes a statistical independence
approximation of the labelers for large crowds, whereas the
second algorithm finds a lower bound on how often the
current subcrowd agrees with the crowd's majority vote. Our
experiments on CrowdSense and several baselines demonstrate
that we can reliably approximate the entire crowd's vote by
collecting opinions from a representative subset of the
crowd. © 2014 The Author(s).},
Doi = {10.1007/s10618-014-0354-1},
Key = {fds330644}
}
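%% Illustrative sketch (Python): the weighted-vote aggregation step
%% of a CrowdSense-style approximator. For brevity the subset of
%% labelers is sampled uniformly and the weights are fixed; the
%% paper's algorithm chooses labelers by an exploration/exploitation
%% criterion and learns the weights online.
import random

def approximate_majority(votes, weights, budget, rng):
    # query only `budget` labelers out of the whole crowd
    queried = rng.sample(range(len(votes)), budget)
    score = sum(weights[i] * votes[i] for i in queried)  # votes in {-1, +1}
    return 1 if score >= 0 else -1

votes = [1, 1, -1, 1, -1, 1, 1]   # full crowd, never fully observed in practice
weights = [1.0] * len(votes)
print(approximate_majority(votes, weights, budget=3,
                           rng=random.Random(0)))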
@article{fds330645,
Author = {Kim, B and Rudin, C},
Title = {Learning about meetings},
Journal = {Data Mining and Knowledge Discovery},
Volume = {28},
Number = {5-6},
Pages = {1134-1157},
Publisher = {Springer Nature},
Year = {2014},
Month = {January},
url = {http://dx.doi.org/10.1007/s10618-014-0348-z},
Abstract = {Most people participate in meetings almost every day,
multiple times a day. The study of meetings is important,
but also challenging, as it requires an understanding of
social signals and complex interpersonal dynamics. Our aim
in this work is to use a data-driven approach to the science
of meetings. We provide tentative evidence that: (i) it is
possible to automatically detect when during the meeting a
key decision is taking place, from analyzing only the local
dialogue acts, (ii) there are common patterns in the way
social dialogue acts are interspersed throughout a meeting,
(iii) at the time key decisions are made, the amount of time
left in the meeting can be predicted from the amount of time
that has passed, (iv) it is often possible to predict
whether a proposal during a meeting will be accepted or
rejected based entirely on the language (the set of
persuasive words) used by the speaker. © 2014 The
Author(s).},
Doi = {10.1007/s10618-014-0348-z},
Key = {fds330645}
}
@article{fds330646,
Author = {Rudin, C and Ertekin, S and Passonneau, R and Radeva, A and Tomar, A and Xie, B and Lewis, S and Riddle, M and Pangsrivinij, D and McCormick,
T},
Title = {Analytics for power grid distribution reliability in New
York City},
Journal = {Interfaces},
Volume = {44},
Number = {4},
Pages = {364-382},
Publisher = {Institute for Operations Research and the Management
Sciences (INFORMS)},
Year = {2014},
Month = {January},
url = {http://dx.doi.org/10.1287/inte.2014.0748},
Abstract = {We summarize the first major effort to use analytics for
preemptive maintenance and repair of an electrical
distribution network. This is a large-scale multiyear effort
between scientists and students at Columbia University and
the Massachusetts Institute of Technology and engineers from
the Consolidated Edison Company of New York (Con Edison),
which operates the world's oldest and largest underground
electrical system. Con Edison's preemptive maintenance
programs are less than a decade old and are made more
effective with the use of analytics developing alongside
them. Some of the data we used for our projects are
historical records dating as far back as the 1880s, and some
of the data are free-text documents typed by Con Edison
dispatchers. The operational goals of this work are to
assist with Con Edison's preemptive inspection and repair
program and its vented-cover replacement program. This has a
continuing impact on the public safety, operating costs, and
reliability of electrical service in New York City. © 2014
INFORMS.},
Doi = {10.1287/inte.2014.0748},
Key = {fds330646}
}
@article{fds330648,
Author = {Rudin, C and Wagstaff, KL},
Title = {Machine learning for science and society},
Journal = {Machine Learning},
Volume = {95},
Number = {1},
Pages = {1-9},
Publisher = {Springer Nature},
Year = {2014},
Month = {January},
url = {http://dx.doi.org/10.1007/s10994-013-5425-9},
Abstract = {The special issue on "Machine Learning for Science and
Society" showcases machine learning work with influence on
our current and future society. These papers address several
key problems such as how we perform repairs on critical
infrastructure, how we predict severe weather and aviation
turbulence, how we conduct tax audits, whether we can detect
privacy breaches in access to healthcare data, and how we
link individuals across census data sets for new insights
into population changes. In this introduction, we discuss
the need for such a special issue within the context of our
field and its relationship to the broader world. In the era
of "big data," there is a need for machine learning to
address important large-scale applied problems, yet it is
difficult to find top venues in machine learning where such
work is encouraged. We discuss the ramifications of this
contradictory situation and encourage further discussion on
the best strategy that we as a field may adopt. We also
summarize key lessons learned from individual papers in the
special issue so that the community as a whole can benefit.
© 2013 The Author(s).},
Doi = {10.1007/s10994-013-5425-9},
Key = {fds330648}
}
@article{fds330643,
Author = {Tulabandhula, T and Rudin, C},
Title = {On combining machine learning with decision
making},
Journal = {Machine Learning},
Volume = {97},
Number = {1-2},
Pages = {33-64},
Publisher = {Springer Nature},
Year = {2014},
Month = {January},
url = {http://dx.doi.org/10.1007/s10994-014-5459-7},
Abstract = {We present a new application and covering number bound for
the framework of "Machine Learning with Operational Costs
(MLOC)," which is an exploratory form of decision theory.
The MLOC framework incorporates knowledge about how a
predictive model will be used for a subsequent task, thus
combining machine learning with the decision that is made
afterwards. In this work, we use the MLOC framework to study
a problem that has implications for power grid reliability
and maintenance, called the Machine Learning and Traveling
Repairman Problem (ML&TRP). The goal of the ML&TRP is to
determine a route for a "repair crew," which repairs nodes
on a graph. The repair crew aims to minimize the cost of
failures at the nodes, but as in many real situations, the
failure probabilities are not known and must be estimated.
The MLOC framework allows us to understand how this
uncertainty influences the repair route. We also present new
covering number generalization bounds for the MLOC
framework. © 2014 The Author(s).},
Doi = {10.1007/s10994-014-5459-7},
Key = {fds330643}
}
@article{fds330650,
Author = {Huggins, JH and Rudin, C},
Title = {A statistical learning theory framework for supervised
pattern discovery},
Journal = {SIAM International Conference on Data Mining 2014, SDM
2014},
Volume = {1},
Pages = {506-514},
Publisher = {Society for Industrial and Applied Mathematics},
Year = {2014},
Month = {January},
ISBN = {9781510811515},
url = {http://dx.doi.org/10.1137/1.9781611973440.58},
Abstract = {This paper formalizes a latent variable inference problem we
call supervised pattern discovery, the goal of which is to
find sets of observations that belong to a single "pattern."
We discuss two versions of the problem and prove uniform
risk bounds for both. In the first version, collections of
patterns can be generated in an arbitrary manner and the
data consist of multiple labeled collections. In the second
version, the patterns are assumed to be generated
independently by identically distributed processes. These
processes are allowed to take an arbitrary form, so
observations within a pattern are not in general independent
of each other. The bounds for the second version of the
problem are stated in terms of a new complexity measure, the
quasi-Rademacher complexity.},
Doi = {10.1137/1.9781611973440.58},
Key = {fds330650}
}
@article{fds330651,
Author = {Kim, B and Rudin, C and Shah, J},
Title = {The Bayesian case model: A generative approach for
case-based reasoning and prototype classification},
Journal = {Advances in Neural Information Processing
Systems},
Volume = {3},
Number = {January},
Pages = {1952-1960},
Year = {2014},
Month = {January},
Abstract = {We present the Bayesian Case Model (BCM), a general
framework for Bayesian case-based reasoning (CBR) and
prototype classification and clustering. BCM brings the
intuitive power of CBR to a Bayesian generative framework.
The BCM learns prototypes, the "quintessential" observations
that best represent clusters in a dataset, by performing
joint inference on cluster labels, prototypes and important
features. Simultaneously, BCM pursues sparsity by learning
subspaces, the sets of features that play important roles in
the characterization of the prototypes. The prototype and
subspace representation provides quantitative benefits in
interpretability while preserving classification accuracy.
Human subject experiments verify statistically significant
improvements to participants' understanding when using
explanations produced by BCM, compared to those given by
prior art.},
Key = {fds330651}
}
@article{fds330652,
Author = {Goh, ST and Rudin, C},
Title = {Box drawings for learning with imbalanced
data},
Journal = {Proceedings of the ACM SIGKDD International Conference on
Knowledge Discovery and Data Mining},
Pages = {333-342},
Publisher = {ACM Press},
Year = {2014},
Month = {January},
ISBN = {9781450329569},
url = {http://dx.doi.org/10.1145/2623330.2623648},
Abstract = {The vast majority of real world classification problems are
imbalanced, meaning there are far fewer data from the class
of interest (the positive class) than from other classes. We
propose two machine learning algorithms to handle highly
imbalanced classification problems. The classifiers are
disjunctions of conjunctions, and are created as unions of
parallel axis rectangles around the positive examples, and
thus have the benefit of being interpretable. The first
algorithm uses mixed integer programming to optimize a
weighted balance between positive and negative class
accuracies. Regularization is introduced to improve
generalization performance. The second method uses an
approximation in order to assist with scalability.
Specifically, it follows a \textit{characterize then
discriminate} approach, where the positive class is
characterized first by boxes, and then each box boundary
becomes a separate discriminative classifier. This method
has the computational advantages that it can be easily
parallelized, and considers only the relevant regions of
feature space. © 2014 ACM.},
Doi = {10.1145/2623330.2623648},
Key = {fds330652}
}
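%% Illustrative sketch (Python, not from the paper): prediction with
%% a union-of-boxes classifier of the kind the abstract describes.
%% The two axis-parallel boxes below are fixed by hand; the paper's
%% contribution is learning their boundaries, either exactly via
%% mixed integer programming or approximately by characterizing
%% then discriminating.
boxes = [
    {"x": (0.0, 1.0), "y": (2.0, 3.5)},   # hypothetical boxes drawn
    {"x": (4.0, 5.5), "y": (0.0, 1.0)},   # around the rare positive class
]

def predict(point):
    for box in boxes:
        if all(lo <= point[d] <= hi for d, (lo, hi) in box.items()):
            return 1        # inside some box: positive class
    return -1               # outside every box: negative class

print(predict({"x": 0.5, "y": 3.0}))   # -> 1
print(predict({"x": 2.0, "y": 2.0}))   # -> -1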
@article{fds330653,
Author = {Wang, D and Passonneau, RJ and Collins, M and Rudin,
C},
Title = {Modeling weather impact on a secondary electrical
grid},
Journal = {Procedia Computer Science},
Volume = {32},
Pages = {631-638},
Publisher = {Elsevier BV},
Year = {2014},
Month = {January},
url = {http://dx.doi.org/10.1016/j.procs.2014.05.470},
Abstract = {Weather can cause problems for underground electrical grids
by increasing the probability of serious "manhole events"
such as fires and explosions. In this work, we compare a
model that incorporates weather features associated with the
dates of serious events into a single logistic regression,
with a more complex approach that has three interdependent
log linear models for weather, baseline manhole
vulnerability, and vulnerability of manholes to weather. The
latter approach more naturally incorporates the dependencies
between the weather, structure properties, and structure
vulnerability. © 2014 Published by Elsevier
B.V.},
Doi = {10.1016/j.procs.2014.05.470},
Key = {fds330653}
}
@article{fds345400,
Author = {Tulabandhula, T and Rudin, C},
Title = {Robust optimization using machine learning for uncertainty
sets},
Journal = {International Symposium on Artificial Intelligence and
Mathematics, ISAIM 2014},
Year = {2014},
Month = {January},
Abstract = {Our goal is to build robust optimization problems
that make decisions about the future, and where complex data
from the past are used to model uncertainty. In robust
optimization (RO) generally, the goal is to create a policy
for decision-making that is robust to our uncertainty about
the future. In particular, we want our policy to best handle
the worst possible situation that could arise, out of an
uncertainty set of possible situations. Classically, the
uncertainty set is simply chosen by the user, or it might be
estimated in overly simplistic ways with strong assumptions;
whereas in this work, we learn the uncertainty set from
complex data from the past. The past data are drawn randomly
from an (unknown) possibly complicated high-dimensional
distribution. We propose a new uncertainty set design and
show how tools from statistical learning theory can be
employed to provide probabilistic guarantees on the
robustness of the policy.},
Key = {fds345400}
}
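%% Illustrative sketch (Python, not the paper's construction): the
%% contrast between a user-chosen uncertainty set and one learned
%% from past data. Here the "learned" set is just an empirical
%% quantile interval and the decision is a toy ordering problem;
%% both are stand-ins for the paper's statistical-learning-theoretic
%% design with its probabilistic guarantees.
past_demand = [7, 9, 8, 12, 10, 9, 11, 8, 10, 9]   # illustrative data

ordered = sorted(past_demand)
lo = ordered[int(0.1 * len(ordered))]        # learned interval endpoints
hi = ordered[int(0.9 * len(ordered)) - 1]

def profit(order, demand, price=3.0, cost=1.0):
    return price * min(order, demand) - cost * order

# Robust policy: maximize the worst case over the learned interval.
# Profit is monotone in demand, so the worst case sits at an endpoint.
best = max(range(20), key=lambda q: min(profit(q, d) for d in (lo, hi)))
print(f"learned uncertainty set = [{lo}, {hi}], robust order = {best}")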
@article{fds345401,
Author = {Tulabandhula, T and Rudin, C},
Title = {Generalization bounds for learning with linear and quadratic
side knowledge},
Journal = {International Symposium on Artificial Intelligence and
Mathematics, ISAIM 2014},
Year = {2014},
Month = {January},
Abstract = {In this paper, we consider a supervised learning
setting where side knowledge is provided about the labels of
unlabeled examples. The side knowledge has the effect of
reducing the hypothesis space, leading to tighter
generalization bounds, and thus possibly better
generalization. We consider two types of side knowledge, the
first leading to linear constraints on the hypothesis space,
and the second leading to quadratic constraints on the
hypothesis space. We show how different types of domain
knowledge can lead directly to these kinds of side
knowledge. We prove bounds on complexity measures of the
hypothesis space for quadratic side knowledge, and show that
these bounds are tight in a specific sense.},
Key = {fds345401}
}
@article{fds345402,
Author = {Huggins, J and Rudin, C},
Title = {Toward a theory of pattern discovery},
Journal = {International Symposium on Artificial Intelligence and
Mathematics, ISAIM 2014},
Year = {2014},
Month = {January},
Abstract = {This paper formalizes a latent variable inference
problem we call supervised pattern discovery, the goal of
which is to find sets of observations that belong to a
single “pattern.” We discuss two versions of the problem
and prove uniform risk bounds for both. In the first
version, collections of patterns can be generated in an
arbitrary manner and the data consist of multiple labeled
collections. In the second version, the patterns are assumed
to be generated independently by identically distributed
processes. These processes are allowed to take an arbitrary
form, so observations within a pattern are not in general
independent of each other. The bounds for the second version
of the problem are stated in terms of a new complexity
measure, the quasi-Rademacher complexity.},
Key = {fds345402}
}
@article{fds330654,
Author = {Letham, B and Rudin, C and Heller, KA},
Title = {Growing a list},
Journal = {Data Mining and Knowledge Discovery},
Volume = {27},
Number = {3},
Pages = {372-395},
Year = {2013},
Month = {December},
url = {http://dx.doi.org/10.1007/s10618-013-0329-7},
Abstract = {It is easy to find expert knowledge on the Internet on
almost any topic, but obtaining a complete overview of a
given topic is not always easy: information can be scattered
across many sources and must be aggregated to be useful. We
introduce a method for intelligently growing a list of
relevant items, starting from a small seed of examples. Our
algorithm takes advantage of the wisdom of the crowd, in the
sense that there are many experts who post lists of things
on the Internet. We use a collection of simple machine
learning components to find these experts and aggregate
their lists to produce a single complete and meaningful
list. We use experiments with gold standards and open-ended
experiments without gold standards to show that our method
significantly outperforms the state of the art. Our method
uses the ranking algorithm Bayesian Sets even when its
underlying independence assumption is violated, and we
provide a theoretical generalization bound to motivate its
use. © 2013 The Author(s).},
Doi = {10.1007/s10618-013-0329-7},
Key = {fds330654}
}
@article{fds330655,
Author = {Rudin, C and Letham, B and Madigan, D},
Title = {Learning theory analysis for association rules and
sequential event prediction},
Journal = {Journal of Machine Learning Research},
Volume = {14},
Pages = {3441-3492},
Year = {2013},
Month = {November},
Abstract = {We present a theoretical analysis for prediction algorithms
based on association rules. As part of this analysis, we
introduce a problem for which rules are particularly
natural, called "sequential event prediction." In sequential
event prediction, events in a sequence are revealed one by
one, and the goal is to determine which event will next be
revealed. The training set is a collection of past sequences
of events. An example application is to predict which item
will next be placed into a customer's online shopping cart,
given his/her past purchases. In the context of this
problem, algorithms based on association rules have distinct
advantages over classical statistical and machine learning
methods: they look at correlations based on subsets of
co-occurring past events (items a and b imply item c), they
can be applied to the sequential event prediction problem in
a natural way, they can potentially handle the "cold start"
problem where the training set is small, and they yield
interpretable predictions. In this work, we present two
algorithms that incorporate association rules. These
algorithms can be used both for sequential event prediction
and for supervised classification, and they are simple
enough that they can possibly be understood by users,
customers, patients, managers, etc. We provide
generalization guarantees on these algorithms based on
algorithmic stability analysis from statistical learning
theory. We include a discussion of the strict minimum
support threshold often used in association rule mining, and
introduce an "adjusted confidence" measure that provides a
weaker minimum support condition that has advantages over
the strict minimum support. The paper brings together ideas
from statistical learning theory, association rule mining
and Bayesian analysis. © 2013 Cynthia Rudin, Benjamin
Letham and David Madigan.},
Key = {fds330655}
}
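%% Illustrative sketch (Python): one simple reading of the "adjusted
%% confidence" idea in the abstract. Ordinary confidence divides the
%% joint count by the antecedent count; adding K to the denominator
%% softly penalizes rules with little support instead of imposing a
%% strict minimum support threshold. The counts and K below are
%% invented.
def adjusted_confidence(count_ab, count_a, K):
    # ordinary confidence is count_ab / count_a (the K = 0 case)
    return count_ab / (count_a + K)

# A rule seen 2 out of 2 times vs. one seen 80 out of 100 times:
print(adjusted_confidence(2, 2, K=10))     # ~0.167: tiny support, low score
print(adjusted_confidence(80, 100, K=10))  # ~0.727: well-supported rule wins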
@article{fds330656,
Author = {Letham, B and Rudin, C and Madigan, D},
Title = {Sequential event prediction},
Journal = {Machine Learning},
Volume = {93},
Number = {2-3},
Pages = {357-380},
Publisher = {Springer Nature},
Year = {2013},
Month = {November},
url = {http://dx.doi.org/10.1007/s10994-013-5356-5},
Abstract = {In sequential event prediction, we are given a "sequence
database" of past event sequences to learn from, and we aim
to predict the next event within a current event sequence.
We focus on applications where the set of the past events
has predictive power and not the specific order of those
past events. Such applications arise in recommender systems,
equipment maintenance, medical informatics, and in other
domains. Our formalization of sequential event prediction
draws on ideas from supervised ranking. We show how specific
choices within this approach lead to different sequential
event prediction problems and algorithms. In recommender
system applications, the observed sequence of events depends
on user choices, which may be influenced by the
recommendations, which are themselves tailored to the user's
choices. This leads to sequential event prediction
algorithms involving a non-convex optimization problem. We
apply our approach to an online grocery store recommender
system, email recipient recommendation, and a novel
application in the health event prediction domain. © 2013
The Author(s).},
Doi = {10.1007/s10994-013-5356-5},
Key = {fds330656}
}
@article{fds330658,
Author = {Wang, T and Rudin, C and Wagner, D and Sevieri, R},
Title = {Learning to detect patterns of crime},
Journal = {Lecture Notes in Computer Science (including subseries
Lecture Notes in Artificial Intelligence and Lecture Notes
in Bioinformatics)},
Volume = {8190 LNAI},
Number = {PART 3},
Pages = {515-530},
Publisher = {Springer Berlin Heidelberg},
Year = {2013},
Month = {October},
ISBN = {9783642409936},
url = {http://dx.doi.org/10.1007/978-3-642-40994-3_33},
Abstract = {Our goal is to automatically detect patterns of crime. Among
a large set of crimes that happen every year in a major
city, it is challenging, time-consuming, and labor-intensive
for crime analysts to determine which ones may have been
committed by the same individual(s). If automated,
data-driven tools for crime pattern detection are made
available to assist analysts, these tools could help police
to better understand patterns of crime, leading to more
precise attribution of past crimes, and the apprehension of
suspects. To do this, we propose a pattern detection
algorithm called Series Finder, which grows a pattern of
discovered crimes from within a database, starting from a
"seed" of a few crimes. Series Finder incorporates both the
common characteristics of all patterns and the unique
aspects of each specific pattern, and has had promising
results on a decade's worth of crime pattern data collected
by the Crime Analysis Unit of the Cambridge Police
Department. © 2013 Springer-Verlag.},
Doi = {10.1007/978-3-642-40994-3_33},
Key = {fds330658}
}
@article{fds330659,
Author = {Mukherjee, I and Rudin, C and Schapire, RE},
Title = {The rate of convergence of AdaBoost},
Journal = {Journal of Machine Learning Research},
Volume = {14},
Pages = {2315-2347},
Year = {2013},
Month = {August},
Abstract = {The AdaBoost algorithm was designed to combine many "weak"
hypotheses that perform slightly better than random guessing
into a "strong" hypothesis that has very low error. We study
the rate at which AdaBoost iteratively converges to the
minimum of the "exponential loss." Unlike previous work, our
proofs do not require a weak-learning assumption, nor do
they require that minimizers of the exponential loss are
finite. Our first result shows that the exponential loss of
AdaBoost's computed parameter vector will be at most ε more
than that of any parameter vector of ℓ1-norm bounded by B in
a number of rounds that is at most a polynomial in B and
1/ε. We also provide lower bounds showing that a polynomial
dependence is necessary. Our second result is that within
C/ε iterations, AdaBoost achieves a value of the
exponential loss that is at most ε more than the best
possible value, where C depends on the data set. We show
that this dependence of the rate on ε is optimal up to
constant factors, that is, at least Ω(1/ε) rounds are
necessary to achieve within ε of the optimal exponential
loss. © 2013 Indraneel Mukherjee, Cynthia Rudin and Robert
E. Schapire.},
Key = {fds330659}
}
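%% Illustrative sketch (Python): textbook AdaBoost with threshold
%% stumps on a toy 1-D dataset, printing the exponential loss whose
%% convergence rate the paper analyzes. The dataset and stump pool
%% are invented; the update rule is the standard one.
import math

X = [0.1, 0.3, 0.4, 0.6, 0.8, 0.9]
y = [-1, -1, 1, -1, 1, 1]

def stump(theta, sign):
    return lambda x: sign * (1 if x > theta else -1)

stumps = [stump(t, s) for t in (0.2, 0.35, 0.5, 0.7, 0.85)
          for s in (1, -1)]

F = [0.0] * len(X)                    # combined scores so far
for t in range(10):
    w = [math.exp(-yi * fi) for yi, fi in zip(y, F)]
    def werr(h):                      # weighted error of a stump
        return sum(wi for wi, xi, yi in zip(w, X, y)
                   if h(xi) != yi) / sum(w)
    h = min(stumps, key=werr)
    eps = werr(h)
    if eps == 0 or eps >= 0.5:        # no useful weak hypothesis left
        break
    alpha = 0.5 * math.log((1 - eps) / eps)
    F = [fi + alpha * h(xi) for fi, xi in zip(F, X)]
    exp_loss = sum(math.exp(-yi * fi)
                   for yi, fi in zip(y, F)) / len(X)
    print(f"round {t}: exponential loss = {exp_loss:.4f}")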
@article{fds330660,
Author = {Tulabandhula, T and Rudin, C},
Title = {Machine learning with operational costs},
Journal = {Journal of Machine Learning Research},
Volume = {14},
Pages = {1989-2028},
Year = {2013},
Month = {June},
Abstract = {This work proposes a way to align statistical modeling with
decision making. We provide a method that propagates the
uncertainty in predictive modeling to the uncertainty in
operational cost, where operational cost is the amount spent
by the practitioner in solving the problem. The method
allows us to explore the range of operational costs
associated with the set of reasonable statistical models, so
as to provide a useful way for practitioners to understand
uncertainty. To do this, the operational cost is cast as a
regularization term in a learning algorithm's objective
function, allowing either an optimistic or pessimistic view
of possible costs, depending on the regularization
parameter. From another perspective, if we have prior
knowledge about the operational cost, for instance that it
should be low, this knowledge can help to restrict the
hypothesis space, and can help with generalization. We
provide a theoretical generalization bound for this
scenario. We also show that learning with operational costs
is related to robust optimization. © 2013 Theja
Tulabandhula and Cynthia Rudin.},
Key = {fds330660}
}
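%% Illustrative sketch (Python, toy version of the idea): the
%% operational cost of acting on a model enters the training
%% objective as a regularization term, loss + C * cost, so the sign
%% and size of C give the pessimistic or optimistic view. The data,
%% the one-parameter model, and the cost structure are all invented.
data = [(1.0, 1.2), (2.0, 1.9), (3.0, 3.2)]    # (x, y) pairs

def training_loss(w):
    return sum((y - w * x) ** 2 for x, y in data)

def operational_cost(w):
    # hypothetical cost of the policy implied by model w, e.g. the
    # spend triggered by its prediction at a fixed future point
    return 5.0 * max(0.0, w * 4.0 - 3.0)

def fit(C):
    grid = [i / 100 for i in range(201)]
    return min(grid, key=lambda w: training_loss(w)
               + C * operational_cost(w))

print(fit(0.0), fit(0.5))   # the cost term pulls the estimate down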
@article{fds330661,
Author = {Ertekin, S and Rudin, C and McCormick, TH},
Title = {Predicting power failures with reactive point
processes},
Journal = {AAAI Workshop - Technical Report},
Volume = {WS-13-17},
Pages = {23-25},
Year = {2013},
Month = {January},
ISBN = {9781577356288},
Key = {fds330661}
}
@article{fds330662,
Author = {Kim, B and Rudin, C},
Title = {Machine learning for meeting analysis},
Journal = {AAAI Workshop - Technical Report},
Volume = {WS-13-17},
Pages = {59-61},
Year = {2013},
Month = {January},
ISBN = {9781577356288},
Abstract = {Most people participate in meetings almost every day,
multiple times a day. The study of meetings is important,
but also challenging, as it requires an understanding of
social signals and complex interpersonal dynamics. Our aim
in this work is to use a data-driven approach to the science of
meetings. We provide tentative evidence that: i) there are
common macro-patterns in the way social dialogue acts are
interspersed throughout a meeting, and ii) it is often
possible to predict whether a proposal during a meeting will
be accepted or rejected based entirely on the language (the
set of persuasive words) used by the speaker. Copyright ©
2013, Association for the Advancement of Artificial
Intelligence. All rights reserved.},
Key = {fds330662}
}
@article{fds330663,
Author = {Letham, B and Rudin, C and McCormick, TH and Madigan,
D},
Title = {An interpretable stroke prediction model using rules and
Bayesian analysis},
Journal = {AAAI Workshop - Technical Report},
Volume = {WS-13-17},
Pages = {65-67},
Year = {2013},
Month = {January},
ISBN = {9781577356288},
Abstract = {We aim to produce predictive models that are not only
accurate, but are also interpretable to human experts. We
introduce a generative model called the Bayesian List
Machine for fitting decision lists, a type of interpretable
classifier, to data. We use the model to predict stroke in
atrial fibrillation patients, and produce predictive models
that are simple enough to be understood by patients yet
significantly outperform the medical scoring systems
currently in use.},
Key = {fds330663}
}
@article{fds330664,
Author = {Wang, T and Rudin, C and Wagner, D and Sevieri, R},
Title = {Detecting patterns of crime with Series Finder},
Journal = {AAAI Workshop - Technical Report},
Volume = {WS-13-17},
Pages = {140-142},
Year = {2013},
Month = {January},
ISBN = {9781577356288},
Abstract = {Many crimes can happen every day in a major city, and
figuring out which ones are committed by the same individual
or group is an important and difficult data mining
challenge. To do this, we propose a pattern detection
algorithm called Series Finder, which grows a pattern of
discovered crimes from within a database, starting from a
"seed" of a few crimes. Series Finder incorporates both the
common characteristics of all patterns and the unique
aspects of each specific pattern. We compared Series Finder
with classic clustering and classification models applied to
crime analysis. It has promising results on a decade's worth
of crime pattern data from the Cambridge Police Department.
Copyright © 2013, Association for the Advancement of
Artificial Intelligence. All rights reserved.},
Key = {fds330664}
}
@article{fds330665,
Author = {Ustun, B and Tracà, S and Rudin, C},
Title = {Supersparse linear integer models for predictive scoring
systems},
Journal = {AAAI Workshop - Technical Report},
Volume = {WS-13-17},
Pages = {128-130},
Year = {2013},
Month = {January},
ISBN = {9781577356288},
Key = {fds330665}
}
@article{fds330666,
Author = {Tulabandhula, T and Rudin, C},
Title = {The influence of operational cost on estimation},
Journal = {International Symposium on Artificial Intelligence and
Mathematics, ISAIM 2012},
Year = {2012},
Month = {December},
Abstract = {This work concerns the way that statistical models are used
to make decisions. In particular, we aim to merge the way
estimation algorithms are designed with how they are used
for a subsequent task. Our methodology considers the
operational cost of carrying out a policy, based on a
predictive model. The operational cost becomes a
regularization term in the learning algorithm's objective
function, allowing either an optimistic or pessimistic view
of possible costs. Limiting the operational cost reduces the
hypothesis space for the predictive model, and can thus
improve generalization. We show that different types of
operational problems can lead to the same type of
restriction on the hypothesis space, namely the restriction
to an intersection of an ℓq ball with a halfspace. We
bound the complexity of such hypothesis spaces by proposing
a technique that involves counting integer points in
polyhedrons.},
Key = {fds330666}
}
@article{fds330667,
Author = {Bertsimas, D and Chang, A and Rudin, C},
Title = {An integer optimization approach to associative
classification},
Journal = {Advances in Neural Information Processing
Systems},
Volume = {4},
Pages = {3302-3310},
Year = {2012},
Month = {December},
ISBN = {9781627480031},
Abstract = {We aim to design classifiers that have the interpretability
of association rules yet have predictive power on par with
the top machine learning algorithms for classification. We
propose a novel mixed integer optimization (MIO) approach
called Ordered Rules for Classification (ORC) for this task.
Our method has two parts. The first part mines a particular
frontier of solutions in the space of rules, and we show
that this frontier contains the best rules according to a
variety of interestingness measures. The second part learns
an optimal ranking for the rules to build a decision list
classifier that is simple and insightful. We report
empirical evidence using several different datasets to
demonstrate the performance of this method.},
Key = {fds330667}
}
@article{fds330668,
Author = {Ertekin, S and Hirsh, H and Rudin, C},
Title = {Selective sampling of labelers for approximating the
crowd},
Journal = {AAAI Fall Symposium - Technical Report},
Volume = {FS-12-06},
Pages = {7-13},
Year = {2012},
Month = {December},
Abstract = {In this paper, we present CrowdSense, an algorithm for
estimating the crowd's majority opinion by querying only a
subset of it. CrowdSense works in an online fashion where
examples come one at a time and it dynamically samples
subsets of labelers based on an exploration/exploitation
criterion. The algorithm produces a weighted combination of
a subset of the labelers' votes that approximates the
crowd's opinion. We also present two probabilistic variants
of CrowdSense that are based on different assumptions on the
joint probability distribution between the labelers' votes
and the majority vote. Our experiments demonstrate that we
can reliably approximate the entire crowd's vote by
collecting opinions from a representative subset of the
crowd. Copyright © 2012, Association for the Advancement of
Artificial Intelligence (www.aaai.org). All rights
reserved.},
Key = {fds330668}
}
@article{fds330669,
Author = {Chang, A and Rudin, C and Cavaretta, M and Thomas, R and Chou, G},
Title = {How to reverse-engineer quality rankings},
Journal = {Machine Learning},
Volume = {88},
Number = {3},
Pages = {369-398},
Publisher = {Springer Nature},
Year = {2012},
Month = {September},
url = {http://dx.doi.org/10.1007/s10994-012-5295-6},
Abstract = {A good or bad product quality rating can make or break an
organization. However, the notion of "quality" is often
defined by an independent rating company that does not make
the formula for determining the rank of a product publicly
available. In order to invest wisely in product development,
organizations are starting to use intelligent approaches for
determining how funding for product development should be
allocated. A critical step in this process is to
"reverse-engineer" a rating company's proprietary model as
closely as possible. In this work, we provide a machine
learning approach for this task, which optimizes a certain
rank statistic that encodes preference information specific
to quality rating data. We present experiments on data from
a major quality rating company, and provide new methods for
evaluating the solution. In addition, we provide an approach
to use the reverse-engineered model to achieve a top ranked
product in a cost-effective way. © The Author(s)
2012.},
Doi = {10.1007/s10994-012-5295-6},
Key = {fds330669}
}
@article{fds330670,
Author = {McCormick, TH and Rudin, C and Madigan, D},
Title = {Bayesian hierarchical rule modeling for predicting medical
conditions},
Journal = {Annals of Applied Statistics},
Volume = {6},
Number = {2},
Pages = {652-668},
Publisher = {Institute of Mathematical Statistics},
Year = {2012},
Month = {June},
url = {http://dx.doi.org/10.1214/11-AOAS522},
Abstract = {We propose a statistical modeling technique, called the
Hierarchical Association Rule Model (HARM), that predicts a
patient's possible future medical conditions given the
patient's current and past history of reported conditions.
The core of our technique is a Bayesian hierarchical model
for selecting predictive association rules (such as
"condition 1 and condition 2 → condition 3") from a large
set of candidate rules. Because this method "borrows
strength" using the conditions of many similar patients, it
is able to provide predictions specialized to any given
patient, even when little information about the patient's
history of conditions is available. © Institute of
Mathematical Statistics, 2012.},
Doi = {10.1214/11-AOAS522},
Key = {fds330670}
}
@article{fds330671,
Author = {Rudin, C and Waltz, D and Anderson, R and Boulanger, A and Salleb-Aouissi, A and Chow, M and Dutta, H and Gross, P and Huang, B and Ierome, S and Isaac, DF and Kressner, A and Passonneau, RJ and Radeva,
A and Wu, L},
Title = {Machine learning for the New York City power
grid},
Journal = {IEEE Transactions on Pattern Analysis and Machine
Intelligence},
Volume = {34},
Number = {2},
Pages = {328-345},
Publisher = {Institute of Electrical and Electronics Engineers
(IEEE)},
Year = {2012},
Month = {January},
url = {http://dx.doi.org/10.1109/TPAMI.2011.108},
Abstract = {Power companies can benefit from the use of knowledge
discovery methods and statistical machine learning for
preventive maintenance. We introduce a general process for
transforming historical electrical grid data into models
that aim to predict the risk of failures for components and
systems. These models can be used directly by power
companies to assist with prioritization of maintenance and
repair work. Specialized versions of this process are used
to produce 1) feeder failure rankings, 2) cable, joint,
terminator, and transformer rankings, 3) feeder Mean Time
Between Failure (MTBF) estimates, and 4) manhole events
vulnerability rankings. The process in its most general form
can handle diverse, noisy sources that are historical
(static), semi-real-time, or real-time, incorporates
state-of-the-art machine learning algorithms for
prioritization (supervised ranking or MTBF), and includes an
evaluation of results via cross-validation and blind test.
Above and beyond the ranked lists and MTBF estimates are
business management interfaces that allow the prediction
capability to be integrated directly into corporate planning
and decision support; such interfaces rely on several
important properties of our general modeling approach: that
machine learning features are meaningful to domain experts,
that the processing of data is transparent, and that
prediction results are accurate enough to support sound
decision making. We discuss the challenges in working with
historical electrical grid data that were not designed for
predictive purposes. The rawness of these data contrasts
with the accuracy of the statistical models that can be
obtained from the process; these models are sufficiently
accurate to assist in maintaining New York City's electrical
grid. © 2011 IEEE.},
Doi = {10.1109/TPAMI.2011.108},
Key = {fds330671}
}
@article{fds330672,
Author = {Tulabandhula, T and Rudin, C and Jaillet, P},
Title = {The machine learning and traveling repairman
problem},
Journal = {Lecture Notes in Computer Science (including subseries
Lecture Notes in Artificial Intelligence and Lecture Notes
in Bioinformatics)},
Volume = {6992 LNAI},
Pages = {262-276},
Publisher = {Springer Berlin Heidelberg},
Year = {2011},
Month = {October},
ISBN = {9783642248726},
url = {http://dx.doi.org/10.1007/978-3-642-24873-3_20},
Abstract = {The goal of the Machine Learning and Traveling Repairman
Problem (ML&TRP) is to determine a route for a "repair
crew," which repairs nodes on a graph. The repair crew aims
to minimize the cost of failures at the nodes, but the
failure probabilities are not known and must be estimated.
If there is uncertainty in the failure probability
estimates, we take this uncertainty into account in an
unusual way; from the set of acceptable models, we choose
the model that has the lowest cost of applying it to the
subsequent routing task. In a sense, this procedure agrees
with a managerial goal, which is to show that the data can
support choosing a low-cost solution. © 2011
Springer-Verlag.},
Doi = {10.1007/978-3-642-24873-3_20},
Key = {fds330672}
}
@article{fds330673,
Author = {Ertekin, S and Rudin, C},
Title = {On equivalence relationships between classification and
ranking algorithms},
Journal = {Journal of Machine Learning Research},
Volume = {12},
Pages = {2905-2929},
Year = {2011},
Month = {October},
Abstract = {We demonstrate that there are machine learning algorithms
that can achieve success for two separate tasks
simultaneously, namely the tasks of classification and
bipartite ranking. This means that advantages gained from
solving one task can be carried over to the other task, such
as the ability to obtain conditional density estimates, and
an order-of-magnitude reduction in computational time for
training the algorithm. It also means that some algorithms
are robust to the choice of evaluation metric used; they can
theoretically perform well when performance is measured
either by a misclassification error or by a statistic of the
ROC curve (such as the area under the curve). Specifically,
we provide such an equivalence relationship between a
generalization of Freund et al.'s RankBoost algorithm,
called the "P-Norm Push," and a particular cost-sensitive
classification algorithm that generalizes AdaBoost, which we
call "P-Classification." We discuss and validate the
potential benefits of this equivalence relationship, and
perform controlled experiments to understand
P-Classification's empirical performance. There is no
established equivalence relationship for logistic regression
and its ranking counterpart, so we introduce a
logistic-regression-style algorithm that aims in between
classification and ranking, and has promising experimental
performance with respect to both tasks. © 2011 Şeyda
Ertekin and Cynthia Rudin.},
Key = {fds330673}
}
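%% Illustrative sketch (Python): the two performance measures the
%% equivalence result concerns. One scoring function can be read as
%% a classifier (threshold at zero, misclassification error) or as a
%% ranker (AUC, the fraction of correctly ordered positive-negative
%% pairs). Scores and labels below are invented.
scores = [2.1, 0.1, 0.3, 1.2, -1.5, -0.2]
labels = [1, 1, -1, 1, -1, -1]

# classification view: sign of the score vs. the label
mis = sum((s >= 0) != (yi == 1)
          for s, yi in zip(scores, labels)) / len(labels)

# ranking view: pairwise ordering of positives above negatives
pos = [s for s, yi in zip(scores, labels) if yi == 1]
neg = [s for s, yi in zip(scores, labels) if yi == -1]
auc = sum(p > n for p in pos for n in neg) / (len(pos) * len(neg))

print(f"misclassification error = {mis:.3f}, AUC = {auc:.3f}")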
@article{fds330674,
Author = {Wu, L and Kaiser, G and Rudin, C and Anderson, R},
Title = {Data quality assurance and performance measurement of data
mining for preventive maintenance of power
grid},
Journal = {Proceedings of the 1st International Workshop on Data Mining
for Service and Maintenance, KDD4Service 2011 - Held in
Conjunction with SIGKDD'11},
Pages = {28-32},
Publisher = {ACM Press},
Year = {2011},
Month = {September},
ISBN = {9781450308427},
url = {http://dx.doi.org/10.1145/2018673.2018679},
Abstract = {Ensuring reliability as the electrical grid morphs into the
"smart grid" will require innovations in how we assess the
state of the grid, for the purpose of proactive maintenance,
rather than reactive maintenance; in the future, we will not
only react to failures, but also try to anticipate and avoid
them using predictive modeling (machine learning and data
mining) techniques. To help in meeting this challenge, we
present the Neutral Online Visualization-aided Autonomic
evaluation framework (NOVA) for evaluating machine learning
and data mining algorithms for preventive maintenance on the
electrical grid. NOVA has three stages provided through a
unified user interface: evaluation of input data quality,
evaluation of machine learning and data mining results, and
evaluation of the reliability improvement of the power grid.
A prototype version of NOVA has been deployed for the power
grid in New York City, and it is able to evaluate machine
learning and data mining systems effectively and
efficiently. Copyright 2011 ACM.},
Doi = {10.1145/2018673.2018679},
Key = {fds330674}
}
@article{fds330675,
Author = {Wu, L and Teravainen, T and Kaiser, G and Anderson, R and Boulanger, A and Rudin, C},
Title = {Estimation of system reliability using a semiparametric
model},
Journal = {IEEE 2011 EnergyTech, ENERGYTECH 2011},
Publisher = {IEEE},
Year = {2011},
Month = {August},
ISBN = {9781457707773},
url = {http://dx.doi.org/10.1109/EnergyTech.2011.5948537},
Abstract = {An important problem in reliability engineering is to
predict the failure rate, that is, the frequency with which
an engineered system or component fails. This paper presents
a new method of estimating failure rate using a
semiparametric model with Gaussian process smoothing. The
method is able to provide accurate estimation based on
historical data and it does not make strong a priori
assumptions of failure rate pattern (e.g., constant or
monotonic). Our experiments of applying this method in power
system failure data compared with other models show its
efficacy and accuracy. This method can be used in estimating
reliability for many other systems, such as software systems
or components. © 2011 IEEE.},
Doi = {10.1109/EnergyTech.2011.5948537},
Key = {fds330675}
}
@article{fds330676,
Author = {Rudin, C and Letham, B and Kogan, E and Madigan, D},
Title = {A Learning Theory Framework for Association Rules and
Sequential Events},
Year = {2011},
Month = {June},
Key = {fds330676}
}
@article{fds330677,
Author = {Rudin, C and Passonneau, RJ and Radeva, A and Ierome, S and Isaac,
DF},
Title = {21st-century data miners meet 19th-century electrical
cables},
Journal = {Computer},
Volume = {44},
Number = {6},
Pages = {103-105},
Publisher = {Institute of Electrical and Electronics Engineers
(IEEE)},
Year = {2011},
Month = {June},
url = {http://dx.doi.org/10.1109/MC.2011.164},
Abstract = {Researchers can repurpose even extremely raw historical data
for use in prediction. © 2006 IEEE.},
Doi = {10.1109/MC.2011.164},
Key = {fds330677}
}
@article{fds330678,
Author = {McCormick, T and Rudin, C and Madigan, D},
Title = {A Hierarchical Model for Association Rule Mining of
Sequential Events: An Approach to Automated Medical Symptom
Prediction},
Year = {2011},
Month = {January},
Key = {fds330678}
}
@article{fds330679,
Author = {Rudin, C and Letham, B and Salleb-Aouissi, A and Kogan, E and Madigan,
D},
Title = {Sequential event prediction with association
rules},
Journal = {Journal of Machine Learning Research},
Volume = {19},
Pages = {615-634},
Year = {2011},
Month = {January},
Abstract = {We consider a supervised learning problem in which data are
revealed sequentially and the goal is to determine what will
next be revealed. In the context of this problem, algorithms
based on association rules have a distinct advantage over
classical statistical and machine learning methods; however,
there has not previously been a theoretical foundation
established for using association rules in supervised
learning. We present two simple algorithms that incorporate
association rules, and provide generalization guarantees on
these algorithms based on algorithmic stability analysis
from statistical learning theory. We include a discussion of
the strict minimum support threshold often used in
association rule mining, and introduce an "adjusted
confidence" measure that provides a weaker minimum support
condition that has advantages over the strict minimum
support. The paper brings together ideas from statistical
learning theory, association rule mining and Bayesian
analysis. © 2011 C. Rudin, B. Letham, A. Salleb-Aouissi, E.
Kogan & D. Madigan.},
Key = {fds330679}
}
@article{fds330680,
Author = {Mukherjee, I and Rudin, C and Schapire, RE},
Title = {The rate of convergence of AdaBoost},
Journal = {Journal of Machine Learning Research},
Volume = {19},
Pages = {537-557},
Year = {2011},
Month = {January},
Abstract = {The AdaBoost algorithm of Freund and Schapire (1997) was
designed to combine many "weak" hypotheses that perform
slightly better than a random guess into a "strong"
hypothesis that has very low error. We study the rate at
which AdaBoost iteratively converges to the minimum of the
"exponential loss." Our
proofs do not require a weak-learning assumption, nor do
they require that minimizers of the exponential loss are
finite. Specifically, our first result shows that at
iteration t, the exponential loss of AdaBoost 's computed
parameter vector will be at most ε more than that of any
parameter vector of ℓ1-norm bounded by B in a number of
rounds that is bounded by a polynomial in B and 1/ε. We
also provide rate lower bound examples showing that a polynomial
dependence on these parameters is necessary. Our second
result is that within C/ε iterations, AdaBoost achieves a
value of the exponential loss that is at most ε more than
the best possible value, where C depends on the dataset. We
show that this dependence of the rate on ε is optimal up to
constant factors, i.e., at least Ω(1/ε) rounds are
necessary to achieve within ε of the optimal exponential
loss. © 2011 I. Mukherjee, C. Rudin & R.E.
Schapire.},
Key = {fds330680}
}
@article{fds330681,
Author = {Rudin, C and Passonneau, RJ and Radeva, A and Dutta, H and Ierome, S and Isaac, D},
Title = {A process for predicting manhole events in
Manhattan},
Journal = {Machine Learning},
Volume = {80},
Number = {1},
Pages = {1-31},
Publisher = {Springer Nature},
Year = {2010},
Month = {July},
url = {http://dx.doi.org/10.1007/s10994-009-5166-y},
Abstract = {We present a knowledge discovery and data mining process
developed as part of the Columbia/Con Edison project on
manhole event prediction. This process can assist with
real-world prioritization problems that involve raw data in
the form of noisy documents requiring significant amounts of
pre-processing. The documents are linked to a set of
instances to be ranked according to prediction criteria. In
the case of manhole event prediction, which is a new
application for machine learning, the goal is to rank the
electrical grid structures in Manhattan (manholes and
service boxes) according to their vulnerability to serious
manhole events such as fires, explosions and smoking
manholes. Our ranking results are currently being used to
help prioritize repair work on the Manhattan electrical
grid. © 2010 The Author(s).},
Doi = {10.1007/s10994-009-5166-y},
Key = {fds330681}
}
@article{fds330682,
Author = {Pelossof, R and Jones, M and Vovsha, I and Rudin,
C},
Title = {Online coordinate boosting},
Journal = {2009 IEEE 12th International Conference on Computer Vision
Workshops, ICCV Workshops 2009},
Pages = {1354-1361},
Year = {2009},
Month = {December},
ISBN = {9781424444427},
url = {http://dx.doi.org/10.1109/ICCVW.2009.5457454},
Abstract = {We present a new online boosting algorithm for updating the
weights of a boosted classifier, which yields a closer
approximation to the edges found by Freund and Schapire's
AdaBoost algorithm than previous online boosting algorithms.
We contribute a new way of deriving the online algorithm
that ties together previous online boosting work. The online
algorithm is derived by minimizing AdaBoost's loss as a
single example is added to the training set. The equations
show that the optimization is computationally expensive.
However, a fast online approximation is possible. We compare
approximation error to edges found by batch AdaBoost on
synthetic datasets and generalization error on face datasets
and the MNIST dataset. ©2009 IEEE.},
Doi = {10.1109/ICCVW.2009.5457454},
Key = {fds330682}
}
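%% A small sketch of the batch quantity the online method
%% approximates -- a weak classifier's "edge" under AdaBoost's
%% exponential-loss example weights -- and the full recomputation
%% a new example triggers. Illustrative only; the paper's actual
%% online update is derived by minimizing AdaBoost's loss as
%% each example is added, which this does not reproduce.
import numpy as np

def edge(h, y, d):
    """Weighted correlation of weak-classifier outputs with labels."""
    return float(np.sum(d * y * h))

y = np.array([+1, -1, +1, +1])        # labels seen so far
h = np.array([+1, +1, +1, -1])        # weak classifier outputs
f = np.array([0.8, -0.3, 0.1, 0.5])   # combined score per example
d = np.exp(-y * f); d /= d.sum()      # AdaBoost's example weights
print("edge before:", edge(h, y, d))

# One new example arrives; recomputing all weights from scratch is
# the expensive batch step an online approximation tries to avoid.
y = np.append(y, -1); h = np.append(h, -1); f = np.append(f, 0.0)
d = np.exp(-y * f); d /= d.sum()
print("edge after :", edge(h, y, d))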
@article{fds330683,
Author = {Radeva, A and Rudin, C and Passonneau, R and Isaac,
D},
Title = {Report cards for manholes: Eliciting expert feedback for a
learning task},
Journal = {8th International Conference on Machine Learning and
Applications, ICMLA 2009},
Pages = {719-724},
Publisher = {IEEE},
Year = {2009},
Month = {December},
ISBN = {9780769539263},
url = {http://dx.doi.org/10.1109/ICMLA.2009.72},
Abstract = {We present a manhole profiling tool, developed as part of
the Columbia/Con Edison machine learning project on manhole
event prediction, and discuss its role in evaluating our
machine learning model in three important ways: elimination
of outliers, elimination of falsely predictive features, and
assessment of the quality of the model. The model produces a
ranked list of tens of thousands of manholes in Manhattan,
where the ranking criterion is vulnerability to serious
events such as fires, explosions and smoking manholes. Con
Edison set two goals for the model, namely accuracy and
intuitiveness, and this tool made it possible for us to
address both of these goals. The tool automatically
assembles a "report card" or "profile" highlighting data
associated with a given manhole. Prior to the processing
work that underlies the profiling tool, case studies of a
single manhole took several days and resulted in an
incomplete study; locating manholes such as those we present
in this work would have been extremely difficult. The model
is currently assisting Con Edison in determining repair
priorities for the secondary electrical grid. © 2009
IEEE.},
Doi = {10.1109/ICMLA.2009.72},
Key = {fds330683}
}
@article{fds330684,
Author = {Rudin, C and Schapire, RE},
Title = {Margin-based ranking and an equivalence between AdaBoost and
RankBoost},
Journal = {Journal of Machine Learning Research},
Volume = {10},
Pages = {2193-2232},
Year = {2009},
Month = {November},
Abstract = {We study boosting algorithms for learning to rank. We give a
general margin-based bound for ranking based on covering
numbers for the hypothesis space. Our bound suggests that
algorithms that maximize the ranking margin will generalize
well. We then describe a new algorithm, smooth margin
ranking, that precisely converges to a maximum
ranking-margin solution. The algorithm is a modification of
RankBoost, analogous to "approximate coordinate ascent
boosting." Finally, we prove that AdaBoost and RankBoost are
equally good for the problems of bipartite ranking and
classification in terms of their asymptotic behavior on the
training set. Under natural conditions, AdaBoost achieves an
area under the ROC curve that is equally as good as
RankBoost's; furthermore, RankBoost, when given a specific
intercept, achieves a misclassification error that is as
good as AdaBoost's. This may help to explain the empirical
observations made by Cortes and Mohri, and Caruana and
Niculescu-Mizil, about the excellent performance of AdaBoost
as a bipartite ranking algorithm, as measured by the area
under the ROC curve. © 2009 Cynthia Rudin and Robert E.
Schapire.},
Key = {fds330684}
}
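%% The AUC in the abstract above is the pairwise ranking
%% statistic sketched below: the fraction of (positive, negative)
%% pairs a scoring function orders correctly, ties counted as
%% half. A minimal sketch, not tied to either boosting algorithm.
import numpy as np

def auc(scores, y):
    """Area under the ROC curve as a pairwise ordering statistic."""
    diffs = scores[y == +1][:, None] - scores[y == -1][None, :]
    return float(np.mean(diffs > 0) + 0.5 * np.mean(diffs == 0))

scores = np.array([2.3, 0.1, 1.7, -0.4, 0.9])
labels = np.array([+1, -1, +1, -1, -1])
print(auc(scores, labels))  # 1.0: every positive outscores every negative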
@article{fds330685,
Author = {Rudin, C},
Title = {The P-norm push: A simple convex ranking algorithm that
concentrates at the top of the list},
Journal = {Journal of Machine Learning Research},
Volume = {10},
Pages = {2233-2271},
Year = {2009},
Month = {November},
Abstract = {We are interested in supervised ranking algorithms that
perform especially well near the top of the ranked list, and
are only required to perform sufficiently well on the rest
of the list. In this work, we provide a general form of
convex objective that gives high-scoring examples more
importance. This "push" near the top of the list can be
chosen arbitrarily large or small, based on the preference
of the user. We choose ℓp-norms to provide a specific type
of push; if the user sets p larger, the objective
concentrates harder on the top of the list. We derive a
generalization bound based on the p-norm objective, working
around the natural asymmetry of the problem. We then derive
a boosting-style algorithm for the problem of ranking with a
push at the top. The usefulness of the algorithm is
illustrated through experiments on repository data. We prove
that the minimizer of the algorithm's objective is unique in
a specific sense. Furthermore, we illustrate how our
objective is related to quality measurements for information
retrieval. © 2009 Cynthia Rudin.},
Key = {fds330685}
}
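%% A sketch of a p-norm-push-style objective, assuming the form
%% suggested by the abstract: each negative example contributes
%% the p-th power of its total pairwise exponential loss against
%% the positives, so larger p penalizes high-scoring negatives --
%% the "push" at the top of the list. The paper's exact objective
%% may differ in details.
import numpy as np

def pnorm_push(pos_scores, neg_scores, p=4.0):
    # Pairwise losses exp(-(f(x_i) - f(x_k))), positive i vs. negative k.
    pairwise = np.exp(-(pos_scores[:, None] - neg_scores[None, :]))
    return float(np.sum(pairwise.sum(axis=0) ** p))

pos = np.array([1.5, 1.0, 0.8])
neg_low = np.array([-1.0, -0.9, 0.2])   # negatives safely down the list
neg_top = np.array([-1.0, -0.9, 1.2])   # one negative near the very top
print(pnorm_push(pos, neg_low, p=4))
print(pnorm_push(pos, neg_top, p=4))    # much larger: the top dominates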
@article{fds330686,
Author = {Passonneau, RJ and Rudin, C and Radeva, A and Liu,
ZA},
Title = {Reducing noise in labels and features for a real world
dataset: Application of NLP corpus annotation
methods},
Journal = {Lecture Notes in Computer Science (including subseries
Lecture Notes in Artificial Intelligence and Lecture Notes
in Bioinformatics)},
Volume = {5449 LNCS},
Pages = {86-97},
Publisher = {Springer Berlin Heidelberg},
Year = {2009},
Month = {July},
ISBN = {9783642003813},
url = {http://dx.doi.org/10.1007/978-3-642-00382-0_7},
Abstract = {This paper illustrates how a combination of information
extraction, machine learning, and NLP corpus annotation
practice was applied to a problem of ranking vulnerability
of structures (service boxes, manholes) in the Manhattan
electrical grid. By adapting NLP corpus annotation methods
to the task of knowledge transfer from domain experts, we
compensated for the lack of operational definitions of
components of the model, such as serious event. The machine
learning depended on the ticket classes, but it was not the
end goal. Rather, our rule-based document classification
determines both the labels of examples and their feature
representations. Changes in our classification of events led
to improvements in our model, as reflected in the AUC scores
for the full ranked list of over 51K structures. The
improvements for the very top of the ranked list, which is
of most importance for prioritizing work on the electrical
grid, affected one in every four or five structures. ©
Springer-Verlag Berlin Heidelberg 2009.},
Doi = {10.1007/978-3-642-00382-0_7},
Key = {fds330686}
}
@article{fds330687,
Author = {Roth, R and Rambow, O and Habash, N and Diab, M and Rudin,
C},
Title = {Arabic morphological tagging, diacritization, and
lemmatization using lexeme models and feature
ranking},
Journal = {ACL-08: HLT - 46th Annual Meeting of the Association for
Computational Linguistics: Human Language Technologies,
Proceedings of the Conference},
Pages = {117-120},
Year = {2008},
Month = {January},
ISBN = {9781932432046},
url = {http://dx.doi.org/10.3115/1557690.1557721},
Abstract = {We investigate the tasks of general morphological tagging,
diacritization, and lemmatization for Arabic. We show that
for all tasks we consider, both modeling the lexeme
explicitly, and retuning the weights of individual
classifiers for the specific task, improve the performance.
© 2008 Association for Computational Linguistics.},
Doi = {10.3115/1557690.1557721},
Key = {fds330687}
}
@article{fds330688,
Author = {Rudin, C and Schapire, RE and Daubechies, I},
Title = {Analysis of boosting algorithms using the smooth margin
function},
Journal = {Annals of Statistics},
Volume = {35},
Number = {6},
Pages = {2723-2768},
Publisher = {Institute of Mathematical Statistics},
Year = {2007},
Month = {December},
url = {http://dx.doi.org/10.1214/009053607000000785},
Abstract = {We introduce a useful tool for analyzing boosting algorithms
called the "smooth margin function," a differentiable
approximation of the usual margin for boosting algorithms.
We present two boosting algorithms based on this smooth
margin, "coordinate ascent boosting" and "approximate
coordinate ascent boosting," which are similar to Freund and
Schapire's AdaBoost algorithm and Breiman's arc-gv
algorithm. We give convergence rates to the maximum margin
solution for both of our algorithms and for arc-gv. We then
study AdaBoost's convergence properties using the smooth
margin function. We precisely bound the margin attained by
AdaBoost when the edges of the weak classifiers fall within
a specified range. This shows that a previous bound proved
by Rätsch and Warmuth is exactly tight. Furthermore, we use
the smooth margin to capture explicit properties of AdaBoost
in cases where cyclic behavior occurs. © Institute of
Mathematical Statistics, 2007.},
Doi = {10.1214/009053607000000785},
Key = {fds330688}
}
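%% The smooth margin function, in notation commonly used in this
%% line of work (assumed here, not quoted from the paper):
%% M_{ij} = y_i h_j(x_i) is the margin matrix and lambda >= 0 the
%% coefficient vector over the weak classifiers.
\[
  G(\lambda) \;=\; \frac{-\ln F(\lambda)}{\sum_{j} \lambda_j},
  \qquad
  F(\lambda) \;=\; \sum_{i=1}^{m} e^{-(M\lambda)_i}.
\]
%% Since F(\lambda) >= exp(-min_i (M\lambda)_i), G is a smooth
%% lower bound on the usual margin min_i (M\lambda)_i / \|\lambda\|_1
%% and approaches it as the exponential loss F(\lambda) tends to 0.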
@article{fds330689,
Author = {Rudin, C},
Title = {Ranking with a P-norm push},
Journal = {Lecture Notes in Computer Science (including subseries
Lecture Notes in Artificial Intelligence and Lecture Notes
in Bioinformatics)},
Volume = {4005 LNAI},
Pages = {589-604},
Year = {2006},
Month = {January},
ISBN = {9783540352945},
url = {http://dx.doi.org/10.1007/11776420_43},
Abstract = {We are interested in supervised ranking with the following
twist: our goal is to design algorithms that perform
especially well near the top of the ranked list, and are
only required to perform sufficiently well on the rest of
the list. Towards this goal, we provide a general form of
convex objective that gives high-scoring examples more
importance. This "push" near the top of the list can be
chosen to be arbitrarily large or small. We choose
ℓp-norms to provide a specific type of push; as p becomes
large, the algorithm concentrates harder near the top of the
list. We derive a generalization bound based on the p-norm
objective. We then derive a corresponding boosting-style
algorithm, and illustrate the usefulness of the algorithm
through experiments on UCI data. We prove that the minimizer
of the objective is unique in a specific sense. ©
Springer-Verlag Berlin Heidelberg 2006.},
Doi = {10.1007/11776420_43},
Key = {fds330689}
}
@article{fds376879,
Author = {Ji, H and Rudin, C and Grishman, R},
Title = {Re-Ranking Algorithms for Name Tagging},
Journal = {HLT-NAACL 2006 - Computationally Hard Problems and Joint
Inference in Speech and Language Processing, Proceedings of
the Workshop},
Pages = {49-56},
Year = {2006},
Month = {January},
Abstract = {Integrating information from different stages of an NLP
processing pipeline can yield significant error reduction.
We demonstrate how re-ranking can improve name tagging in a
Chinese information extraction system by incorporating
information from relation extraction, event extraction, and
coreference. We evaluate three state-of-the-art re-ranking
algorithms (MaxEnt-Rank, SVMRank, and p-Norm Push Ranking),
and show the benefit of multi-stage re-ranking for
cross-sentence and cross-document inference.},
Key = {fds376879}
}
@article{fds330690,
Author = {Rudin, C and Cortes, C and Mohri, M and Schapire,
RE},
Title = {Margin-based ranking meets boosting in the
middle},
Journal = {Lecture Notes in Computer Science (including subseries
Lecture Notes in Artificial Intelligence and Lecture Notes
in Bioinformatics)},
Volume = {3559 LNAI},
Pages = {63-78},
Publisher = {Springer Berlin Heidelberg},
Year = {2005},
Month = {January},
ISBN = {9783540265566},
url = {http://dx.doi.org/10.1007/11503415_5},
Abstract = {We present several results related to ranking. We give a
general margin-based bound for ranking based on the L∞
covering number of the hypothesis space. Our bound suggests
that algorithms that maximize the ranking margin generalize
well. We then describe a new algorithm, Smooth Margin
Ranking, that precisely converges to a maximum
ranking-margin solution. The algorithm is a modification of
RankBoost, analogous to Approximate Coordinate Ascent
Boosting. We also prove a remarkable property of AdaBoost:
under very natural conditions, AdaBoost maximizes the
exponentiated loss associated with the AUC and achieves the
same AUC as RankBoost. This explains the empirical
observations made by Cortes and Mohri, and Caruana and
Niculescu-Mizil, about the excellent performance of AdaBoost
as a ranking algorithm, as measured by the AUC. ©
Springer-Verlag Berlin Heidelberg 2005.},
Doi = {10.1007/11503415_5},
Key = {fds330690}
}
@article{fds330692,
Author = {Rudin, C and Daubechies, I and Schapire, RE},
Title = {The dynamics of AdaBoost: Cyclic behavior and convergence of
margins},
Journal = {Journal of Machine Learning Research},
Volume = {5},
Pages = {1557-1595},
Year = {2004},
Month = {December},
Abstract = {In order to study the convergence properties of the AdaBoost
algorithm, we reduce AdaBoost to a nonlinear iterated map
and study the evolution of its weight vectors. This
dynamical systems approach allows us to understand
AdaBoost's convergence properties completely in certain
cases; for these cases we find stable cycles, allowing us to
explicitly solve for AdaBoost's output. Using this unusual
technique, we are able to show that AdaBoost does not always
converge to a maximum margin combined classifier, answering
an open question. In addition, we show that "nonoptimal"
AdaBoost (where the weak learning algorithm does not
necessarily choose the best weak classifier at each
iteration) may fail to converge to a maximum margin
classifier, even if "optimal" AdaBoost produces a maximum
margin. Also, we show that if AdaBoost cycles, it cycles
among "support vectors", i.e., examples that achieve the
same smallest margin.},
Key = {fds330692}
}
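%% A toy simulation in the spirit of the paper's reduction of
%% AdaBoost to an iterated map on its weight vector (the reduced
%% map there is derived analytically; this simply iterates the
%% standard update). Three examples and three weak classifiers,
%% with classifier j misclassifying only example j -- a symmetric
%% setting in which cycling can be observed.
import numpy as np

M = np.array([[-1,  1,  1],
              [ 1, -1,  1],
              [ 1,  1, -1]], dtype=float)  # M[i, j] = y_i * h_j(x_i)

d = np.array([0.5, 0.3, 0.2])              # initial (non-uniform) weights
for t in range(12):
    r = d.dot(M)                           # edges of the three classifiers
    j = int(np.argmax(r))                  # "optimal" AdaBoost: best edge
    alpha = 0.5 * np.log((1 + r[j]) / (1 - r[j]))
    d = d * np.exp(-alpha * M[:, j])
    d /= d.sum()
    print(f"t={t+1}: chose h_{j+1}, weights = {np.round(d, 4)}")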
@article{fds330691,
Author = {Rudin, C and Schapire, RE and Daubechies, I},
Title = {Boosting based on a smooth margin},
Journal = {Lecture Notes in Artificial Intelligence (Subseries of
Lecture Notes in Computer Science)},
Volume = {3120},
Pages = {502-517},
Publisher = {Springer Berlin Heidelberg},
Year = {2004},
Month = {January},
url = {http://dx.doi.org/10.1007/978-3-540-27819-1_35},
Abstract = {We study two boosting algorithms, Coordinate Ascent Boosting
and Approximate Coordinate Ascent Boosting, which are
explicitly designed to produce maximum margins. To derive
these algorithms, we introduce a smooth approximation of the
margin that one can maximize in order to produce a maximum
margin classifier. Our first algorithm is simply coordinate
ascent on this function, involving a line search at each
step. We then make a simple approximation of this line
search to reveal our second algorithm. These algorithms are
proven to asymptotically achieve maximum margins, and we
provide two convergence rate calculations. The second
calculation yields a faster rate of convergence than the
first, although the first gives a more explicit (still fast)
rate. These algorithms are very similar to AdaBoost in that
they are based on coordinate ascent, easy to implement, and
empirically tend to converge faster than other boosting
algorithms. Finally, we attempt to understand AdaBoost in
terms of our smooth margin, focusing on cases where AdaBoost
exhibits cyclic behavior.},
Doi = {10.1007/978-3-540-27819-1_35},
Key = {fds330691}
}
@article{fds330693,
Author = {Rudin, C and Daubechies, I and Schapire, RE},
Title = {On the dynamics of boosting},
Journal = {Advances in Neural Information Processing
Systems},
Pages = {1101-1108},
Publisher = {M I T PRESS},
Editor = {Thrun, S and Saul, LK and Schölkopf, B},
Year = {2004},
Month = {January},
ISBN = {9780262201520},
Abstract = {In order to understand AdaBoost's dynamics, especially its
ability to maximize margins, we derive an associated
simplified nonlinear iterated map and analyze its behavior
in low-dimensional cases. We find stable cycles for these
cases, which can explicitly be used to solve for AdaBoost's
output. By considering AdaBoost as a dynamical
system, we are able to prove Rätsch and Warmuth's
conjecture that AdaBoost may fail to converge to a
maximal-margin combined classifier when given a 'nonoptimal'
weak learning algorithm. AdaBoost is known to be a
coordinate descent method, but other known algorithms that
explicitly aim to maximize the margin (such as AdaBoost* and
arc-gv) are not. We consider a differentiable function for
which coordinate ascent will yield a maximum margin
solution. We then make a simple approximation to derive a
new boosting algorithm whose updates are slightly more
aggressive than those of arc-gv.},
Key = {fds330693}
}