%% Books
@book{fds330642,
Author = {Rudin, C},
Title = {Turning prediction tools into decision tools},
Volume = {9356},
Year = {2015},
Month = {January},
ISBN = {9783319242811},
Key = {fds330642}
}
%% Papers Published
@article{fds374391,
Author = {Falcinelli, SD and Cooper-Volkheimer, AD and Semenova, L and Wu, E and Richardson, A and Ashokkumar, M and Margolis, DM and Archin, NM and Rudin, CD and Murdoch, D and Browne, EP},
Title = {Impact of Cannabis Use on Immune Cell Populations and the
Viral Reservoir in People With HIV on Suppressive
Antiretroviral Therapy.},
Journal = {J Infect Dis},
Volume = {228},
Number = {11},
Pages = {1600-1609},
Year = {2023},
Month = {November},
url = {http://dx.doi.org/10.1093/infdis/jiad364},
Abstract = {BACKGROUND: Human immunodeficiency virus (HIV) infection
remains incurable due to the persistence of a viral
reservoir despite antiretroviral therapy (ART). Cannabis
(CB) use is prevalent amongst people with HIV (PWH), but the
impact of CB on the latent HIV reservoir has not been
investigated. METHODS: Peripheral blood cells from a cohort
of PWH who use CB and a matched cohort of PWH who do not use
CB on ART were evaluated for expression of
maturation/activation markers, HIV-specific T-cell
responses, and intact proviral DNA. RESULTS: CB use was
associated with increased abundance of naive T cells,
reduced effector T cells, and reduced expression of
activation markers. CB use was also associated with reduced
levels of exhausted and senescent T cells compared to
nonusing controls. HIV-specific T-cell responses were
unaffected by CB use. CB use was not associated with intact
or total HIV DNA frequency in CD4 T cells. CONCLUSIONS: This
analysis is consistent with the hypothesis that CB use
reduces activation, exhaustion, and senescence in the T
cells of PWH, and does not impair HIV-specific CD8 T-cell
responses. Longitudinal and interventional studies with
evaluation of CB exposure are needed to fully evaluate the
impact of CB use on the HIV reservoir.},
Doi = {10.1093/infdis/jiad364},
Key = {fds374391}
}
@article{fds373363,
Author = {Garrett, BL and Rudin, C},
Title = {Interpretable algorithmic forensics.},
Journal = {Proceedings of the National Academy of Sciences of the
United States of America},
Volume = {120},
Number = {41},
Pages = {e2301842120},
Year = {2023},
Month = {October},
url = {http://dx.doi.org/10.1073/pnas.2301842120},
Abstract = {One of the most troubling trends in criminal investigations
   is the growing use of "black box" technology, in which law
   enforcement rely on artificial intelligence (AI) models or
   algorithms that are either too complex for people to
   understand or that simply conceal how they function. In
   criminal cases, black box systems have proliferated in
   forensic areas such as DNA mixture interpretation, facial
   recognition, and recidivism risk assessments. The champions
   and critics of AI argue, mistakenly, that we face a
   catch-22: while black box AI is not understandable by
   people, they assume that it produces more accurate forensic
   evidence. In
this Article, we question this assertion, which has so
powerfully affected judges, policymakers, and academics. We
describe a mature body of computer science research showing
how "glass box" AI-designed to be interpretable-can be more
accurate than black box alternatives. Indeed, black box AI
performs predictably \emph{worse} in settings like the
criminal system. Debunking the black box performance myth
has implications for forensic evidence, constitutional
criminal procedure rights, and legislative policy. Absent
some compelling-or even credible-government interest in
keeping AI as a black box, and given the constitutional
rights and public safety interests at stake, we argue that a
substantial burden rests on the government to justify black
box AI in criminal cases. We conclude by calling for
judicial rulings and legislation to safeguard a right to
interpretable forensic AI.},
Doi = {10.1073/pnas.2301842120},
Key = {fds373363}
}
@article{fds372809,
Author = {Hahn, S and Zhu, R and Mak, S and Rudin, C and Jiang,
Y},
Title = {An Interpretable, Flexible, and Interactive Probabilistic
Framework for Melody Generation},
Journal = {Proceedings of the ACM SIGKDD International Conference on
Knowledge Discovery and Data Mining},
Pages = {4089-4099},
Year = {2023},
Month = {August},
ISBN = {9798400701030},
url = {http://dx.doi.org/10.1145/3580305.3599772},
Abstract = {Demand for algorithmic music generation is growing rapidly
   throughout entertainment, art, and education.
Unfortunately, most recent models are practically impossible
to interpret or musically fine-tune, as they use deep neural
networks with thousands of parameters. We introduce an
interpretable, flexible, and interactive model,
SchenkComposer, for melody generation that empowers users to
be creative in all aspects of the music generation pipeline
and allows them to learn from the process. We divide the
task of melody generation into steps based on the process
that a human composer using music-theoretical domain
knowledge might use. First, the model determines phrase
structure based on form analysis and identifies an
appropriate number of measures. Using concepts from
Schenkerian analysis, the model then finds a fitting
harmonic rhythm, middleground harmonic progression,
foreground rhythm, and melody in a hierarchical, scaffolded
approach using a probabilistic context-free grammar based on
musical contours. By incorporating theories of musical form
and harmonic structure, our model produces music with
long-term structural coherence. In extensive human
   experiments, we find that music generated with our approach
   successfully passes a Turing test while current
   state-of-the-art approaches fail, and we further
demonstrate superior performance and preference for our
melodies compared to existing melody generation methods.
Additionally, we developed and deployed a public website for
SchenkComposer, and conducted preliminary user surveys.
Through analysis, we show the strong viability and
enjoyability of SchenkComposer.},
Doi = {10.1145/3580305.3599772},
Key = {fds372809}
}
@article{fds371244,
Author = {Parikh, H and Hoffman, K and Sun, H and Zafar, SF and Ge, W and Jing, J and Liu, L and Sun, J and Struck, A and Volfovsky, A and Rudin, C and Westover,
MB},
Title = {Effects of epileptiform activity on discharge outcome in
critically ill patients in the USA: a retrospective
cross-sectional study.},
Journal = {The Lancet. Digital health},
Volume = {5},
Number = {8},
Pages = {e495-e502},
Year = {2023},
Month = {August},
url = {http://dx.doi.org/10.1016/s2589-7500(23)00088-2},
Abstract = {BACKGROUND: Epileptiform activity is associated with worse
   patient outcomes, including increased risk of disability
   and death. However, the effect of epileptiform activity on
   neurological outcome is confounded by the feedback between
   treatment with antiseizure medications and epileptiform
   activity burden. We aimed to quantify the heterogeneous
   effects of epileptiform activity with an
   interpretability-centred approach. METHODS: We did a
   retrospective, cross-sectional study of patients in the
   intensive care unit who were admitted to Massachusetts
   General Hospital (Boston, MA, USA). Participants were aged
   18 years or older and had electrographic epileptiform
   activity identified by a clinical neurophysiologist or
   epileptologist. The outcome was the dichotomised modified
   Rankin Scale (mRS) at discharge and the exposure was
   epileptiform activity burden defined as mean or maximum
   proportion of time spent with epileptiform activity in 6 h
   windows in the first 24 h of electroencephalography. We
   estimated the change in discharge mRS if everyone in the
   dataset had experienced a specific epileptiform activity
   burden and were untreated. We combined pharmacological
   modelling with an interpretable matching method to account
   for confounding and epileptiform activity-antiseizure
   medication feedback. The quality of the matched groups was
   validated by the neurologists. FINDINGS: Between Dec 1,
   2011, and Oct 14, 2017, 1514 patients were admitted to the
   Massachusetts General Hospital intensive care unit, 995
   (66%) of whom were included in the analysis. Compared with
   patients with a maximum epileptiform activity of 0 to less
   than 25%, patients with a maximum epileptiform activity
   burden of 75% or more when untreated had a mean 22·27% (SD
   0·92) increased chance of a poor outcome (severe disability
   or death). Moderate but long-lasting epileptiform activity
   (mean epileptiform activity burden 2% to <10%) increased
   the risk of a poor outcome by a mean of 13·52% (SD 1·93).
   The effect sizes were heterogeneous depending on
   preadmission profile; eg, patients with hypoxic-ischaemic
   encephalopathy or acquired brain injury were more adversely
   affected compared with patients without these conditions.
   INTERPRETATION: Our results suggest that interventions
   should put a higher priority on patients with an average
   epileptiform activity burden of 10% or greater, and
   treatment should be more conservative when maximum
   epileptiform activity burden is low. Treatment should also
   be tailored to individual preadmission profiles because the
   potential for epileptiform activity to cause harm depends
   on age, medical history, and reason for
   admission. FUNDING: National Institutes of Health and
   National Science Foundation.},
Doi = {10.1016/s2589-7500(23)00088-2},
Key = {fds371244}
}
@article{fds371608,
Author = {Peloquin, J and Kirillova, A and Rudin, C and Brinson, LC and Gall,
K},
Title = {Prediction of tensile performance for 3D printed
photopolymer gyroid lattices using structural porosity, base
material properties, and machine learning},
Journal = {Materials and Design},
Volume = {232},
Year = {2023},
Month = {August},
url = {http://dx.doi.org/10.1016/j.matdes.2023.112126},
Abstract = {Advancements in additive manufacturing (AM) technology and
three-dimensional (3D) modeling software have enabled the
fabrication of parts with combinations of properties that
were impossible to achieve with traditional manufacturing
techniques. Porous designs such as truss-based and
sheet-based lattices have gained much attention in recent
years due to their versatility. The multitude of lattice
design possibilities, coupled with a growing list of
available 3D printing materials, has provided a vast range
of 3D printable structures that can be used to achieve
desired performance. However, the process of computationally
or experimentally evaluating many combinations of base
material and lattice design for a given application is
impractical. This research proposes a framework for quickly
predicting key mechanical properties of 3D printed gyroid
lattices using information about the base material and
porosity of the structure. Experimental data was gathered to
train a simple, interpretable, and accurate kernel ridge
regression machine learning model. The performance of the
model was then compared to numerical simulation data and
demonstrated similar accuracy at a fraction of the
computation time. Ultimately, this model development serves
   as an advance in ML-driven mechanical property prediction
   that can guide the extension of current and future
   models.},
Doi = {10.1016/j.matdes.2023.112126},
Key = {fds371608}
}
@article{fds371888,
Author = {Peloquin, J and Kirillova, A and Mathey, E and Rudin, C and Brinson, LC and Gall, K},
Title = {Tensile performance data of 3D printed photopolymer gyroid
lattices.},
Journal = {Data in brief},
Volume = {49},
Pages = {109396},
Year = {2023},
Month = {August},
url = {http://dx.doi.org/10.1016/j.dib.2023.109396},
Abstract = {Additive manufacturing has provided the ability to
manufacture complex structures using a wide variety of
materials and geometries. Structures such as triply periodic
minimal surface (TPMS) lattices have been incorporated into
products across many fields due to their unique combinations
of mechanical, geometric, and physical properties. Yet, the
near limitless possibility of combining geometry and
material into these lattices leaves much to be discovered.
This article provides a dataset of experimentally gathered
tensile stress-strain curves and measured porosity values
for 389 unique gyroid lattice structures manufactured using
vat photopolymerization 3D printing. The lattice samples
were printed from one of twenty different photopolymer
materials available from either Formlabs, LOCTITE AM, or
ETEC that range from strong and brittle to elastic and
ductile and were printed on commercially available 3D
printers, specifically the Formlabs Form2, Prusa SL1, and
ETEC Envision One cDLM Mechanical. The stress-strain curves
were recorded with an MTS Criterion C43.504 mechanical
testing apparatus and following ASTM standards, and the void
fraction or "porosity" of each lattice was measured using a
calibrated scale. This data serves as a valuable resource
for use in the development of novel printing materials and
lattice geometries and provides insight into the influence
of photopolymer material properties on the printability,
geometric accuracy, and mechanical performance of 3D printed
lattice structures. The data described in this article was
used to train a machine learning model capable of predicting
mechanical properties of 3D printed gyroid lattices based on
the base mechanical properties of the printing material and
porosity of the lattice in the research article
[1].},
Doi = {10.1016/j.dib.2023.109396},
Key = {fds371888}
}
@article{fds373056,
Author = {McDonald, SM and Augustine, EK and Lanners, Q and Rudin, C and Brinson,
   LC and Becker, ML},
Title = {Applied machine learning as a driver for polymeric
biomaterials design.},
Journal = {Nature communications},
Volume = {14},
Number = {1},
Pages = {4838},
Year = {2023},
Month = {August},
url = {http://dx.doi.org/10.1038/s41467-023-40459-8},
Abstract = {Polymers are ubiquitous to almost every aspect of modern
society and their use in medical products is similarly
pervasive. Despite this, the diversity in commercial
polymers used in medicine is stunningly low. Considerable
time and resources have been extended over the years towards
the development of new polymeric biomaterials which address
unmet needs left by the current generation of medical-grade
polymers. Machine learning (ML) presents an unprecedented
opportunity in this field to bypass the need for
trial-and-error synthesis, thus reducing the time and
resources invested into new discoveries critical for
advancing medical treatments. Current efforts pioneering
applied ML in polymer design have employed combinatorial and
high throughput experimental design to address data
availability concerns. However, the lack of available and
standardized characterization of parameters relevant to
medicine, including degradation time and biocompatibility,
represents a nearly insurmountable obstacle to ML-aided
design of biomaterials. Herein, we identify a gap at the
intersection of applied ML and biomedical polymer design,
highlight current works at this junction more broadly and
provide an outlook on challenges and future
directions.},
Doi = {10.1038/s41467-023-40459-8},
Key = {fds373056}
}
@article{fds372436,
Author = {Zhang, R and Xin, R and Seltzer, M and Rudin, C},
Title = {Optimal Sparse Regression Trees},
Journal = {Proceedings of the 37th AAAI Conference on Artificial
Intelligence, AAAI 2023},
Volume = {37},
Pages = {11270-11279},
Year = {2023},
Month = {June},
ISBN = {9781577358800},
Abstract = {Regression trees are one of the oldest forms of AI models,
and their predictions can be made without a calculator,
which makes them broadly useful, particularly for
high-stakes applications. Within the large literature on
regression trees, there has been little effort towards full
provable optimization, mainly due to the computational
hardness of the problem. This work proposes a
dynamic-programming-with-bounds approach to the construction
of provably-optimal sparse regression trees. We leverage a
novel lower bound based on an optimal solution to the
k-Means clustering problem on one-dimensional data. We are
often able to find optimal sparse trees in seconds, even for
challenging datasets that involve large numbers of samples
and highly-correlated features.},
Key = {fds372436}
}
@article{fds362970,
Author = {Wang, C and Han, B and Patel, B and Rudin, C},
Title = {In Pursuit of Interpretable, Fair and Accurate Machine
Learning for Criminal Recidivism Prediction},
Journal = {Journal of Quantitative Criminology},
Volume = {39},
Number = {2},
Pages = {519-581},
Year = {2023},
Month = {June},
url = {http://dx.doi.org/10.1007/s10940-022-09545-w},
Abstract = {Objectives: We study interpretable recidivism prediction
using machine learning (ML) models and analyze performance
in terms of prediction ability, sparsity, and fairness.
Unlike previous works, this study trains interpretable
models that output probabilities rather than binary
predictions, and uses quantitative fairness definitions to
assess the models. This study also examines whether models
can generalize across geographic locations. Methods: We
generated black-box and interpretable ML models on two
different criminal recidivism datasets from Florida and
Kentucky. We compared predictive performance and fairness of
these models against two methods that are currently used in
the justice system to predict pretrial recidivism: the
Arnold PSA and COMPAS. We evaluated predictive performance
of all models on predicting six different types of crime
over two time spans. Results: Several interpretable ML
models can predict recidivism as well as black-box ML models
and are more accurate than COMPAS or the Arnold PSA. These
models are potentially useful in practice. Similar to the
Arnold PSA, some of these interpretable models can be
written down as a simple table. Others can be displayed
using a set of visualizations. Our geographic analysis
indicates that ML models should be trained separately for
separate locations and updated over time. We also present a
fairness analysis for the interpretable models. Conclusions:
Interpretable ML models can perform just as well as
non-interpretable methods and currently-used risk assessment
scales, in terms of both prediction accuracy and fairness.
ML models might be more accurate when trained separately for
distinct locations and kept up-to-date.},
Doi = {10.1007/s10940-022-09545-w},
Key = {fds362970}
}
@article{fds371245,
Author = {Ou, YJ and Barnett, AJ and Mitra, A and Schwartz, FR and Chen, C and Grimm,
L and Lo, JY and Rudin, C},
Title = {A user interface to communicate interpretable AI decisions
to radiologists},
Journal = {Progress in Biomedical Optics and Imaging - Proceedings of
SPIE},
Volume = {12467},
Year = {2023},
Month = {January},
ISBN = {9781510660397},
url = {http://dx.doi.org/10.1117/12.2654068},
Abstract = {Tools for computer-aided diagnosis based on deep learning
have become increasingly important in the medical field.
Such tools can be useful, but require effective
communication of their decision-making process in order to
safely and meaningfully guide clinical decisions. Inherently
interpretable models provide an explanation for each
decision that matches their internal decision-making
process. We present a user interface that incorporates the
Interpretable AI Algorithm for Breast Lesions (IAIA-BL)
model, which interpretably predicts both mass margin and
malignancy for breast lesions. The user interface displays
the most relevant aspects of the model's explanation
including the predicted margin value, the AI confidence in
the prediction, and the two most highly activated prototypes
for each case. In addition, this user interface includes
full-field and cropped images of the region of interest, as
well as a questionnaire suitable for a reader study. Our
preliminary results indicate that the model increases the
readers' confidence and accuracy in their decisions on
margin and malignancy.},
Doi = {10.1117/12.2654068},
Key = {fds371245}
}
@article{fds372810,
Author = {Lanners, Q and Parikh, H and Volfovsky, A and Rudin, C and Page,
D},
Title = {Variable Importance Matching for Causal Inference},
Journal = {Proceedings of Machine Learning Research},
Volume = {216},
Pages = {1174-1184},
Year = {2023},
Month = {January},
Abstract = {Our goal is to produce methods for observational causal
inference that are auditable, easy to troubleshoot, accurate
for treatment effect estimation, and scalable to
high-dimensional data. We describe a general framework
called Model-to-Match that achieves these goals by (i)
learning a distance metric via outcome modeling, (ii)
creating matched groups using the distance metric, and (iii)
using the matched groups to estimate treatment effects.
Model-to-Match uses variable importance measurements to
construct a distance metric, making it a flexible framework
that can be adapted to various applications. Concentrating
on the scalability of the problem in the number of potential
confounders, we operationalize the Model-to-Match framework
with LASSO. We derive performance guarantees for settings
where LASSO outcome modeling consistently identifies all
confounders (importantly without requiring the linear model
to be correctly specified). We also provide experimental
results demonstrating the method's auditability, accuracy,
and scalability as well as extensions to more general
nonparametric outcome modeling.},
Key = {fds372810}
}
@article{fds373003,
Author = {Agnew, E and Qiu, M and Zhu, L and Wiseman, S and Rudin,
C},
Title = {The Mechanical Bard: An Interpretable Machine Learning
Approach to Shakespearean Sonnet Generation},
Journal = {Proceedings of the Annual Meeting of the Association for
Computational Linguistics},
Volume = {2},
Pages = {1627-1638},
Year = {2023},
Month = {January},
ISBN = {9781959429715},
Abstract = {We consider the automated generation of sonnets, a poetic
form constrained according to meter, rhyme scheme, and
length. Sonnets generally also use rhetorical figures,
expressive language, and a consistent theme or narrative.
Our constrained decoding approach allows for the generation
of sonnets within preset poetic constraints, while using a
relatively modest neural backbone. Human evaluation confirms
that our approach produces Shakespearean sonnets that
resemble human-authored sonnets, and which adhere to the
genre’s defined constraints and contain lyrical language
and literary devices.},
Key = {fds373003}
}
@article{fds373531,
Author = {Chen, Z and Tan, S and Chajewska, U and Rudin, C and Caruana,
R},
Title = {Missing Values and Imputation in Healthcare Data: Can
Interpretable Machine Learning Help?},
Journal = {Proceedings of Machine Learning Research},
Volume = {209},
Pages = {86-99},
Year = {2023},
Month = {January},
Abstract = {Missing values are a fundamental problem in data science.
Many datasets have missing values that must be properly
handled because the way missing values are treated can have
a large impact on the resulting machine learning model. In
medical applications, the consequences may affect healthcare
decisions. There are many methods in the literature for
dealing with missing values, including state-of-the-art
methods which often depend on black-box models for
imputation. In this work, we show how recent advances in
interpretable machine learning provide a new perspective for
understanding and tackling the missing value problem. We
propose methods based on high-accuracy glass-box Explainable
Boosting Machines (EBMs) that can help users (1) gain new
insights on missingness mechanisms and better understand the
causes of missingness, and (2) detect – or even alleviate
– potential risks introduced by imputation algorithms.
Experiments on real-world medical datasets illustrate the
effectiveness of the proposed methods.},
Key = {fds373531}
}
@article{fds368049,
Author = {Garrett, BL and Rudin, C},
Title = {The Right to a Glass Box: Rethinking the Use of Artificial
Intelligence in Criminal Justice},
Journal = {Cornell Law Review},
Year = {2023},
Key = {fds368049}
}
@article{fds367642,
Author = {Rudin, C},
Title = {Why black box machine learning should be avoided for
high-stakes decisions, in brief},
Journal = {Nature Reviews Methods Primers},
Volume = {2},
Number = {1},
Year = {2022},
Month = {December},
url = {http://dx.doi.org/10.1038/s43586-022-00172-0},
Doi = {10.1038/s43586-022-00172-0},
Key = {fds367642}
}
@article{fds368343,
Author = {Chen, Z and Ogren, A and Daraio, C and Brinson, LC and Rudin,
C},
Title = {How to see hidden patterns in metamaterials with
interpretable machine learning},
Journal = {Extreme Mechanics Letters},
Volume = {57},
Year = {2022},
Month = {November},
url = {http://dx.doi.org/10.1016/j.eml.2022.101895},
Abstract = {Machine learning models can assist with metamaterials design
by approximating computationally expensive simulators or
solving inverse design problems. However, past work has
usually relied on black box deep neural networks, whose
reasoning processes are opaque and require enormous datasets
that are expensive to obtain. In this work, we develop two
novel machine learning approaches to metamaterials discovery
that have neither of these disadvantages. These approaches,
called shape-frequency features and unit-cell templates, can
discover 2D metamaterials with user-specified frequency band
gaps. Our approaches provide logical rule-based conditions
on metamaterial unit-cells that allow for interpretable
reasoning processes, and generalize well across design
spaces of different resolutions. The templates also provide
design flexibility where users can almost freely design the
   fine-resolution features of a unit-cell without affecting
   their desired band gap.},
Doi = {10.1016/j.eml.2022.101895},
Key = {fds368343}
}
@article{fds369058,
Author = {Behrouz, A and Lécuyer, M and Rudin, C and Seltzer,
M},
Title = {Fast Optimization of Weighted Sparse Decision Trees for use
in Optimal Treatment Regimes and Optimal Policy
Design.},
Journal = {CEUR workshop proceedings},
Volume = {3318},
Pages = {26},
Year = {2022},
Month = {October},
Abstract = {Sparse decision trees are one of the most common forms of
interpretable models. While recent advances have produced
algorithms that fully optimize sparse decision trees for
\emph{prediction}, that work does not address \emph{policy
   design}, because the algorithms cannot handle weighted
data samples. Specifically, they rely on the discreteness of
the loss function, which means that real-valued weights
cannot be directly used. For example, none of the existing
techniques produce policies that incorporate inverse
propensity weighting on individual data points. We present
three algorithms for efficient sparse weighted decision tree
optimization. The first approach directly optimizes the
weighted loss function; however, it tends to be
computationally inefficient for large datasets. Our second
approach, which scales more efficiently, transforms weights
to integer values and uses data duplication to transform the
weighted decision tree optimization problem into an
unweighted (but larger) counterpart. Our third algorithm,
which scales to much larger datasets, uses a randomized
procedure that samples each data point with a probability
proportional to its weight. We present theoretical bounds on
the error of the two fast methods and show experimentally
that these methods can be two orders of magnitude faster
than the direct optimization of the weighted loss, without
losing significant accuracy.},
Key = {fds369058}
}
@article{fds369818,
Author = {Parikh, H and Rudin, C and Volfovsky, A},
Title = {MALTS: Matching After Learning to Stretch},
Journal = {Journal of Machine Learning Research},
Volume = {23},
Year = {2022},
Month = {August},
Abstract = {We introduce a flexible framework that produces high-quality
almost-exact matches for causal inference. Most prior work
in matching uses ad-hoc distance metrics, often leading to
poor quality matches, particularly when there are irrelevant
covariates. In this work, we learn an interpretable distance
metric for matching, which leads to substantially higher
quality matches. The learned distance metric stretches the
covariate space according to each covariate’s contribution
to outcome prediction: this stretching means that mismatches
on important covariates carry a larger penalty than
mismatches on irrelevant covariates. Our ability to learn
flexible distance metrics leads to matches that are
interpretable and useful for the estimation of conditional
average treatment effects.},
Key = {fds369818}
}
@article{fds363364,
Author = {Afnan, M and Afnan, MAM and Liu, Y and Savulescu, J and Mishra, A and Conitzer, V and Rudin, C},
Title = {Data solidarity for machine learning for embryo selection: a
call for the creation of an open access repository of embryo
data.},
Journal = {Reproductive biomedicine online},
Volume = {45},
Number = {1},
Pages = {10-13},
Year = {2022},
Month = {July},
url = {http://dx.doi.org/10.1016/j.rbmo.2022.03.015},
Abstract = {The last decade has seen an explosion of machine learning
applications in healthcare, with mixed and sometimes harmful
results despite much promise and associated hype. A
significant reason for the reversal in the reported benefit
of these applications is the premature implementation of
machine learning algorithms in clinical practice. This paper
argues the critical need for 'data solidarity' for machine
learning for embryo selection. A recent Lancet and Financial
Times commission defined data solidarity as 'an approach to
the collection, use, and sharing of health data and data for
health that safeguards individual human rights while
building a culture of data justice and equity, and ensuring
that the value of data is harnessed for public good'
(Kickbusch et al., 2021).},
Doi = {10.1016/j.rbmo.2022.03.015},
Key = {fds363364}
}
@article{fds364197,
Author = {Huang, H and Wang, Y and Rudin, C and Browne, EP},
Title = {Towards a comprehensive evaluation of dimension reduction
methods for transcriptomic data visualization.},
Journal = {Communications biology},
Volume = {5},
Number = {1},
Pages = {719},
Year = {2022},
Month = {July},
url = {http://dx.doi.org/10.1038/s42003-022-03628-x},
Abstract = {Dimension reduction (DR) algorithms project data from high
dimensions to lower dimensions to enable visualization of
interesting high-dimensional structure. DR algorithms are
widely used for analysis of single-cell transcriptomic data.
Despite widespread use of DR algorithms such as t-SNE and
UMAP, these algorithms have characteristics that lead to
lack of trust: they do not preserve important aspects of
high-dimensional structure and are sensitive to arbitrary
user choices. Given the importance of gaining insights from
DR, DR methods should be evaluated carefully before trusting
their results. In this paper, we introduce and perform a
systematic evaluation of popular DR methods, including
t-SNE, art-SNE, UMAP, PaCMAP, TriMap and ForceAtlas2. Our
evaluation considers five components: preservation of local
structure, preservation of global structure, sensitivity to
parameter choices, sensitivity to preprocessing choices, and
computational efficiency. This evaluation can help us to
choose DR tools that align with the scientific goals of the
user.},
Doi = {10.1038/s42003-022-03628-x},
Key = {fds364197}
}
@article{fds364039,
Author = {Semenova, L and Rudin, C and Parr, R},
Title = {On the Existence of Simpler Machine Learning
Models},
Journal = {ACM International Conference Proceeding Series},
Pages = {1827-1858},
Year = {2022},
Month = {June},
ISBN = {9781450393522},
url = {http://dx.doi.org/10.1145/3531146.3533232},
Abstract = {It is almost always easier to find an accurate-but-complex
model than an accurate-yet-simple model. Finding optimal,
sparse, accurate models of various forms (linear models with
integer coefficients, decision sets, rule lists, decision
trees) is generally NP-hard. We often do not know whether
the search for a simpler model will be worthwhile, and thus
we do not go to the trouble of searching for one. In this
work, we ask an important practical question: can
accurate-yet-simple models be proven to exist, or shown
likely to exist, before explicitly searching for them? We
hypothesize that there is an important reason that
simple-yet-accurate models often do exist. This hypothesis
is that the size of the Rashomon set is often large, where
the Rashomon set is the set of almost-equally-accurate
models from a function class. If the Rashomon set is large,
it contains numerous accurate models, and perhaps at least
one of them is the simple model we desire. In this work, we
formally present the Rashomon ratio as a new gauge of
simplicity for a learning problem, depending on a function
class and a data set. The Rashomon ratio is the ratio of the
volume of the set of accurate models to the volume of the
hypothesis space, and it is different from standard
complexity measures from statistical learning theory.
Insight from studying the Rashomon ratio provides an easy
way to check whether a simpler model might exist for a
problem before finding it, namely whether several different
machine learning methods achieve similar performance on the
data. In that sense, the Rashomon ratio is a powerful tool
for understanding why and when an accurate-yet-simple model
might exist. If, as we hypothesize in this work, many
real-world data sets admit large Rashomon sets, the
implications are vast: it means that simple or interpretable
models may often be used for high-stakes decisions without
losing accuracy.},
Doi = {10.1145/3531146.3533232},
Key = {fds364039}
}
@article{fds364272,
Author = {Wang, T and Rudin, C},
Title = {Causal Rule Sets for Identifying Subgroups with Enhanced
Treatment Effects},
Journal = {INFORMS Journal on Computing},
Volume = {34},
Number = {3},
Pages = {1626-1643},
Year = {2022},
Month = {May},
url = {http://dx.doi.org/10.1287/ijoc.2021.1143},
Abstract = {A key question in causal inference analyses is how to find
subgroups with elevated treatment effects. This paper takes
a machine learning approach and introduces a generative
model, causal rule sets (CRS), for interpretable subgroup
discovery. A CRS model uses a small set of short decision
rules to capture a subgroup in which the average treatment
effect is elevated. We present a Bayesian framework for
learning a causal rule set. The Bayesian model consists of a
prior that favors simple models for better interpretability
as well as avoiding overfitting and a Bayesian logistic
regression that captures the likelihood of data,
characterizing the relation between outcomes, attributes,
and subgroup membership. The Bayesian model has tunable
parameters that can characterize subgroups with various
sizes, providing users with more flexible choices of models
from the treatment-efficient frontier. We find maximum a
   posteriori models using iterative discrete Monte Carlo
   steps in the joint solution space of rule sets and
   parameters. To improve search efficiency, we provide
   theoretically grounded heuristics and bounding strategies
   to prune and confine the search space. Experiments show
   that the search algorithm can efficiently recover true
   underlying subgroups. We apply CRS on public and real-world
   data sets from domains in which interpretability is
   indispensable. We compare CRS with state-of-the-art
   rule-based subgroup discovery models. Results show that CRS
   achieves consistently competitive performance on data sets
   from various domains, represented by high
   treatment-efficient frontiers. Summary of Contribution:
   This paper is motivated by the large heterogeneity of
   treatment effect in many applications and the need to
   accurately locate subgroups for enhanced treatment effect.
   Existing methods either rely on prior hypotheses to
   discover subgroups or on greedy methods, such as tree-based
   recursive partitioning. Our method adopts a machine
   learning approach to find an optimal subgroup learned with
   a carefully designed global objective. Our model is more
   flexible in capturing subgroups by using a set of short
   decision rules compared with tree-based baselines. We
   evaluate our model using a novel metric, the
   treatment-efficient frontier, which characterizes the
   trade-off between the subgroup size and achievable
   treatment effect, and our model demonstrates better
   performance than baseline models.},
Doi = {10.1287/ijoc.2021.1143},
Key = {fds364272}
}
@article{fds371609,
Author = {Liu, J and Zhong, C and Seltzer, M and Rudin, C},
Title = {Fast Sparse Classification for Generalized Linear and
Additive Models.},
Journal = {Proceedings of machine learning research},
Volume = {151},
Pages = {9304-9333},
Year = {2022},
Month = {March},
Abstract = {We present fast classification techniques for sparse
generalized linear and additive models. These techniques can
handle thousands of features and thousands of observations
in minutes, even in the presence of many highly correlated
features. For fast sparse logistic regression, our
computational speed-up over other best-subset search
techniques owes to linear and quadratic surrogate cuts for
the logistic loss that allow us to efficiently screen
features for elimination, as well as use of a priority queue
that favors a more uniform exploration of features. As an
alternative to the logistic loss, we propose the exponential
loss, which permits an analytical solution to the line
search at each iteration. Our algorithms are generally 2 to
5 times faster than previous approaches. They produce
interpretable models that have accuracy comparable to black
box models on challenging datasets.},
Key = {fds371609}
}
@article{fds358741,
Author = {Chen, C and Lin, K and Rudin, C and Shaposhnik, Y and Wang, S and Wang,
T},
Title = {A holistic approach to interpretability in financial
lending: Models, visualizations, and summary-explanations},
Journal = {Decision Support Systems},
Volume = {152},
Year = {2022},
Month = {January},
url = {http://dx.doi.org/10.1016/j.dss.2021.113647},
Abstract = {Lending decisions are usually made with proprietary models
that provide minimally acceptable explanations to users. In
a future world without such secrecy, what decision support
tools would one want to use for justified lending decisions?
This question is timely, since the economy has dramatically
shifted due to a pandemic, and a massive number of new loans
will be necessary in the short term. We propose a framework
for such decisions, including a globally interpretable
machine learning model, an interactive visualization of it,
and several types of summaries and explanations for any
given decision. The machine learning model is a two-layer
additive risk model, which resembles a two-layer neural
network, but is decomposable into subscales. In this model,
each node in the first (hidden) layer represents a
meaningful subscale model, and all of the nonlinearities are
transparent. Our online visualization tool allows
exploration of this model, showing precisely how it came to
its conclusion. We provide three types of explanations that
are simpler than, but consistent with, the global model:
case-based reasoning explanations that use neighboring past
cases, a set of features that were the most important for
the model's prediction, and summary-explanations that
provide a customized sparse explanation for any particular
lending decision made by the model. Our framework earned the
FICO recognition award for the Explainable Machine Learning
Challenge, which was the first public challenge in the
domain of explainable machine learning.},
Doi = {10.1016/j.dss.2021.113647},
Key = {fds358741}
}
@article{fds362508,
Author = {Rudin, C and Chen, C and Chen, Z and Huang, H and Semenova, L and Zhong,
C},
Title = {Interpretable machine learning: Fundamental principles and
10 grand challenges},
Journal = {Statistics Surveys},
Volume = {16},
Pages = {1-85},
Year = {2022},
Month = {January},
url = {http://dx.doi.org/10.1214/21-SS133},
Abstract = {Interpretability in machine learning (ML) is crucial for
high stakes decisions and troubleshooting. In this work, we
provide fundamental principles for interpretable ML, and
dispel common misunderstandings that dilute the importance
of this crucial topic. We also identify 10 technical
challenge areas in interpretable machine learning and
provide history and background on each problem. Some of
these problems are classically important, and some are
recent problems that have arisen in the last few years.
These problems are: (1) Optimizing sparse logical models
such as decision trees; (2) Optimization of scoring systems;
(3) Placing constraints into generalized additive models to
encourage sparsity and better interpretability; (4) Modern
case-based reasoning, including neural networks and matching
for causal inference; (5) Complete supervised
disentanglement of neural networks; (6) Complete or even
partial unsupervised disentanglement of neural networks; (7)
Dimensionality reduction for data visualization; (8) Machine
learning models that can incorporate physics and other
generative or causal constraints; (9) Characterization of
the “Rashomon set” of good models; and (10)
Interpretable reinforcement learning. This survey is
suitable as a starting point for statisticians and computer
scientists interested in working in interpretable machine
learning.},
Doi = {10.1214/21-SS133},
Key = {fds362508}
}
@article{fds363411,
Author = {Li, C and Rudin, C and McCormick, TH},
Title = {Rethinking Nonlinear Instrumental Variable Models through
Prediction Validity},
Journal = {Journal of Machine Learning Research},
Volume = {23},
Year = {2022},
Month = {January},
Abstract = {Instrumental variables (IV) are widely used in the social
and health sciences in situations where a researcher would
like to measure a causal effect but cannot perform an
experiment. For valid causal inference in an IV model, there
must be external (exogenous) variation that (i) has a
sufficiently large impact on the variable of interest (called
   the relevance assumption) and where (ii) the only pathway
through which the external variation impacts the outcome is
via the variable of interest (called the exclusion
restriction). For statistical inference, researchers must
also make assumptions about the functional form of the
relationship between the three variables. Current practice
assumes (i) and (ii) are met, then postulates a functional
form with limited input from the data. In this paper, we
describe a framework that leverages machine learning to
validate these typically unchecked but consequential
assumptions in the IV framework, providing the researcher
empirical evidence about the quality of the instrument given
the data at hand. Central to the proposed approach is the
idea of prediction validity. Prediction validity checks that
   error terms, which should be independent of the instrument,
   cannot be modeled with machine learning any better than a
   model that is identically zero. We use
prediction validity to develop both one-stage and two-stage
approaches for IV, and demonstrate their performance on an
example relevant to climate change policy.},
Key = {fds363411}
}
@article{fds369682,
Author = {McTavish, H and Zhong, C and Achermann, R and Karimalis, I and Chen, J and Rudin, C and Seltzer, M},
Title = {Fast Sparse Decision Tree Optimization via Reference
Ensembles.},
Journal = {Proceedings of the ... AAAI Conference on Artificial
Intelligence. AAAI Conference on Artificial
Intelligence},
Volume = {36},
Number = {9},
Pages = {9604-9613},
Year = {2022},
Month = {January},
url = {http://dx.doi.org/10.1609/aaai.v36i9.21194},
Abstract = {Sparse decision tree optimization has been one of the most
fundamental problems in AI since its inception and is a
challenge at the core of interpretable machine learning.
Sparse decision tree optimization is computationally hard,
and despite steady effort since the 1960s, breakthroughs
have been made on the problem only within the past few
years, primarily on the problem of finding optimal sparse
decision trees. However, current state-of-the-art algorithms
often require impractical amounts of computation time and
memory to find optimal or near-optimal trees for some
real-world datasets, particularly those having several
continuous-valued features. Given that the search spaces of
these decision tree optimization problems are massive, can
we practically hope to find a sparse decision tree that
competes in accuracy with a black box machine learning
model? We address this problem via smart guessing strategies
that can be applied to any optimal branch-and-bound-based
decision tree algorithm. The guesses come from knowledge
gleaned from black box models. We show that by using these
guesses, we can reduce the run time by multiple orders of
magnitude while providing bounds on how far the resulting
trees can deviate from the black box's accuracy and
expressive power. Our approach enables guesses about how to
bin continuous features, the size of the tree, and lower
bounds on the error for the optimal decision tree. Our
experiments show that in many cases we can rapidly construct
sparse decision trees that match the accuracy of black box
models. To summarize: \emph{when you are having trouble
   optimizing, just guess}.},
Doi = {10.1609/aaai.v36i9.21194},
Key = {fds369682}
}
@article{fds363833,
Author = {Barnett, AJ and Sharma, V and Gajjar, N and Fang, J and Schwartz, FR and Chen, C and Lo, JY and Rudin, C},
Title = {Interpretable Deep Learning Models for Better Clinician-AI
Communication in Clinical Mammography},
Journal = {Progress in Biomedical Optics and Imaging - Proceedings of
SPIE},
Volume = {12035},
Year = {2022},
Month = {January},
ISBN = {9781510649453},
url = {http://dx.doi.org/10.1117/12.2612372},
Abstract = {There is increasing interest in using deep learning and
computer vision to help guide clinical decisions, such as
whether to order a biopsy based on a mammogram. Existing
networks are typically black box, unable to explain how they
make their predictions. We present an interpretable
deep-learning network which explains its predictions in
terms of BI-RADS features mass shape and mass margin. Our
model predicts mass margin and mass shape, then uses the
logits from those interpretable models to predict
malignancy, also using an interpretable model. The
interpretable mass margin model explains its predictions
using a prototypical parts model. The interpretable mass
shape model predicts segmentations, fits an ellipse, then
determines shape based on the goodness of fit and
eccentricity of the fitted ellipse. While including mass
shape logits in the malignancy prediction model did not
improve performance, we present this technique as part of a
   framework for better clinician-AI communication.},
Doi = {10.1117/12.2612372},
Key = {fds363833}
}
@article{fds368882,
Author = {Wang, ZJ and Zhong, C and Xin, R and Takagi, T and Chen, Z and Chau, DH and Rudin, C and Seltzer, M},
Title = {TimberTrek: Exploring and Curating Sparse Decision Trees
with Interactive Visualization},
Journal = {Proceedings - 2022 IEEE Visualization Conference - Short
Papers, VIS 2022},
Pages = {60-64},
Year = {2022},
Month = {January},
ISBN = {9781665488129},
url = {http://dx.doi.org/10.1109/VIS54862.2022.00021},
Abstract = {Given thousands of equally accurate machine learning (ML)
models, how can users choose among them? A recent ML
technique enables domain experts and data scientists to
generate a complete Rashomon set for sparse decision trees:
   a huge set of almost-optimal interpretable ML models. To
   help ML practitioners identify models with desirable
   properties from this Rashomon set, we develop TimberTrek,
   the first interactive visualization system that summarizes
   thousands of sparse decision trees at scale. Two usage
   scenarios highlight how TimberTrek can empower users to
   easily explore, compare, and curate models that align with
   their domain knowledge and values. Our open-source tool
   runs directly in users' computational notebooks and web
   browsers, lowering the barrier to creating more responsible
   ML models. TimberTrek is available at the following public
   demo link: https://poloclub.github.io/timbertrek.},
Doi = {10.1109/VIS54862.2022.00021},
Key = {fds368882}
}
@article{fds371610,
Author = {Liu, J and Zhong, C and Li, B and Seltzer, M and Rudin,
C},
Title = {FasterRisk: Fast and Accurate Interpretable Risk
Scores},
Journal = {Advances in Neural Information Processing
Systems},
Volume = {35},
Year = {2022},
Month = {January},
ISBN = {9781713871088},
Abstract = {Over the last century, risk scores have been the most
popular form of predictive model used in healthcare and
criminal justice. Risk scores are sparse linear models with
integer coefficients; often these models can be memorized or
placed on an index card. Typically, risk scores have been
created either without data or by rounding logistic
regression coefficients, but these methods do not reliably
produce high-quality risk scores. Recent work used
mathematical programming, which is computationally slow. We
introduce an approach for efficiently producing a collection
of high-quality risk scores learned from data. Specifically,
our approach produces a pool of almost-optimal sparse
continuous solutions, each with a different support set,
using a beam-search algorithm. Each of these continuous
solutions is transformed into a separate risk score through
a “star ray” search, where a range of multipliers are
considered before rounding the coefficients sequentially to
maintain low logistic loss. Our algorithm returns all of
these high-quality risk scores for the user to consider.
This method completes within minutes and can be valuable in
a broad variety of applications.},
Key = {fds371610}
}
@article{fds371611,
Author = {Lobo, E and Singh, H and Petrik, M and Rudin, C and Lakkaraju,
H},
Title = {Data Poisoning Attacks on Off-Policy Policy Evaluation
Methods},
Journal = {Proceedings of the 38th Conference on Uncertainty in
Artificial Intelligence, UAI 2022},
Pages = {1264-1274},
Year = {2022},
Month = {January},
ISBN = {9781713863298},
Abstract = {Off-policy Evaluation (OPE) methods are a crucial tool for
evaluating policies in high-stakes domains such as
healthcare, where exploration is often infeasible,
unethical, or expensive. However, the extent to which such
methods can be trusted under adversarial threats to data
quality is largely unexplored. In this work, we make the
first attempt at investigating the sensitivity of OPE
methods to marginal adversarial perturbations to the data.
We design a generic data poisoning attack framework
leveraging influence functions from robust statistics to
carefully construct perturbations that maximize error in the
policy value estimates. We carry out extensive
experimentation with multiple healthcare and control
datasets. Our results demonstrate that many existing OPE
methods are highly prone to generating value estimates with
large errors when subject to data poisoning attacks, even
for small adversarial perturbations. These findings question
the reliability of policy values derived using OPE methods
and motivate the need for developing OPE methods that are
statistically robust to train-time data poisoning
attacks.},
Key = {fds371611}
}
@article{fds371612,
Author = {Xin, R and Zhong, C and Chen, Z and Takagi, T and Seltzer, M and Rudin,
C},
Title = {Exploring the Whole Rashomon Set of Sparse Decision
Trees.},
Journal = {Advances in neural information processing
systems},
Volume = {35},
Pages = {14071-14084},
Year = {2022},
Month = {January},
ISBN = {9781713871088},
Abstract = {In any given machine learning problem, there might be many
models that explain the data almost equally well. However,
most learning algorithms return only one of these models,
leaving practitioners with no practical way to explore
alternative models that might have desirable properties
beyond what could be expressed by a loss function. The
\emph{Rashomon set} is the set of all these almost-optimal
models. Rashomon sets can be large in size and complicated
in structure, particularly for highly nonlinear function
classes that allow complex interaction terms, such as
decision trees. We provide the first technique for
completely enumerating the Rashomon set for sparse decision
trees; in fact, our work provides the first complete
enumeration of any Rashomon set for a non-trivial problem
with a highly nonlinear discrete function class. This allows
the user an unprecedented level of control over model choice
among all models that are approximately equally good. We
represent the Rashomon set in a specialized data structure
that supports efficient querying and sampling. We show three
applications of the Rashomon set: 1) it can be used to study
variable importance for the set of almost-optimal trees (as
opposed to a single tree), 2) the Rashomon set for accuracy
enables enumeration of the Rashomon sets for balanced
accuracy and F1-score, and 3) the Rashomon set for a full
dataset can be used to produce Rashomon sets constructed
with only subsets of the data set. Thus, we are able to
examine Rashomon sets across problems with a new lens,
enabling users to choose models rather than be at the mercy
of an algorithm that produces only a single
model.},
Key = {fds371612}
}
@article{fds359953,
Author = {Guo, Z and Ding, C and Hu, X and Rudin, C},
Title = {A supervised machine learning semantic segmentation approach
for detecting artifacts in plethysmography signals from
wearables.},
Journal = {Physiological measurement},
Volume = {42},
Number = {12},
Year = {2021},
Month = {December},
url = {http://dx.doi.org/10.1088/1361-6579/ac3b3d},
Abstract = {OBJECTIVE: Wearable devices equipped with plethysmography
   (PPG) sensors provide a low-cost, long-term solution to
   early diagnosis and continuous screening of heart
   conditions. However, PPG signals collected from such
   devices often suffer from corruption caused by artifacts.
   The objective of this study is to develop an effective
   supervised algorithm to locate the regions of artifacts
   within PPG signals. APPROACH: We treat artifact detection
   as a 1D segmentation problem. We solve it via a novel
   combination of an active-contour-based loss and an adapted
   U-Net architecture. The proposed algorithm was trained on
   the PPG DaLiA training set, and further evaluated on the
   PPG DaLiA testing set, the WESAD dataset, and the TROIKA
   dataset. MAIN RESULTS: We evaluated with the DICE score, a
   well-established metric for segmentation accuracy
   evaluation in the field of computer vision. The proposed
   method outperforms baseline methods on all three datasets
   by a large margin (≈7 percentage points above the next
   best method). On the PPG DaLiA testing set, WESAD dataset,
   and TROIKA dataset, the proposed method achieved 0.8734 ±
   0.0018, 0.9114 ± 0.0033, and 0.8050 ± 0.0116,
   respectively. The next best method achieved only 0.8068 ±
   0.0014, 0.8446 ± 0.0013, and 0.7247 ±
   0.0050. SIGNIFICANCE: The proposed method is able to
   pinpoint exact locations of artifacts with high precision;
   in the past, we had only a binary classification of whether
   a PPG signal has good or poor quality. This more nuanced
   information will be critical to further inform the design
   of algorithms to detect cardiac arrhythmia.},
Doi = {10.1088/1361-6579/ac3b3d},
Key = {fds359953}
}
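%% The DICE score used for evaluation above has a one-line
%% definition; a minimal numpy sketch for binary 1D artifact
%% masks (hypothetical helper, not the authors' code):

import numpy as np

def dice_score(pred, truth):
    # pred, truth: binary 1D arrays marking artifact regions in a PPG signal.
    pred = np.asarray(pred, dtype=bool)
    truth = np.asarray(truth, dtype=bool)
    overlap = np.logical_and(pred, truth).sum()
    denom = pred.sum() + truth.sum()
    # DICE = 2|A ∩ B| / (|A| + |B|); treat empty-vs-empty as perfect agreement.
    return 1.0 if denom == 0 else 2.0 * overlap / denom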
@article{fds361182,
Author = {Barnett, AJ and Schwartz, FR and Tao, C and Chen, C and Ren, Y and Lo, JY and Rudin, C},
Title = {A case-based interpretable deep learning model for
classification of mass lesions in digital
mammography},
Journal = {Nature Machine Intelligence},
Volume = {3},
Number = {12},
Pages = {1061-1070},
Year = {2021},
Month = {December},
url = {http://dx.doi.org/10.1038/s42256-021-00423-x},
Abstract = {Interpretability in machine learning models is important in
high-stakes decisions such as whether to order a biopsy
based on a mammographic exam. Mammography poses important
challenges that are not present in other computer vision
tasks: datasets are small, confounding information is
present and it can be difficult even for a radiologist to
decide between watchful waiting and biopsy based on a
mammogram alone. In this work we present a framework for
interpretable machine learning-based mammography. In
addition to predicting whether a lesion is malignant or
benign, our work aims to follow the reasoning processes of
radiologists in detecting clinically relevant semantic
features of each image, such as the characteristics of the
mass margins. The framework includes a novel interpretable
neural network algorithm that uses case-based reasoning for
mammography. Our algorithm can incorporate a combination of
data with whole image labelling and data with pixel-wise
annotations, leading to better accuracy and interpretability
even with a small number of images. Our interpretable models
are able to highlight the classification-relevant parts of
the image, whereas other methods highlight healthy tissue
and confounding information. Our models are decision
aids—rather than decision makers—and aim for better
overall human–machine collaboration. We do not observe a
loss in mass margin classification accuracy over a black box
neural network trained on the same data.},
Doi = {10.1038/s42256-021-00423-x},
Key = {fds361182}
}
@article{fds359613,
Author = {Coker, B and Rudin, C and King, G},
Title = {A Theory of Statistical Inference for Ensuring the
Robustness of Scientific Results},
Journal = {Management Science},
Volume = {67},
Number = {10},
Pages = {6174-6197},
Year = {2021},
Month = {October},
url = {http://dx.doi.org/10.1287/mnsc.2020.3818},
Abstract = {Inference is the process of using facts we know to learn
about facts we do not know. A theory of inference gives
assumptions necessary to get from the former to the latter,
along with a definition for and summary of the resulting
uncertainty. Any one theory of inference is neither right
nor wrong but merely an axiom that may or may not be useful.
Each of the many diverse theories of inference can be
valuable for certain applications. However, no existing
theory of inference addresses the tendency to choose, from
the range of plausible data analysis specifications
consistent with prior evidence, those that inadvertently
favor one’s own hypotheses. Because the biases from these
choices are a growing concern across scientific fields, and
in a sense the reason the scientific community was invented
in the first place, we introduce a new theory of inference
designed to address this critical problem. We introduce
hacking intervals, which are the range of a summary
statistic one may obtain given a class of possible
endogenous manipulations of the data. Hacking intervals
require no appeal to hypothetical data sets drawn from
imaginary superpopulations. A scientific result with a small
hacking interval is more robust to researcher manipulation
than one with a larger interval and is often easier to
interpret than a classical confidence interval. Some
versions of hacking intervals turn out to be equivalent to
classical confidence intervals, which means they may also
provide a more intuitive and potentially more useful
interpretation of classical confidence intervals.},
Doi = {10.1287/mnsc.2020.3818},
Key = {fds359613}
}
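%% A hacking interval is the range of a summary statistic over
%% a class of allowed manipulations of the data. A toy sketch
%% under that definition (names hypothetical; the paper's
%% constrained and tethered variants are more involved):

import numpy as np

def hacking_interval(statistic, data, manipulations):
    # statistic: maps a dataset to a scalar summary (e.g., a coefficient).
    # manipulations: candidate analysis choices, each returning modified data.
    results = [statistic(m(data)) for m in manipulations]
    return min(results), max(results)

# Example: how far can dropping any single observation move a sample mean?
data = np.array([2.0, 2.1, 1.9, 8.0])
drop_one = [lambda d, i=i: np.delete(d, i) for i in range(len(data))]
lo, hi = hacking_interval(np.mean, data, drop_one)  # a wide interval signals fragility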
@article{fds358740,
Author = {Afnan, MAM and Rudin, C and Conitzer, V and Savulescu, J and Mishra, A and Liu, Y and Afnan, M},
Title = {Ethical Implementation of Artificial Intelligence to Select
Embryos in in Vitro Fertilization},
Journal = {AIES 2021 - Proceedings of the 2021 AAAI/ACM Conference on
AI, Ethics, and Society},
Pages = {316-326},
Year = {2021},
Month = {July},
ISBN = {9781450384735},
url = {http://dx.doi.org/10.1145/3461702.3462589},
Abstract = {AI has the potential to revolutionize many areas of
healthcare. Radiology, dermatology, and ophthalmology are
some of the areas most likely to be impacted in the near
future, and they have received significant attention from
the broader research community. But AI techniques are now
also starting to be used in in vitro fertilization (IVF), in
particular for selecting which embryos to transfer to the
woman. The contribution of AI to IVF is potentially
significant, but must be done carefully and transparently,
as the ethical issues are significant, in part because this
field involves creating new people. We first give a brief
introduction to IVF and review the use of AI for embryo
selection. We discuss concerns with the interpretation of
the reported results from scientific and practical
perspectives. We then consider the broader ethical issues
involved. We discuss in detail the problems that result from
the use of black-box methods in this context and advocate
strongly for the use of interpretable models. Importantly,
there have been no published trials of clinical
effectiveness, a problem in both the AI and IVF communities,
and we therefore argue that clinical implementation at this
point would be premature. Finally, we discuss ways for the
broader AI community to become involved to ensure
scientifically sound and ethically responsible development
of AI in IVF.},
Doi = {10.1145/3461702.3462589},
Key = {fds358740}
}
@article{fds359994,
Author = {Wang, J and Zhang, X and Zhou, Y and Suh, C and Rudin,
C},
Title = {There once was a really bad poet, it was automated but you
didn’t know it},
Journal = {Transactions of the Association for Computational
Linguistics},
Volume = {9},
Pages = {605-620},
Year = {2021},
Month = {July},
url = {http://dx.doi.org/10.1162/tacl_a_00387},
Abstract = {Limerick generation exemplifies some of the most difficult
challenges faced in poetry generation, as the poems must
tell a story in only five lines, with constraints on rhyme,
stress, and meter. To address these challenges, we introduce
LimGen, a novel and fully automated system for limerick
generation that outperforms state-of-the-art neural
network-based poetry models, as well as prior rule-based
poetry models. LimGen consists of three important pieces:
the Adaptive Multi-Templated Constraint algorithm that
constrains our search to the space of realistic poems, the
Multi-Templated Beam Search algorithm which searches
efficiently through the space, and the probabilistic
Storyline algorithm that provides coherent storylines
related to a user-provided prompt word. The resulting
limericks satisfy poetic constraints and have thematically
coherent storylines, which are sometimes even funny (when we
are lucky).},
Doi = {10.1162/tacl_a_00387},
Key = {fds359994}
}
@article{fds361664,
Author = {Barnett, AJ and Schwartz, FR and Tao, C and Chen, C and Ren, Y and Lo, JY and Rudin, C},
Title = {IAIA-BL: A Case-based Interpretable Deep Learning Model for
Classification of Mass Lesions in Digital
Mammography},
Year = {2021},
Month = {March},
Abstract = {Interpretability in machine learning models is important in
high-stakes decisions, such as whether to order a biopsy
based on a mammographic exam. Mammography poses important
challenges that are not present in other computer vision
tasks: datasets are small, confounding information is
present, and it can be difficult even for a radiologist to
decide between watchful waiting and biopsy based on a
mammogram alone. In this work, we present a framework for
interpretable machine learning-based mammography. In
addition to predicting whether a lesion is malignant or
benign, our work aims to follow the reasoning processes of
radiologists in detecting clinically relevant semantic
features of each image, such as the characteristics of the
mass margins. The framework includes a novel interpretable
neural network algorithm that uses case-based reasoning for
mammography. Our algorithm can incorporate a combination of
data with whole image labelling and data with pixel-wise
annotations, leading to better accuracy and interpretability
even with a small number of images. Our interpretable models
are able to highlight the classification-relevant parts of
the image, whereas other methods highlight healthy tissue
and confounding information. Our models are decision aids,
rather than decision makers, aimed at better overall
human-machine collaboration. We do not observe a loss in
mass margin classification accuracy over a black box neural
network trained on the same data.},
Key = {fds361664}
}
@article{fds359954,
Author = {Gupta, NR and Orlandi, V and Chang, C-R and Wang, T and Morucci, M and Dey,
P and Howell, TJ and Sun, X and Ghosal, A and Roy, S and Rudin, C and Volfovsky, A},
Title = {dame-flame: A Python Library Providing Fast Interpretable
Matching for Causal Inference},
Volume = {abs/2101.01867},
Year = {2021},
Month = {January},
Abstract = {dame-flame is a Python package for performing matching for
observational causal inference on datasets containing
discrete covariates. This package implements the Dynamic
Almost Matching Exactly (DAME) and Fast Large-Scale Almost
Matching Exactly (FLAME) algorithms, which match treatment
and control units on subsets of the covariates. The
resulting matched groups are interpretable, because the
matches are made on covariates, and high-quality, because
machine learning is used to determine which covariates are
important to match on. DAME solves an optimization problem
that matches units on as many covariates as possible,
prioritizing matches on important covariates. FLAME
approximates the solution found by DAME via a much faster
backward feature selection procedure. The package provides
several adjustable parameters to adapt the algorithms to
specific applications, and can calculate treatment effect
estimates after matching. Descriptions of these parameters,
details on estimating treatment effects, and further
examples, can be found in the documentation at
https://almost-matching-exactly.github.io/DAME-FLAME-Python-Package/},
Key = {fds359954}
}
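%% A usage sketch following the package documentation linked
%% above; treat the exact signatures and default column names
%% as assumptions to verify against the docs for your version.

import pandas as pd
import dame_flame

# df: discrete covariates plus binary treatment and outcome columns.
df = pd.read_csv("matching_data.csv")   # hypothetical input file
model = dame_flame.matching.FLAME()     # or dame_flame.matching.DAME()
model.fit(df)                           # learns which covariates matter to match on
result = model.predict(df)              # matched groups on subsets of the covariates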
@article{fds359568,
Author = {Wang, Y and Huang, H and Rudin, C and Shaposhnik,
Y},
Title = {Understanding how dimension reduction tools work: An
empirical approach to deciphering t-SNE, UMAP, TriMap, and
PaCMAP for data visualization},
Journal = {Journal of Machine Learning Research},
Volume = {22},
Year = {2021},
Month = {January},
Abstract = {Dimension reduction (DR) techniques such as t-SNE, UMAP, and
TriMap have demonstrated impressive visualization
performance on many real-world datasets. One tension that
has always faced these methods is the trade-off between
preservation of global structure and preservation of local
structure: these methods can either handle one or the other,
but not both. In this work, our main goal is to understand
what aspects of DR methods are important for preserving both
local and global structure: it is difficult to design a
better method without a true understanding of the choices we
make in our algorithms and their empirical impact on the
low-dimensional embeddings they produce. Towards the goal of
local structure preservation, we provide several useful
design principles for DR loss functions based on our new
understanding of the mechanisms behind successful DR
methods. Towards the goal of global structure preservation,
our analysis illuminates that the choice of which components
to preserve is important. We leverage these insights to
design a new algorithm for DR, called Pairwise Controlled
Manifold Approximation Projection (PaCMAP), which preserves
both local and global structure. Our work provides several
unexpected insights into what design choices both to make
and avoid when constructing DR algorithms.},
Key = {fds359568}
}
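%% PaCMAP is available as a Python package; a minimal usage
%% sketch (parameter values illustrative; see the package
%% documentation for defaults):

import numpy as np
import pacmap

X = np.random.rand(1000, 50)              # any high-dimensional data
embedder = pacmap.PaCMAP(n_components=2)  # preserves local and global structure
X_2d = embedder.fit_transform(X)          # (1000, 2) embedding for plotting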
@article{fds346814,
Author = {Wang, T and Morucci, M and Awan, MU and Liu, Y and Roy, S and Rudin, C and Volfovsky, A},
Title = {FLAME: A fast large-scale almost matching exactly approach
to causal inference},
Journal = {Journal of Machine Learning Research},
Volume = {22},
Pages = {31:1-31:41},
Year = {2021},
Month = {January},
Abstract = {A classical problem in causal inference is that of matching,
where treatment units need to be matched to control units
based on covariate information. In this work, we propose a
method that computes high quality almost-exact matches for
high-dimensional categorical datasets. This method, called
FLAME (Fast Large-scale Almost Matching Exactly), learns a
distance metric for matching using a hold-out training data
set. In order to perform matching efficiently for large
datasets, FLAME leverages techniques that are natural for
query processing in the area of database management, and two
implementations of FLAME are provided: The first uses SQL
queries and the second uses bit-vector techniques. The
algorithm starts by constructing matches of the highest
quality (exact matches on all covariates), and successively
eliminates variables in order to match exactly on as many
variables as possible, while still maintaining interpretable
high-quality matches and balance between treatment and
control groups. We leverage these high quality matches to
estimate conditional average treatment effects (CATEs). Our
experiments show that FLAME scales to huge datasets with
millions of observations where existing state-of-the-art
methods fail, and that it achieves significantly better
performance than other matching methods.},
Key = {fds346814}
}
@article{fds356437,
Author = {Tracà, S and Rudin, C and Yan, W},
Title = {Regulating greed over time in multi-armed
bandits},
Journal = {Journal of Machine Learning Research},
Volume = {22},
Year = {2021},
Month = {January},
Abstract = {In retail, there are predictable yet dramatic time-dependent
patterns in customer behavior, such as periodic changes in
the number of visitors, or increases in customers just
before major holidays. The current paradigm of multi-armed
bandit analysis does not take these known patterns into
account. This means that for applications in retail, where
prices are fixed for periods of time, current bandit
algorithms will not suffice. This work provides a remedy
that takes the time-dependent patterns into account, and we
show how this remedy is implemented for the UCB,
ε-greedy, and UCB-L algorithms, and also through a new
policy called the variable arm pool algorithm. In the
corrected methods, exploitation (greed) is regulated over
time, so that more exploitation occurs during higher reward
periods, and more exploration occurs in periods of low
reward. In order to understand why regret is reduced with
the corrected methods, we present a set of bounds that
provide insight into why we would want to exploit during
periods of high reward, and discuss the impact on regret.
Our proposed methods perform well in experiments, and were
inspired by a high-scoring entry in the Exploration and
Exploitation 3 contest using data from Yahoo! Front Page.
That entry heavily used time-series methods to regulate
greed over time, which was substantially more effective than
other contextual bandit methods.},
Key = {fds356437}
}
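%% The core idea above, regulating greed by a known
%% time-dependent reward pattern, in a toy ε-greedy variant.
%% This illustrates the idea only; it is not the paper's
%% corrected UCB, ε-greedy, or variable arm pool algorithms.

import numpy as np

def regulated_epsilon_greedy(true_means, horizon, base_eps, reward_level, seed=0):
    # reward_level(t) in [0, 1] encodes the known reward pattern over time;
    # exploitation rises in high-reward periods, exploration in low ones.
    rng = np.random.default_rng(seed)
    k = len(true_means)
    counts, estimates = np.zeros(k), np.zeros(k)
    total_reward = 0.0
    for t in range(horizon):
        eps = base_eps * (1.0 - reward_level(t))   # regulate greed over time
        arm = rng.integers(k) if rng.random() < eps else int(np.argmax(estimates))
        reward = true_means[arm] * (0.5 + reward_level(t)) + rng.normal(scale=0.1)
        counts[arm] += 1
        estimates[arm] += (reward - estimates[arm]) / counts[arm]
        total_reward += reward
    return total_reward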
@article{fds357549,
Author = {Koyyalagunta, D and Sun, A and Draelos, RL and Rudin,
C},
Title = {Playing codenames with language graphs and word
embeddings},
Journal = {Journal of Artificial Intelligence Research},
Volume = {71},
Pages = {319-346},
Year = {2021},
Month = {January},
url = {http://dx.doi.org/10.1613/jair.1.12665},
Abstract = {Although board games and video games have been studied for
decades in artificial intelligence research, challenging
word games remain relatively unexplored. Word games are not
as constrained as games like chess or poker. Instead, word
game strategy is defined by the players' understanding of
the way words relate to each other. The word game Codenames
provides a unique opportunity to investigate common sense
understanding of relationships between words, an important
open challenge. We propose an algorithm that can generate
Codenames clues from the language graph BabelNet or from any
of several embedding methods - word2vec, GloVe, fastText or
BERT. We introduce a new scoring function that measures the
quality of clues, and we propose a weighting term called
DETECT that incorporates dictionary-based word
representations and document frequency to improve clue
selection. We develop BabelNet-Word Selection Framework
(BabelNet-WSF) to improve BabelNet clue quality and overcome
the computational barriers that previously prevented
leveraging language graphs for Codenames. Extensive
experiments with human evaluators demonstrate that our
proposed innovations yield state-of-the-art performance,
with up to 102.8% improvement in precision@2 in some cases.
Overall, this work advances the formal study of word games
and approaches for common sense language
understanding.},
Doi = {10.1613/jair.1.12665},
Key = {fds357549}
}
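%% The embedding-based clue scoring can be sketched with cosine
%% similarity; a toy score (the paper's scoring function and
%% DETECT weighting are more elaborate, and the vectors here
%% are placeholders for word2vec/GloVe/fastText/BERT vectors):

import numpy as np

def cosine(u, v):
    return float(u @ v / (np.linalg.norm(u) * np.linalg.norm(v)))

def clue_score(clue_vec, team_vecs, opponent_vecs):
    # Reward closeness to the team's words; penalize the riskiest opponent word.
    team = np.mean([cosine(clue_vec, w) for w in team_vecs])
    risk = max(cosine(clue_vec, w) for w in opponent_vecs)
    return team - risk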
@article{fds373532,
Author = {Afnan, MAM and Liu, Y and Conitzer, V and Rudin, C and Mishra, A and Savulescu, J and Afnan, M},
Title = {Interpretable, not black-box, artificial intelligence should
be used for embryo selection.},
Journal = {Human reproduction open},
Volume = {2021},
Number = {4},
Pages = {hoab040},
Year = {2021},
Month = {January},
url = {http://dx.doi.org/10.1093/hropen/hoab040},
Abstract = {Artificial intelligence (AI) techniques are starting to be
used in IVF, in particular for selecting which embryos to
transfer to the woman. AI has the potential to process
complex data sets, to be better at identifying subtle but
important patterns, and to be more objective than humans
when evaluating embryos. However, a current review of the
literature shows much work is still needed before AI can be
ethically implemented for this purpose. No randomized
controlled trials (RCTs) have been published, and the
efficacy studies which exist demonstrate that algorithms can
broadly differentiate well between 'good-' and 'poor-'
quality embryos but not necessarily between embryos of
similar quality, which is the actual clinical need. Almost
universally, the AI models were opaque ('black-box') in that
at least some part of the process was uninterpretable. This
gives rise to a number of epistemic and ethical concerns,
including problems with trust, the possibility of using
algorithms that generalize poorly to different populations,
adverse economic implications for IVF clinics, potential
misrepresentation of patient values, broader societal
implications, a responsibility gap in the case of poor
selection choices and introduction of a more paternalistic
decision-making process. Use of interpretable models, which
are constrained so that a human can easily understand and
explain them, could overcome these concerns. The
contribution of AI to IVF is potentially significant, but we
recommend that AI models used in this field should be
interpretable, and rigorously evaluated with RCTs before
implementation. We also recommend long-term follow-up of
children born after AI for embryo selection, regulatory
oversight for implementation, and public availability of
data and code to enable research teams to independently
reproduce and validate existing models.},
Doi = {10.1093/hropen/hoab040},
Key = {fds373532}
}
@article{fds354204,
Author = {Chen, Z and Bei, Y and Rudin, C},
Title = {Concept whitening for interpretable image
recognition},
Journal = {Nature Machine Intelligence},
Volume = {2},
Number = {12},
Pages = {772-782},
Year = {2020},
Month = {December},
url = {http://dx.doi.org/10.1038/s42256-020-00265-z},
Abstract = {What does a neural network encode about a concept as we
traverse through the layers? Interpretability in machine
learning is undoubtedly important, but the calculations of
neural networks are very challenging to understand. Attempts
to see inside their hidden layers can be misleading,
unusable or rely on the latent space to possess properties
that it may not have. Here, rather than attempting to
analyse a neural network post hoc, we introduce a mechanism,
called concept whitening (CW), to alter a given layer of the
network to allow us to better understand the computation
leading up to that layer. When a concept whitening module is
added to a convolutional neural network, the latent space is
whitened (that is, decorrelated and normalized) and the axes
of the latent space are aligned with known concepts of
interest. By experiment, we show that CW can provide us with
a much clearer understanding of how the network gradually
learns concepts over layers. CW is an alternative to a batch
normalization layer in that it normalizes, and also
decorrelates (whitens), the latent space. CW can be used in
any layer of the network without hurting predictive
performance.},
Doi = {10.1038/s42256-020-00265-z},
Key = {fds354204}
}
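%% The whitening step that CW adds to a layer (zero-mean,
%% decorrelated, unit-variance latent axes) is standard; a
%% numpy ZCA sketch follows. CW additionally learns a rotation
%% aligning the whitened axes with labelled concepts, which
%% this sketch omits.

import numpy as np

def zca_whiten(Z, eps=1e-5):
    # Z: (n_samples, n_features) latent activations.
    Zc = Z - Z.mean(axis=0)                 # center
    cov = Zc.T @ Zc / (len(Zc) - 1)
    vals, vecs = np.linalg.eigh(cov)        # eigendecompose the covariance
    W = vecs @ np.diag(1.0 / np.sqrt(vals + eps)) @ vecs.T
    return Zc @ W                           # decorrelated, unit-variance axes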
@article{fds354205,
Author = {Dong, J and Rudin, C},
Title = {Exploring the cloud of variable importance for the set of
all good models},
Journal = {Nature Machine Intelligence},
Volume = {2},
Number = {12},
Pages = {810-824},
Year = {2020},
Month = {December},
url = {http://dx.doi.org/10.1038/s42256-020-00264-0},
Abstract = {Variable importance is central to scientific studies,
including the social sciences and causal inference,
healthcare and other domains. However, current notions of
variable importance are often tied to a specific predictive
model. This is problematic: what if there were multiple
well-performing predictive models, and a specific variable
is important to some of them but not to others? In that
case, we cannot tell from a single well-performing model if
a variable is always important, sometimes important, never
important or perhaps only important when another variable is
not important. Ideally, we would like to explore variable
importance for all approximately equally accurate predictive
models within the same model class. In this way, we can
understand the importance of a variable in the context of
other variables, and for many good models. This work
introduces the concept of a variable importance cloud, which
maps every variable to its importance for every good
predictive model. We show properties of the variable
importance cloud and draw connections to other areas of
statistics. We introduce variable importance diagrams as a
projection of the variable importance cloud into two
dimensions for visualization purposes. Experiments with
criminal justice, marketing data and image classification
tasks illustrate how variables can change dramatically in
importance for approximately equally accurate predictive
models.},
Doi = {10.1038/s42256-020-00264-0},
Key = {fds354205}
}
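%% Each point of the variable importance cloud is one model's
%% VI vector; a minimal permutation-importance sketch for a
%% single model and variable (hypothetical helper):

import numpy as np

def permutation_vi(model, X, y, loss, j, n_repeats=10, seed=0):
    # VI of feature j: average increase in loss when column j is shuffled.
    rng = np.random.default_rng(seed)
    base = loss(y, model.predict(X))
    deltas = []
    for _ in range(n_repeats):
        Xp = X.copy()
        rng.shuffle(Xp[:, j])               # break the feature's association
        deltas.append(loss(y, model.predict(Xp)) - base)
    return float(np.mean(deltas))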
@article{fds359959,
Author = {Huang, Q and Zhou, Y and Du, X and Chen, R and Wang, J and Rudin, C and Bartesaghi, A},
Title = {Cryo-ZSSR: multiple-image super-resolution based on deep
internal learning},
Volume = {abs/2011.11020},
Year = {2020},
Month = {November},
Abstract = {Single-particle cryo-electron microscopy (cryo-EM) is an
emerging imaging modality capable of visualizing proteins
and macro-molecular complexes at near-atomic resolution. The
low electron doses used to prevent sample radiation damage
result in images where the power of the noise is 100 times
greater than the power of the signal. To overcome the
low SNRs, hundreds of thousands of particle projections
acquired over several days of data collection are averaged
in 3D to determine the structure of interest. Meanwhile,
recent image super-resolution (SR) techniques based on
neural networks have shown state-of-the-art performance on
natural images. Building on these advances, we present a
multiple-image SR algorithm based on deep internal learning
designed specifically to work under low-SNR conditions. Our
approach leverages the internal image statistics of cryo-EM
movies and does not require training on ground-truth data.
When applied to a single-particle dataset of apoferritin, we
show that the resolution of 3D structures obtained from SR
micrographs can surpass the limits imposed by the imaging
system. Our results indicate that the combination of low
magnification imaging with image SR has the potential to
accelerate cryo-EM data collection without sacrificing
resolution.},
Key = {fds359959}
}
@article{fds353866,
Author = {Wang, T and Ye, W and Geng, D and Rudin, C},
Title = {Towards Practical Lipschitz Bandits},
Journal = {FODS 2020 - Proceedings of the 2020 ACM-IMS Foundations of
Data Science Conference},
Pages = {129-138},
Year = {2020},
Month = {October},
ISBN = {9781450381031},
url = {http://dx.doi.org/10.1145/3412815.3416885},
Abstract = {Stochastic Lipschitz bandit algorithms balance exploration
and exploitation, and have been used for a variety of
important task domains. In this paper, we present a
framework for Lipschitz bandit methods that adaptively
learns partitions of context-and arm-space. Due to this
flexibility, the algorithm is able to efficiently optimize
rewards and minimize regret, by focusing on the portions of
the space that are most relevant. In our analysis, we link
tree-based methods to Gaussian processes. In light of our
analysis, we design a novel hierarchical Bayesian model for
Lipschitz bandit problems. Our experiments show that our
algorithms can achieve state-of-the-art performance in
challenging real-world tasks such as neural network
hyperparameter tuning.},
Doi = {10.1145/3412815.3416885},
Key = {fds353866}
}
@article{fds359955,
Author = {Rich, AS and Rudin, C and Jacoby, DMP and Freeman, R and Wearn, OR and Shevlin, H and Dihal, K and ÓhÉigeartaigh, SS and Butcher, J and Lippi, M and Palka, P and Torroni, P and Wongvibulsin, S and Begoli, E and Schneider, G and Cave, S and Sloane, M and Moss, E and Rahwan, I and Goldberg, K and Howard, D and Floridi, L and Stilgoe,
J},
Title = {AI reflections in 2019},
Journal = {Nature Machine Intelligence},
Volume = {2},
Number = {1},
Pages = {2-9},
Publisher = {Springer Science and Business Media LLC},
Year = {2020},
Month = {January},
url = {http://dx.doi.org/10.1038/s42256-019-0141-1},
Doi = {10.1038/s42256-019-0141-1},
Key = {fds359955}
}
@article{fds359958,
Author = {Morucci, M and Orlandi, V and Rudin, C and Roy, S and Volfovsky,
A},
Title = {Adaptive Hyper-box Matching for Interpretable Individualized
Treatment Effect Estimation},
Journal = {Proceedings of Machine Learning Research},
Volume = {124},
Pages = {1089-1098},
Publisher = {AUAI Press},
Editor = {Adams, RP and Gogate, V},
Year = {2020},
Month = {January},
Abstract = {We propose a matching method for observational data that
matches units with others in unit-specific, hyper-box-shaped
regions of the covariate space. These regions are large
enough that many matches are created for each unit and small
enough that the treatment effect is roughly constant
throughout. The regions are found either as the solution to
a mixed integer program or via a (fast) approximation
algorithm. The result is an interpretable and tailored
estimate of the causal effect for each unit.},
Key = {fds359958}
}
@article{fds359960,
Author = {Gregory, H and Li, S and Mohammadi, P and Tarn, N and Draelos, R and Rudin,
C},
Title = {A transformer approach to contextual sarcasm detection in
twitter},
Journal = {Proceedings of the Annual Meeting of the Association for
Computational Linguistics},
Pages = {270-275},
Year = {2020},
Month = {January},
ISBN = {9781952148125},
url = {http://dx.doi.org/10.18653/v1/P17},
Abstract = {Understanding tone in Twitter posts will be increasingly
important as more and more communication moves online. One
of the most difficult, yet important tones to detect is
sarcasm. In the past, LSTM and transformer architecture
models have been used to tackle this problem. We attempt to
expand upon this research, implementing LSTM, GRU, and
transformer models, and exploring new methods to classify
sarcasm in Twitter posts. Among these, the most successful
were transformer models, most notably BERT. While we
attempted a few other models described in this paper, our
most successful model was an ensemble of transformer models
including BERT, RoBERTa, XLNet, RoBERTa-large, and ALBERT.
This research was performed in conjunction with the sarcasm
detection shared task section in the Second Workshop on
Figurative Language Processing, co-located with ACL
2020.},
Doi = {10.18653/v1/P17},
Key = {fds359960}
}
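%% Transformer classifiers of the kind ensembled above are
%% conveniently exercised via the Hugging Face pipeline API; a
%% minimal sketch with a placeholder checkpoint (it does not
%% reproduce the paper's fine-tuned BERT/RoBERTa/XLNet/ALBERT
%% ensemble):

from transformers import pipeline

# Substitute any checkpoint fine-tuned for sarcasm or sentiment here.
clf = pipeline("text-classification", model="bert-base-uncased")
print(clf("Oh great, another Monday morning."))  # label plus confidence score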
@article{fds359957,
Author = {Awan, MU and Morucci, M and Orlandi, V and Roy, S and Rudin, C and Volfovsky, A},
Title = {Almost-Matching-Exactly for Treatment Effect Estimation
under Network Interference},
Journal = {Proceedings of Machine Learning Research},
Volume = {108},
Pages = {3252-3262},
Publisher = {PMLR},
Editor = {Chiappa, S and Calandra, R},
Year = {2020},
Month = {January},
Abstract = {We propose a matching method that recovers direct treatment
effects from randomized experiments where units are
connected in an observed network, and units that share edges
can potentially influence each others' outcomes. Traditional
treatment effect estimators for randomized experiments are
biased and error prone in this setting. Our method matches
units almost exactly on counts of unique subgraphs within
their neighborhood graphs. The matches that we construct are
interpretable and high-quality. Our method can be extended
easily to accommodate additional unit-level covariate
information. We show empirically that our method performs
better than other existing methodologies for this problem,
while producing meaningful, interpretable
results.},
Key = {fds359957}
}
@article{fds353047,
Author = {Menon, S and Damian, A and Hu, S and Ravi, N and Rudin,
C},
Title = {PULSE: Self-Supervised Photo Upsampling via Latent Space
Exploration of Generative Models},
Journal = {Proceedings of the IEEE Computer Society Conference on
Computer Vision and Pattern Recognition},
Pages = {2434-2442},
Year = {2020},
Month = {January},
url = {http://dx.doi.org/10.1109/CVPR42600.2020.00251},
Abstract = {The primary aim of single-image super-resolution is to
construct a high-resolution (HR) image from a corresponding
low-resolution (LR) input. In previous approaches, which
have generally been supervised, the training objective
typically measures a pixel-wise average distance between the
super-resolved (SR) and HR images. Optimizing such metrics
often leads to blurring, especially in high variance
(detailed) regions. We propose an alternative formulation of
the super-resolution problem based on creating realistic SR
images that downscale correctly. We present a novel
super-resolution algorithm addressing this problem, PULSE
(Photo Upsampling via Latent Space Exploration), which
generates high-resolution, realistic images at resolutions
previously unseen in the literature. It accomplishes this in
an entirely self-supervised fashion and is not confined to a
specific degradation operator used during training, unlike
previous methods (which require training on databases of
LR-HR image pairs for supervised learning). Instead of
starting with the LR image and slowly adding detail, PULSE
traverses the high-resolution natural image manifold,
searching for images that downscale to the original LR
image. This is formalized through the 'downscaling loss,'
which guides exploration through the latent space of a
generative model. By leveraging properties of
high-dimensional Gaussians, we restrict the search space to
guarantee that our outputs are realistic. PULSE thereby
generates super-resolved images that both are realistic and
downscale correctly. We show extensive experimental results
demonstrating the efficacy of our approach in the domain of
face super-resolution (also known as face hallucination).
Our method outperforms state-of-the-art methods in
perceptual quality at higher resolutions and scale factors
than previously possible.},
Doi = {10.1109/CVPR42600.2020.00251},
Key = {fds353047}
}
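%% The 'downscaling loss' described above has a compact form;
%% a PyTorch sketch (the full method searches a generative
%% model's latent space under this loss, which is omitted
%% here):

import torch.nn.functional as F

def downscaling_loss(sr, lr, scale):
    # sr: (N, C, H, W) candidate super-resolved image; lr: its low-res input.
    ds = F.interpolate(sr, scale_factor=1.0 / scale, mode="bicubic",
                       align_corners=False)
    return F.mse_loss(ds, lr)   # penalize SR images that do not downscale to lr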
@article{fds356438,
Author = {Wang, T and Rudin, C},
Title = {Bandits for bmo functions},
Journal = {37th International Conference on Machine Learning, ICML
2020},
Volume = {PartF168147-13},
Pages = {9938-9948},
Year = {2020},
Month = {January},
ISBN = {9781713821120},
Abstract = {We study the bandit problem where the underlying expected
reward is a Bounded Mean Oscillation (BMO) function. BMO
functions are allowed to be discontinuous and unbounded, and
are useful in modeling signals with infinities in the
domain. We develop a toolset for BMO bandits, and provide an
algorithm that can achieve poly-log δ-regret, a regret
measured against an arm that is optimal after removing a
δ-sized portion of the arm space.},
Key = {fds356438}
}
@article{fds356439,
Author = {Lin, J and Zhong, C and Hu, D and Rudin, C and Seltzer,
M},
Title = {Generalized and scalable optimal sparse decision
trees},
Journal = {37th International Conference on Machine Learning, ICML
2020},
Volume = {PartF168147-8},
Pages = {6106-6116},
Year = {2020},
Month = {January},
ISBN = {9781713821120},
Abstract = {Decision tree optimization is notoriously difficult from a
computational perspective but essential for the field of
interpretable machine learning. Despite efforts over the
past 40 years, only recently have optimization breakthroughs
been made that have allowed practical algorithms to find
optimal decision trees. These new techniques have the
potential to trigger a paradigm shift where it is possible
to construct sparse decision trees to efficiently optimize a
variety of objective functions without relying on greedy
splitting and pruning heuristics that often lead to
suboptimal solutions. The contribution in this work is to
provide a general framework for decision tree optimization
that addresses the two significant open problems in the
area: treatment of imbalanced data and fully optimizing over
continuous variables. We present techniques that produce
optimal decision trees over a variety of objectives
including F-score, AUC, and partial area under the ROC
convex hull. We also introduce a scalable algorithm that
produces provably optimal results in the presence of
continuous variables and speeds up decision tree
construction by several orders of magnitude relative to the
state of the art.},
Key = {fds356439}
}
@article{fds359956,
Author = {Morucci, M and Orlandi, V and Rudin, C and Roy, S and Volfovsky,
A},
Title = {Adaptive Hyper-box Matching for Interpretable Individualized
Treatment Effect Estimation},
Journal = {CONFERENCE ON UNCERTAINTY IN ARTIFICIAL INTELLIGENCE (UAI
2020)},
Volume = {124},
Pages = {1089-1098},
Year = {2020},
Key = {fds359956}
}
@article{fds349988,
Author = {Awan, MU and Roy, S and Morucci, M and Rudin, C and Orlandi, V and Volfovsky, A},
Title = {Almost-Matching-Exactly for Treatment Effect Estimation
under Network Interference},
Journal = {INTERNATIONAL CONFERENCE ON ARTIFICIAL INTELLIGENCE AND
STATISTICS, VOL 108},
Volume = {108},
Pages = {3252-3261},
Year = {2020},
Key = {fds349988}
}
@article{fds359961,
Author = {Gregory, H and Li, S and Mohammadi, P and Tarn, N and Draelos, R and Rudin,
C},
Title = {A Transformer Approach to Contextual Sarcasm Detection in
Twitter},
Journal = {FIGURATIVE LANGUAGE PROCESSING},
Pages = {270-275},
Year = {2020},
Key = {fds359961}
}
@article{fds346696,
Author = {Wang, F and Rudin, C and Mccormick, TH and Gore, JL},
Title = {Modeling recovery curves with application to
prostatectomy.},
Journal = {Biostatistics (Oxford, England)},
Volume = {20},
Number = {4},
Pages = {549-564},
Year = {2019},
Month = {October},
url = {http://dx.doi.org/10.1093/biostatistics/kxy002},
Abstract = {In many clinical settings, a patient outcome takes the form
of a scalar time series with a recovery curve shape, which
is characterized by a sharp drop due to a disruptive event
(e.g., surgery) and subsequent monotonic smooth rise towards
an asymptotic level not exceeding the pre-event value. We
propose a Bayesian model that predicts recovery curves based
on information available before the disruptive event. A
recovery curve of interest is the quantified sexual function
of prostate cancer patients after prostatectomy surgery. We
illustrate the utility of our model as a pre-treatment
medical decision aid, producing personalized predictions
that are both interpretable and accurate. We uncover
covariate relationships that agree with and supplement those
in the existing medical literature.},
Doi = {10.1093/biostatistics/kxy002},
Key = {fds346696}
}
@article{fds346811,
Author = {Rudin, C},
Title = {Do Simpler Models Exist and How Can We Find
Them?},
Journal = {Proceedings of the 25th ACM SIGKDD International Conference
on Knowledge Discovery & Data Mining},
Publisher = {ACM},
Year = {2019},
Month = {July},
ISBN = {9781450362016},
url = {http://dx.doi.org/10.1145/3292500.3330823},
Doi = {10.1145/3292500.3330823},
Key = {fds346811}
}
@article{fds348375,
Author = {Ustun, B and Rudin, C},
Title = {Learning optimized risk scores},
Journal = {Journal of Machine Learning Research},
Volume = {20},
Year = {2019},
Month = {June},
Abstract = {Risk scores are simple classification models that let users
make quick risk predictions by adding and subtracting a few
small numbers. These models are widely used in medicine and
criminal justice, but are difficult to learn from data
because they need to be calibrated, sparse, use small
integer coefficients, and obey application-specific
constraints. In this paper, we introduce a machine learning
method to learn risk scores. We formulate the risk score
problem as a mixed integer nonlinear program, and present a
cutting plane algorithm to recover its optimal solution. We
improve our algorithm with specialized techniques that
generate feasible solutions, narrow the optimality gap, and
reduce data-related computation. Our algorithm can train
risk scores in a way that scales linearly in the number of
samples in a dataset, and that allows practitioners to
address application-specific constraints without parameter
tuning or post-processing. We benchmark the performance of
different methods to learn risk scores on publicly available
datasets, comparing risk scores produced by our method to
risk scores built using methods that are used in practice.
We also discuss the practical benefits of our method through
a real-world application where we build a customized risk
score for ICU seizure prediction in collaboration with the
Massachusetts General Hospital.},
Key = {fds348375}
}
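%% A risk score of the kind described above fits in a few
%% lines; a hypothetical three-feature example (point values
%% and intercept invented for illustration, not a validated
%% score):

import math

POINTS = {"age_over_60": 2, "prior_seizure": 3, "abnormal_eeg": 1}  # small integers
INTERCEPT = -4                                                      # calibration offset

def predicted_risk(patient):
    # Add points for each present risk factor, then map through a logistic link.
    score = INTERCEPT + sum(p for feat, p in POINTS.items() if patient.get(feat))
    return 1.0 / (1.0 + math.exp(-score))

print(predicted_risk({"age_over_60": True, "prior_seizure": True}))  # score 1 -> ~0.73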
@article{fds344810,
Author = {Rudin, C and Shaposhnik, Y},
Title = {Globally-Consistent Rule-Based Summary-Explanations for
Machine Learning Models: Application to Credit-Risk
Evaluation},
Journal = {Journal of Machine Learning Research},
Volume = {24},
Year = {2019},
Month = {May},
Key = {fds344810}
}
@article{fds343582,
Author = {Bravo, F and Rudin, C and Shaposhnik, Y and Yuan,
Y},
Title = {Simple Rules for Predicting Congestion Risk in Queueing
Systems: Application to ICUs},
Year = {2019},
Month = {May},
Key = {fds343582}
}
@article{fds351188,
Author = {Rudin, C},
Title = {Stop Explaining Black Box Machine Learning Models for High
Stakes Decisions and Use Interpretable Models
Instead.},
Journal = {Nature Machine Intelligence},
Volume = {1},
Number = {5},
Pages = {206-215},
Year = {2019},
Month = {May},
url = {http://dx.doi.org/10.1038/s42256-019-0048-x},
Abstract = {Black box machine learning models are currently being used
for high stakes decision-making throughout society, causing
problems throughout healthcare, criminal justice, and in
other domains. People have hoped that creating methods for
explaining these black box models will alleviate some of
these problems, but trying to <i>explain</i> black box
models, rather than creating models that are
<i>interpretable</i> in the first place, is likely to
perpetuate bad practices and can potentially cause
catastrophic harm to society. There is a way forward - it is
to design models that are inherently interpretable. This
manuscript clarifies the chasm between explaining black
boxes and using inherently interpretable models, outlines
several key reasons why explainable black boxes should be
avoided in high-stakes decisions, identifies challenges to
interpretable machine learning, and provides several example
applications where interpretable models could potentially
replace black box models in criminal justice, healthcare,
and computer vision.},
Doi = {10.1038/s42256-019-0048-x},
Key = {fds351188}
}
@article{fds346809,
Author = {Dieng, A and Liu, Y and Roy, S and Rudin, C and Volfovsky,
A},
Title = {Interpretable Almost-Exact Matching for Causal
Inference.},
Journal = {Proceedings of machine learning research},
Volume = {89},
Pages = {2445-2453},
Year = {2019},
Month = {April},
Abstract = {Matching methods are heavily used in the social and health
sciences due to their interpretability. We aim to create the
highest possible quality of treatment-control matches for
categorical data in the potential outcomes framework. The
method proposed in this work aims to match units on a
weighted Hamming distance, taking into account the relative
importance of the covariates; the algorithm aims to match
units on as many relevant variables as possible. To do this,
the algorithm creates a hierarchy of covariate combinations
on which to match (similar to downward closure), in the
process solving an optimization problem for each unit in
order to construct the optimal matches. The algorithm uses a
single dynamic program to solve all of the units'
optimization problems simultaneously. Notable advantages of
our method over existing matching procedures are its
high-quality interpretable matches, versatility in handling
different data distributions that may have irrelevant
variables, and ability to handle missing data by matching on
as many available covariates as possible.},
Key = {fds346809}
}
@article{fds330649,
Author = {Ban, GY and Rudin, C},
Title = {The big Data newsvendor: Practical insights from machine
learning},
Journal = {Operations Research},
Volume = {67},
Number = {1},
Pages = {90-108},
Year = {2019},
Month = {January},
url = {http://dx.doi.org/10.1287/opre.2018.1757},
Abstract = {We investigate the data-driven newsvendor problem when one
has n observations of p features related to the demand as
well as historical demand data. Rather than a two-step
process of first estimating a demand distribution then
optimizing for the optimal order quantity, we propose
solving the “big data” newsvendor problem via
single-step machine-learning algorithms. Specifically, we
propose algorithms based on the empirical risk minimization
(ERM) principle, with and without regularization, and an
algorithm based on kernel-weights optimization (KO). The ERM
approaches, equivalent to high-dimensional quantile
regression, can be solved by convex optimization problems
and the KO approach by a sorting algorithm. We analytically
justify the use of features by showing that their omission
yields inconsistent decisions. We then derive finite-sample
performance bounds on the out-of-sample costs of the
feature-based algorithms, which quantify the effects of
dimensionality and cost parameters. Our bounds, based on
algorithmic stability theory, generalize known analyses for
the newsvendor problem without feature information. Finally,
we apply the feature-based algorithms for nurse staffing in
a hospital emergency room using a data set from a large UK
teaching hospital and find that (1) the best ERM and KO
algorithms beat the best practice benchmark by 23% and 24%,
respectively, in the out-of-sample cost, and (2) the best KO
algorithm is faster than the best ERM algorithm by three
orders of magnitude and the best practice benchmark by two
orders of magnitude.},
Doi = {10.1287/opre.2018.1757},
Key = {fds330649}
}
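%% The ERM approach above amounts to quantile regression at
%% the newsvendor critical ratio b/(b+h); a scikit-learn
%% sketch on synthetic data (costs and features invented for
%% illustration):

import numpy as np
from sklearn.linear_model import QuantileRegressor

b, h = 9.0, 1.0          # underage (lost-sale) and overage (holding) costs
tau = b / (b + h)        # critical ratio: order the 0.9 demand quantile

rng = np.random.default_rng(0)
X = rng.normal(size=(500, 3))                      # demand-related features
demand = 50 + 10 * X[:, 0] + rng.normal(scale=5, size=500)

# Single-step ERM: fit the tau-th conditional quantile of demand directly,
# rather than estimating a demand distribution and then optimizing.
model = QuantileRegressor(quantile=tau, alpha=0.0, solver="highs")
model.fit(X, demand)
orders = model.predict(X[:5])                      # feature-based order quantities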
@article{fds346810,
Author = {Usaid Awan and M and Liu, Y and Morucci, M and Roy, S and Rudin, C and Volfovsky, A},
Title = {Interpretable almost-matching-exactly with instrumental
variables},
Journal = {35th Conference on Uncertainty in Artificial Intelligence,
UAI 2019},
Year = {2019},
Month = {January},
Abstract = {Uncertainty in the
estimation of the causal effect in observational studies is
often due to unmeasured confounding, i.e., the presence of
unobserved covariates linking treatments and outcomes.
Instrumental Variables (IV) are commonly used to reduce the
effects of unmeasured confounding. Existing methods for IV
estimation either require strong parametric assumptions, use
arbitrary distance metrics, or do not scale well to large
datasets. We propose a matching framework for IV in the
presence of observed categorical confounders that addresses
these weaknesses. Our method first matches units exactly,
and then consecutively drops variables to approximately
match the remaining units on as many variables as possible.
We show that our algorithm constructs better matches than
other existing methods on simulated datasets, and we produce
interesting results in an application to political
canvassing.},
Key = {fds346810}
}
@article{fds348374,
Author = {Fisher, A and Rudin, C and Dominici, F},
Title = {All Models are Wrong, but Many are Useful: Learning a
Variable's Importance by Studying an Entire Class of
Prediction Models Simultaneously.},
Journal = {Journal of machine learning research : JMLR},
Volume = {20},
Pages = {177},
Year = {2019},
Month = {January},
Abstract = {Variable importance (VI) tools describe how much covariates
contribute to a prediction model's accuracy. However,
important variables for one well-performing model (for
example, a linear model <i>f</i>(x) =
x<sup><i>T</i></sup><i>β</i> with a fixed coefficient vector
<i>β</i>) may be unimportant for another model. In this
paper, we propose model class reliance (MCR) as the range of
VI values across <i>all</i> well-performing models in a
prespecified class.
Thus, MCR gives a more comprehensive description of
importance by accounting for the fact that many prediction
models, possibly of different parametric forms, may fit the
data well. In the process of deriving MCR, we show several
informative results for permutation-based VI estimates,
based on the VI measures used in Random Forests.
Specifically, we derive connections between permutation
importance estimates for a <i>single</i> prediction model,
U-statistics, conditional variable importance, conditional
causal effects, and linear model coefficients. We then give
probabilistic bounds for MCR, using a novel, generalizable
technique. We apply MCR to a public data set of Broward
County criminal records to study the reliance of recidivism
prediction models on sex and race. In this application, MCR
can be used to help inform VI for unknown, proprietary
models.},
Key = {fds348374}
}
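%% MCR is, in essence, the range of a VI measure over all
%% near-best models in a class; a toy empirical sketch
%% (omitting the paper's probabilistic bounds and estimators):

import numpy as np

def perm_vi(model, X, y, loss, j, rng):
    Xp = X.copy()
    rng.shuffle(Xp[:, j])                    # permute feature j
    return loss(y, model.predict(Xp)) - loss(y, model.predict(X))

def mcr_range(models, X, y, loss, j, eps=0.01, seed=0):
    # Keep models whose loss is within (1 + eps) of the best, then take the
    # range of their permutation importances for feature j.
    rng = np.random.default_rng(seed)
    losses = [loss(y, m.predict(X)) for m in models]
    best = min(losses)
    good = [m for m, l in zip(models, losses) if l <= best * (1 + eps)]
    vis = [perm_vi(m, X, y, loss, j, rng) for m in good]
    return min(vis), max(vis)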
@article{fds372692,
Author = {Parikh, H and Rudin, C and Volfovsky, A},
Title = {An Application of Matching After Learning To Stretch (MALTS)
to the ACIC 2018 Causal Inference Challenge
Data},
Journal = {Observational Studies},
Volume = {5},
Number = {2},
Pages = {118-130},
Year = {2019},
Month = {January},
url = {http://dx.doi.org/10.1353/obs.2019.0006},
Abstract = {In the learning-to-match framework for causal inference, a
parameterized distance metric is trained on a holdout train
set so that the matching yields accurate estimated
conditional average treatment effects. This way, the
matching can be as accurate as other black box machine
learning techniques for causal inference. We use a new
learning-to-match algorithm called Matching-After-Learning-To-Stretch
(MALTS) (Parikh et al., 2018) to study an observational
dataset from the Atlantic Causal Inference Challenge. Other
than providing estimates for (conditional) average
treatment effects, the MALTS procedure allows practitioners
to evaluate matched groups directly, understand where more
data might need to be collected, and gain an understanding
of when estimates can be trusted.},
Doi = {10.1353/obs.2019.0006},
Key = {fds372692}
}
@article{fds346697,
Author = {Tracà, S and Rudin, C and Yan, W},
Title = {Reducing exploration of dying arms in mortal
bandits},
Journal = {35th Conference on Uncertainty in Artificial Intelligence,
UAI 2019},
Year = {2019},
Month = {January},
Abstract = {Mortal bandits have proven to be extremely useful
for providing news article recommendations, running
automated online advertising campaigns, and for other
applications where the set of available options changes over
time. Previous work on this problem showed how to regulate
exploration of new arms when they have recently appeared,
but they do not adapt when the arms are about to disappear.
Since in most applications we can determine either exactly
or approximately when arms will disappear, we can leverage
this information to improve performance: we should not be
exploring arms that are about to disappear. We provide
adaptations of algorithms, regret bounds, and experiments
for this study, showing a clear benefit from regulating
greed (exploration/exploitation) for arms that will soon
disappear. We illustrate numerical performance on the Yahoo!
Front Page Today Module User Click Log Dataset.},
Key = {fds346697}
}
@article{fds349522,
Author = {Usaid Awan and M and Liu, Y and Morucci, M and Roy, S and Rudin, C and Volfovsky, A},
Title = {Interpretable almost-matching-exactly with instrumental
variables},
Journal = {35th Conference on Uncertainty in Artificial Intelligence,
UAI 2019},
Year = {2019},
Month = {January},
Abstract = {Uncertainty in the estimation of the causal effect in
observational studies is often due to unmeasured
confounding, i.e., the presence of unobserved covariates
linking treatments and outcomes. Instrumental Variables (IV)
are commonly used to reduce the effects of unmeasured
confounding. Existing methods for IV estimation either
require strong parametric assumptions, use arbitrary
distance metrics, or do not scale well to large datasets. We
propose a matching framework for IV in the presence of
observed categorical confounders that addresses these
weaknesses. Our method first matches units exactly, and then
consecutively drops variables to approximately match the
remaining units on as many variables as possible. We show
that our algorithm constructs better matches than other
existing methods on simulated datasets, and we produce
interesting results in an application to political
canvassing.},
Key = {fds349522}
}
@article{fds349523,
Author = {Tracà, S and Rudin, C and Yan, W},
Title = {Reducing exploration of dying arms in mortal
bandits},
Journal = {35th Conference on Uncertainty in Artificial Intelligence,
UAI 2019},
Year = {2019},
Month = {January},
Abstract = {Mortal bandits have proven to be extremely useful for
providing news article recommendations, running automated
online advertising campaigns, and for other applications
where the set of available options changes over time.
Previous work on this problem showed how to regulate
exploration of new arms when they have recently appeared,
but they do not adapt when the arms are about to disappear.
Since in most applications we can determine either exactly
or approximately when arms will disappear, we can leverage
this information to improve performance: we should not be
exploring arms that are about to disappear. We provide
adaptations of algorithms, regret bounds, and experiments
for this study, showing a clear benefit from regulating
greed (exploration/exploitation) for arms that will soon
disappear. We illustrate numerical performance on the Yahoo!
Front Page Today Module User Click Log Dataset.},
Key = {fds349523}
}
@article{fds352221,
Author = {Chen, C and Li, O and Tao, C and Barnett, AJ and Su, J and Rudin,
C},
Title = {This looks like that: Deep learning for interpretable image
recognition},
Journal = {Advances in Neural Information Processing
Systems},
Volume = {32},
Year = {2019},
Month = {January},
Abstract = {When we are faced with challenging image classification
tasks, we often explain our reasoning by dissecting the
image, and pointing out prototypical aspects of one class or
another. The mounting evidence for each of the classes helps
us make our final decision. In this work, we introduce a
deep network architecture - prototypical part network
(ProtoPNet), that reasons in a similar way: the network
dissects the image by finding prototypical parts, and
combines evidence from the prototypes to make a final
classification. The model thus reasons in a way that is
qualitatively similar to the way ornithologists, physicians,
and others would explain to people how to solve
challenging image classification tasks. The network uses
only image-level labels for training without any annotations
for parts of images. We demonstrate our method on the
CUB-200-2011 dataset and the Stanford Cars dataset. Our
experiments show that ProtoPNet can achieve comparable
accuracy with its analogous non-interpretable counterpart,
and when several ProtoPNets are combined into a larger
network, it can achieve an accuracy that is on par with some
of the best-performing deep models. Moreover, ProtoPNet
provides a level of interpretability that is absent in other
interpretable deep models.},
Key = {fds352221}
}
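%% ProtoPNet's prototype comparison reduces to a
%% distance-to-similarity map over patches of a convolutional
%% feature map; a sketch of the paper's log similarity with
%% shapes simplified to one image and one prototype:

import torch

def prototype_similarity(feature_map, prototype, eps=1e-4):
    # feature_map: (C, H, W) conv activations; prototype: (C, 1, 1) learned part.
    dists = ((feature_map - prototype) ** 2).sum(dim=0)  # (H, W) squared L2 distances
    min_dist = dists.min()                               # best-matching image patch
    return torch.log((min_dist + 1) / (min_dist + eps))  # large when a patch is close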
@article{fds352220,
Author = {Hu, X and Rudin, C and Seltzer, M},
Title = {Optimal sparse decision trees},
Journal = {Advances in Neural Information Processing
Systems},
Volume = {32},
Year = {2019},
Month = {January},
Abstract = {Decision tree algorithms have been among the most popular
algorithms for interpretable (transparent) machine learning
since the early 1980's. The problem that has plagued
decision tree algorithms since their inception is their lack
of optimality, or lack of guarantees of closeness to
optimality: decision tree algorithms are often greedy or
myopic, and sometimes produce unquestionably suboptimal
models. Hardness of decision tree optimization is both a
theoretical and practical obstacle, and even careful
mathematical programming approaches have not been able to
solve these problems efficiently. This work introduces the
first practical algorithm for optimal decision trees for
binary variables. The algorithm is a co-design of analytical
bounds that reduce the search space and modern systems
techniques, including data structures and a custom
bit-vector library. Our experiments highlight advantages in
scalability, speed, and proof of optimality.},
Key = {fds352220}
}
@article{fds373971,
Author = {Hase, P and Chen, C and Li, O and Rudin, C},
Title = {Interpretable Image Recognition with Hierarchical
Prototypes},
Journal = {Proceedings of the AAAI Conference on Human Computation and
Crowdsourcing},
Volume = {7},
Pages = {32-40},
Year = {2019},
Month = {January},
ISBN = {9781577358206},
url = {http://dx.doi.org/10.1609/hcomp.v7i1.5265},
Abstract = {Vision models are interpretable when they classify objects
on the basis of features that a person can directly
understand. Recently, methods relying on visual feature
prototypes have been developed for this purpose. However, in
contrast to how humans categorize objects, these approaches
have not yet made use of any taxonomical organization of
class labels. With such an approach, for instance, we may
see why a chimpanzee is classified as a chimpanzee, but not
why it was considered to be a primate or even an animal. In
this work we introduce a model that uses hierarchically
organized prototypes to classify objects at every level in a
predefined taxonomy. Hence, we may find distinct
explanations for the prediction an image receives at each
level of the taxonomy. The hierarchical prototypes enable
the model to perform another important task: interpretably
classifying images from previously unseen classes at the
level of the taxonomy to which they correctlyrelate, e.g.
classifying a hand gun as a weapon, when the only weapons in
the training data are rifles. With a subset of Image Net, we
test our model against its counterpart black-box model on
two tasks: 1) classification of data from familiar classes,
and 2) classification of data from previously unseen classes
at the appropriate level in the taxonomy. We find that our
model performs approximately as well as its counterpart
black-box model while allowing for each classification to
be interpreted.},
Doi = {10.1609/hcomp.v7i1.5265},
Key = {fds373971}
}
@article{fds371425,
Author = {Awan, MU and Liu, Y and Morucci, M and Roy, S and Rudin, C and Volfovsky,
A},
Title = {Interpretable Almost-Matching-Exactly With Instrumental
Variables},
Journal = {Proceedings of Machine Learning Research},
Volume = {115},
Pages = {1116-1126},
Year = {2019},
Month = {January},
Abstract = {Uncertainty in the estimation of the causal effect in
observational studies is often due to unmeasured
confounding, i.e., the presence of unobserved covariates
linking treatments and outcomes. Instrumental Variables (IV)
are commonly used to reduce the effects of unmeasured
confounding. Existing methods for IV estimation either
require strong parametric assumptions, use arbitrary
distance metrics, or do not scale well to large datasets. We
propose a matching framework for IV in the presence of
observed categorical confounders that addresses these
weaknesses. Our method first matches units exactly, and then
consecutively drops variables to approximately match the
remaining units on as many variables as possible. We show
that our algorithm constructs better matches than other
existing methods on simulated datasets, and we produce
interesting results in an application to political
canvassing.},
Key = {fds371425}
}
@article{fds371426,
Author = {Tracà, S and Rudin, C and Yan, W},
Title = {Reducing Exploration of Dying Arms in Mortal
Bandits},
Journal = {Proceedings of Machine Learning Research},
Volume = {115},
Pages = {156-163},
Year = {2019},
Month = {January},
Abstract = {Mortal bandits have proven to be extremely useful for
providing news article recommendations, running automated
online advertising campaigns, and for other applications
where the set of available options changes over time.
Previous work on this problem showed how to regulate
exploration of new arms when they have recently appeared,
but those methods do not adapt when arms are about to
disappear.
Since in most applications we can determine either exactly
or approximately when arms will disappear, we can leverage
this information to improve performance: we should not be
exploring arms that are about to disappear. We provide
adaptations of algorithms, regret bounds, and experiments
for this study, showing a clear benefit from regulating
greed (exploration/exploitation) for arms that will soon
disappear. We illustrate numerical performance on the Yahoo!
Front Page Today Module User Click Log Dataset.},
Key = {fds371426}
}
@article{fds342581,
Author = {Bei, Y and Damian, A and Hu, S and Menon, S and Ravi, N and Rudin,
C},
Title = {New techniques for preserving global structure and denoising
with low information loss in single-image
super-resolution},
Journal = {IEEE Computer Society Conference on Computer Vision and
Pattern Recognition Workshops},
Volume = {2018-June},
Pages = {987-994},
Year = {2018},
Month = {December},
ISBN = {9781538661000},
url = {http://dx.doi.org/10.1109/CVPRW.2018.00132},
Abstract = {This work identifies and addresses two important technical
challenges in single-image super-resolution: (1) how to
upsample an image without magnifying noise and (2) how to
preserve large scale structure when upsampling. We summarize
the techniques we developed for our second place entry in
Track 1 (Bicubic Downsampling), seventh place entry in Track
2 (Realistic Adverse Conditions), and seventh place entry in
Track 3 (Realistic difficult) in the 2018 NTIRE
Super-Resolution Challenge. Furthermore, we present new
neural network architectures that specifically address the
two challenges listed above: denoising and preservation of
large-scale structure.},
Doi = {10.1109/CVPRW.2018.00132},
Key = {fds342581}
}
@article{fds346812,
Author = {Timofte, R and Gu, S and Wu, J and Van Gool, L and Zhang, L and Yang, MH and Haris, M and Shakhnarovich, G and Ukita, N and Hu, S and Bei, Y and Hui, Z and Jiang, X and Gu, Y and Liu, J and Wang, Y and Perazzi, F and McWilliams, B and Sorkine-Hornung, A and Sorkine-Hornung, O and Schroers, C and Yu, J and Fan, Y and Yang, J and Xu, N and Wang, Z and Wang, X and Huang, TS and Yu, K and Hui, TW and Dong, C and Lin, L and Loy, CC and Park, D and Kim, K and Chun,
SY and Zhang, K and Liu, P and Zuo, W and Guo, S and Xu, J and Liu, Y and Xiong,
F and Dong, Y and Bai, H and Damian, A and Ravi, N and Menon, S and Rudin, C and Seo, J and Jeon, T and Koo, J and Jeon, S and Kim, SY and Choi, JS and Ki, S and Seo, S and Sim, H and Kim, S and Kim, M and Chen, R and Zeng, K and Guo, J and Qu, Y and Li, C and Ahn, N and Kang, B and Sohn, KA and Yuan, Y and Zhang, J and Pang, J and Xu, X and Zhao, Y and Deng, W and Ul Hussain, S and Aadil, M and Rahim, R and Cai, X and Huang, F and Xu, Y and Michelini, PN and Zhu, D and Liu, H and Kim, JH and Lee, JS and Huang, Y and Qiu, M and Jing, L and Zeng,
J and Sharma, M and Mukhopadhyay, R and Upadhyay, A and Koundinya, S and Shukla, A and Chaudhury, S and Zhang, Z and Hu, YH},
Title = {NTIRE 2018 challenge on single image super-resolution:
Methods and results},
Journal = {IEEE Computer Society Conference on Computer Vision and
Pattern Recognition Workshops},
Volume = {2018-June},
Pages = {965-976},
Publisher = {IEEE},
Year = {2018},
Month = {December},
ISBN = {9781538661000},
url = {http://dx.doi.org/10.1109/CVPRW.2018.00130},
Abstract = {This paper reviews the 2nd NTIRE challenge on single image
super-resolution (restoration of rich details in a low
resolution image) with focus on proposed solutions and
results. The challenge had 4 tracks. Track 1 employed the
standard bicubic downscaling setup, while Tracks 2, 3 and 4
had realistic unknown downgrading operators simulating
camera image acquisition pipeline. The operators were
learnable through provided pairs of low and high resolution
train images. The tracks had 145, 114, 101, and 113
registered participants, respectively, and 31 teams competed
in the final testing phase. The proposed methods and their
results gauge the state-of-the-art in single image
super-resolution.},
Doi = {10.1109/CVPRW.2018.00130},
Key = {fds346812}
}
@article{fds339230,
Author = {Rudin, C and Ertekin, Ş},
Title = {Learning customized and optimized lists of rules with
mathematical programming},
Journal = {Mathematical Programming Computation},
Volume = {10},
Number = {4},
Pages = {659-702},
Publisher = {Springer Nature America, Inc},
Year = {2018},
Month = {December},
url = {http://dx.doi.org/10.1007/s12532-018-0143-8},
Abstract = {We introduce a mathematical programming approach to building
rule lists, which are a type of interpretable, nonlinear,
and logical machine learning classifier involving IF-THEN
rules. Unlike traditional decision tree algorithms like CART
and C5.0, this method does not use greedy splitting and
pruning. Instead, it aims to fully optimize a combination of
accuracy and sparsity, obeying user-defined constraints.
This method is useful for producing non-black-box predictive
models, and has the benefit of a clear user-defined tradeoff
between training accuracy and sparsity. The flexible
framework of mathematical programming allows users to create
customized models with a provable guarantee of optimality.
The software reviewed as part of this submission was given
the DOI (Digital Object Identifier) https://doi.org/10.5281/zenodo.1344142.},
Doi = {10.1007/s12532-018-0143-8},
Key = {fds339230}
}
@article{fds361531,
Author = {Parikh, H and Rudin, C and Volfovsky, A},
Title = {MALTS: Matching After Learning to Stretch},
Journal = {Journal of Machine Learning Research, 23(240):1-42,
2022},
Year = {2018},
Month = {November},
Abstract = {We introduce a flexible framework that produces high-quality
almost-exact matches for causal inference. Most prior work
in matching uses ad-hoc distance metrics, often leading to
poor quality matches, particularly when there are irrelevant
covariates. In this work, we learn an interpretable distance
metric for matching, which leads to substantially higher
quality matches. The learned distance metric stretches the
covariate space according to each covariate's contribution
to outcome prediction: this stretching means that mismatches
on important covariates carry a larger penalty than
mismatches on irrelevant covariates. Our ability to learn
flexible distance metrics leads to matches that are
interpretable and useful for the estimation of conditional
average treatment effects.},
Key = {fds361531}
}
@article{fds339633,
Author = {Rudin, C and Ustun, B},
Title = {Optimized scoring systems: Toward trust in machine learning
for healthcare and criminal justice},
Journal = {Interfaces},
Volume = {48},
Number = {5},
Pages = {449-466},
Publisher = {Institute for Operations Research and the Management
Sciences (INFORMS)},
Year = {2018},
Month = {September},
url = {http://dx.doi.org/10.1287/inte.2018.0957},
Abstract = {Questions of trust in machine-learning models are
becoming increasingly important as these tools are starting
to be used widely for high-stakes decisions in medicine and
criminal justice. Transparency of models is a key aspect
affecting trust. This paper reveals that there is new
technology to build transparent machine-learning models that
are often as accurate as black-box machine-learning models.
These methods have already had an impact in medicine and
criminal justice. This work calls into question the overall
need for black-box models in these applications.},
Doi = {10.1287/inte.2018.0957},
Key = {fds339633}
}
@article{fds346813,
Author = {Liu, Y and Dieng, A and Roy, S and Rudin, C and Volfovsky,
A},
Title = {Interpretable Almost Matching Exactly for Causal
Inference},
Volume = {abs/1806.06802},
Year = {2018},
Month = {June},
Abstract = {We aim to create the highest possible quality of
treatment-control matches for categorical data in the
potential outcomes framework. Matching methods are heavily
used in the social sciences due to their interpretability,
but most matching methods do not pass basic sanity checks:
they fail when irrelevant variables are introduced, and tend
to be either computationally slow or produce low-quality
matches. The method proposed in this work aims to match
units on a weighted Hamming distance, taking into account
the relative importance of the covariates; the algorithm
aims to match units on as many relevant variables as
possible. To do this, the algorithm creates a hierarchy of
covariate combinations on which to match (similar to
downward closure), in the process solving an optimization
problem for each unit in order to construct the optimal
matches. The algorithm uses a single dynamic program to
solve all of the optimization problems simultaneously.
Notable advantages of our method over existing matching
procedures are its high-quality matches, versatility in
handling different data distributions that may have
irrelevant variables, and ability to handle missing data by
matching on as many available covariates as
possible.},
Key = {fds346813}
}
@article{fds332384,
Author = {Vu, M-AT and Adalı, T and Ba, D and Buzsáki, G and Carlson, D and Heller,
K and Liston, C and Rudin, C and Sohal, VS and Widge, AS and Mayberg, HS and Sapiro, G and Dzirasa, K},
Title = {A Shared Vision for Machine Learning in Neuroscience.},
Journal = {J Neurosci},
Volume = {38},
Number = {7},
Pages = {1601-1607},
Publisher = {Society for Neuroscience},
Year = {2018},
Month = {February},
url = {http://dx.doi.org/10.1523/JNEUROSCI.0508-17.2018},
Abstract = {With ever-increasing advancements in technology,
neuroscientists are able to collect data in greater volumes
and with finer resolution. The bottleneck in understanding
how the brain works is consequently shifting away from the
amount and type of data we can collect and toward what we
actually do with the data. There has been a growing interest
in leveraging this vast volume of data across levels of
analysis, measurement techniques, and experimental paradigms
to gain more insight into brain function. Such efforts are
visible at an international scale, with the emergence of big
data neuroscience initiatives, such as the BRAIN initiative
(Bargmann et al., 2014), the Human Brain Project, the Human
Connectome Project, and the National Institute of Mental
Health's Research Domain Criteria initiative. With these
large-scale projects, much thought has been given to
data-sharing across groups (Poldrack and Gorgolewski, 2014;
Sejnowski et al., 2014); however, even with such
data-sharing initiatives, funding mechanisms, and
infrastructure, there still exists the challenge of how to
cohesively integrate all the data. At multiple stages and
levels of neuroscience investigation, machine learning holds
great promise as an addition to the arsenal of analysis
tools for discovering how the brain works.},
Doi = {10.1523/JNEUROSCI.0508-17.2018},
Key = {fds332384}
}
@article{fds338419,
Author = {Angelino, E and Larus-Stone, N and Alabi, D and Seltzer, M and Rudin,
C},
Title = {Learning certifiably optimal rule lists for categorical
data},
Journal = {Journal of Machine Learning Research},
Volume = {18},
Pages = {1-78},
Year = {2018},
Month = {January},
Abstract = {We present the design and implementation of a custom
discrete optimization technique for building rule lists over
a categorical feature space. Our algorithm produces rule
lists with optimal training performance, according to the
regularized empirical risk, with a certificate of
optimality. By leveraging algorithmic bounds, efficient data
structures, and computational reuse, we achieve several
orders of magnitude speedup in time and a massive reduction
of memory consumption. We demonstrate that our approach
produces optimal rule lists on practical problems in
seconds. Our results indicate that it is possible to
construct optimal sparse rule lists that are approximately
as accurate as the COMPAS proprietary risk prediction tool
on data from Broward County, Florida, but that are
completely interpretable. This framework is a novel
alternative to CART and other decision tree methods for
interpretable modeling.},
Key = {fds338419}
}
@article{fds341025,
Author = {Li, O and Liu, H and Chen, C and Rudin, C},
Title = {Deep learning for case-based reasoning through prototypes: A
neural network that explains its predictions},
Journal = {32nd AAAI Conference on Artificial Intelligence, AAAI
2018},
Pages = {3530-3537},
Year = {2018},
Month = {January},
ISBN = {9781577358008},
Abstract = {Deep neural networks are widely used for classification.
These deep models often suffer from a lack of
interpretability - they are particularly difficult to
understand because of their non-linear nature. As a result,
neural networks are often treated as “black box” models,
and in the past, have been trained purely to optimize the
accuracy of predictions. In this work, we create a novel
network architecture for deep learning that naturally
explains its own reasoning for each prediction. This
architecture contains an autoencoder and a special prototype
layer, where each unit of that layer stores a weight vector
that resembles an encoded training input. The encoder of the
autoencoder allows us to do comparisons within the latent
space, while the decoder allows us to visualize the learned
prototypes. The training objective has four terms: an
accuracy term, a term that encourages every prototype to be
similar to at least one encoded input, a term that
encourages every encoded input to be close to at least one
prototype, and a term that encourages faithful
reconstruction by the autoencoder. The distances computed in
the prototype layer are used as part of the classification
process. Since the prototypes are learned during training,
the learned network naturally comes with explanations for
each prediction, and the explanations are loyal to what the
network actually computes.},
Key = {fds341025}
}
@article{fds344609,
Author = {Chen, C and Rudin, C},
Title = {An optimization approach to learning falling rule
lists},
Journal = {International Conference on Artificial Intelligence and
Statistics, AISTATS 2018},
Pages = {604-612},
Year = {2018},
Month = {January},
Abstract = {A falling rule list is a probabilistic decision list for
binary classification, consisting of a series of if-then
rules with antecedents in the if clauses and probabilities
of the desired outcome (“1”) in the then clauses. Just
as in a regular decision list, the order of rules in a
falling rule list is important – each example is
classified by the first rule whose antecedent it satisfies.
Unlike a regular decision list, a falling rule list requires
the probabilities of the desired outcome (“1”) to be
monotonically decreasing down the list. We propose an
optimization approach to learning falling rule lists and
“softly” falling rule lists, along with Monte-Carlo
search algorithms that use bounds on the optimal solution to
prune the search space.},
Key = {fds344609}
}
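%% Below is a minimal sketch (ours, not the paper's code) of how a
%% falling rule list classifies, per the abstract above: each example is
%% handled by the first rule whose antecedent it satisfies, and the
%% success probabilities decrease monotonically down the list. The
%% rules, feature names, and probabilities here are hypothetical.

def predict_falling_rule_list(x, rules, default_prob):
    """Return P(y = 1) from the first rule whose antecedent x satisfies."""
    for antecedent, prob in rules:
        if antecedent(x):
            return prob
    return default_prob  # no antecedent matched

# Hypothetical list; note the monotonically decreasing probabilities.
rules = [
    (lambda x: x["prior_event"] and x["age"] > 60, 0.85),
    (lambda x: x["prior_event"], 0.40),
    (lambda x: x["age"] > 60, 0.12),
]
print(predict_falling_rule_list({"prior_event": False, "age": 70}, rules, 0.05))  # 0.12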
@article{fds344608,
Author = {Rudin, C and Wang, Y},
Title = {Direct learning to rank and rerank},
Journal = {International Conference on Artificial Intelligence and
Statistics, AISTATS 2018},
Pages = {775-783},
Year = {2018},
Month = {January},
Abstract = {Learning-to-rank techniques have proven to be extremely
useful for prioritization problems, where we rank items in
order of their estimated probabilities, and dedicate our
limited resources to the top-ranked items. This work exposes
a serious problem with the state of learning-to-rank
algorithms, which is that they are based on convex proxies
that lead to poor approximations. We then discuss the
possibility of “exact” reranking algorithms based on
mathematical programming. We prove that a relaxed version of
the “exact” problem has the same optimal solution, and
provide an empirical analysis.},
Key = {fds344608}
}
@article{fds331943,
Author = {Struck, AF and Ustun, B and Ruiz, AR and Lee, JW and LaRoche, SM and Hirsch, LJ and Gilmore, EJ and Vlachy, J and Haider, HA and Rudin, C and Westover, MB},
Title = {Association of an Electroencephalography-Based Risk Score
With Seizure Probability in Hospitalized
Patients.},
Journal = {JAMA neurology},
Volume = {74},
Number = {12},
Pages = {1419-1424},
Year = {2017},
Month = {December},
url = {http://dx.doi.org/10.1001/jamaneurol.2017.2459},
Abstract = {IMPORTANCE: Continuous electroencephalography (EEG)
use in critically ill patients is expanding. There is no
validated method to combine risk factors and guide
clinicians in assessing seizure risk. OBJECTIVE: To
use seizure risk factors from EEG and clinical history to
create a simple scoring system associated with the
probability of seizures in patients with acute
illness. DESIGN, SETTING, AND PARTICIPANTS: We used a
prospective multicenter (Emory University Hospital, Brigham
and Women's Hospital, and Yale University Hospital) database
containing clinical and electrographic variables on 5427
continuous EEG sessions from eligible patients if they had
continuous EEG for clinical indications, excluding epilepsy
monitoring unit admissions. We created a scoring system
model to estimate seizure risk in acutely ill patients
undergoing continuous EEG. The model was built using a new
machine learning method (RiskSLIM) that is designed to
produce accurate, risk-calibrated scoring systems with a
limited number of variables and small integer weights. We
validated the accuracy and risk calibration of our model
using cross-validation and compared its performance with
models built with state-of-the-art logistic regression
methods. The database was developed by the Critical Care EEG
Research Consortium and used data collected over 3 years.
The EEG variables were interpreted using standardized
terminology by certified reviewers. EXPOSURES: All
patients had more than 6 hours of uninterrupted EEG
recordings. MAIN OUTCOMES AND MEASURES: The main
outcome was the average risk calibration
error. RESULTS: There were 5427 continuous EEGs performed on
4772 participants (2868 men, 49.9%; median age, 61 years) at
3 institutions, without further demographic stratification.
Our final model, 2HELPS2B, had
an area under the curve of 0.819 and average calibration
error of 2.7% (95% CI, 2.0%-3.6%). It included 6 variables
with the following point assignments: (1) brief (ictal)
rhythmic discharges (B[I]RDs) (2 points); (2) presence of
lateralized periodic discharges, lateralized rhythmic delta
activity, or bilateral independent periodic discharges (1
point); (3) prior seizure (1 point); (4) sporadic
epileptiform discharges (1 point); (5) frequency greater
than 2.0 Hz for any periodic or rhythmic pattern (1 point);
and (6) presence of "plus" features (superimposed, rhythmic,
sharp, or fast activity) (1 point). The probable seizure
risk of each score was 5% for a score of 0, 12% for a score
of 1, 27% for a score of 2, 50% for a score of 3, 73% for a
score of 4, 88% for a score of 5, and greater than 95% for a
score of 6 or 7. CONCLUSIONS AND RELEVANCE: The 2HELPS2B
model is a quick, accurate tool to aid clinical
judgment of the risk of seizures in critically ill
patients.},
Doi = {10.1001/jamaneurol.2017.2459},
Key = {fds331943}
}
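%% Below is a minimal sketch (ours) that applies the 2HELPS2B point
%% assignments and the score-to-risk table exactly as tabulated in the
%% abstract above; the variable names are our own shorthand.

POINTS = {
    "birds": 2,                # brief (ictal) rhythmic discharges
    "lpd_lrda_bipd": 1,        # lateralized periodic discharges, lateralized
                               # rhythmic delta activity, or bilateral
                               # independent periodic discharges
    "prior_seizure": 1,
    "sporadic_discharges": 1,  # sporadic epileptiform discharges
    "freq_over_2hz": 1,        # any periodic/rhythmic pattern > 2.0 Hz
    "plus_features": 1,        # superimposed rhythmic, sharp, or fast activity
}
RISK = {0: "5%", 1: "12%", 2: "27%", 3: "50%", 4: "73%", 5: "88%", 6: ">95%", 7: ">95%"}

def helps2b(findings):
    """Sum the points for the findings present and map to seizure risk."""
    score = sum(POINTS[f] for f in findings)
    return score, RISK[score]

print(helps2b({"birds", "prior_seizure"}))  # (3, '50%')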
@article{fds330620,
Author = {Ustun, B and Rudin, C},
Title = {Optimized risk scores},
Journal = {Proceedings of the ACM SIGKDD International Conference on
Knowledge Discovery and Data Mining},
Volume = {Part F129685},
Pages = {1125-1134},
Year = {2017},
Month = {August},
ISBN = {9781450348874},
url = {http://dx.doi.org/10.1145/3097983.3098161},
Abstract = {Risk scores are simple classification models that let users
quickly assess risk by adding, subtracting, and multiplying
a few small numbers. Such models are widely used in
healthcare and criminal justice, but are often built ad hoc.
In this paper, we present a principled approach to learn
risk scores that are fully optimized for feature selection,
integer coefficients, and operational constraints. We
formulate the risk score problem as a mixed integer
nonlinear program, and present a new cutting plane algorithm
to efficiently recover its optimal solution. Our approach
can fit optimized risk scores in a way that scales linearly
with the sample size of a dataset, provides a proof of
optimality, and obeys complex constraints without parameter
tuning. We illustrate these benefits through an extensive
set of numerical experiments, and an application where we
build a customized risk score for ICU seizure
prediction.},
Doi = {10.1145/3097983.3098161},
Key = {fds330620}
}
@article{fds330621,
Author = {Angelino, E and Larus-Stone, N and Alabi, D and Seltzer, M and Rudin,
C},
Title = {Learning certifiably optimal rule lists},
Journal = {Proceedings of the ACM SIGKDD International Conference on
Knowledge Discovery and Data Mining},
Volume = {Part F129685},
Pages = {35-44},
Publisher = {ACM Press},
Year = {2017},
Month = {August},
ISBN = {9781450348874},
url = {http://dx.doi.org/10.1145/3097983.3098047},
Abstract = {We present the design and implementation of a custom
discrete optimization technique for building rule lists over
a categorical feature space. Our algorithm provides the
optimal solution, with a certificate of optimality. By
leveraging algorithmic bounds, efficient data structures,
and computational reuse, we achieve several orders of
magnitude speedup in time and a massive reduction of memory
consumption. We demonstrate that our approach produces
optimal rule lists on practical problems in seconds. This
framework is a novel alternative to CART and other decision
tree methods.},
Doi = {10.1145/3097983.3098047},
Key = {fds330621}
}
@article{fds330622,
Author = {Wang, T and Rudin, C and Doshi-Velez, F and Liu, Y and Klampfl, E and MacNeille, P},
Title = {A Bayesian framework for learning rule sets for
interpretable classification},
Journal = {Journal of Machine Learning Research},
Volume = {18},
Pages = {1-37},
Year = {2017},
Month = {August},
Abstract = {We present a machine learning algorithm for building
classifiers that are comprised of a small number of short
rules. These are restricted disjunctive normal form models.
An example of a classifier of this form is as follows: If X
satisfies (condition A AND condition B) OR (condition C) OR
..., then Y = 1. Models of this form have the
advantage of being interpretable to human experts since they
produce a set of rules that concisely describe a specific
class. We present two probabilistic models with prior
parameters that the user can set to encourage the model to
have a desired size and shape, to conform with a
domain-specific definition of interpretability. We provide a
scalable MAP inference approach and develop theoretical
bounds to reduce computation by iteratively pruning the
search space. We apply our method (Bayesian Rule Sets –
BRS) to characterize and predict user behavior with respect
to in-vehicle context-aware personalized recommender
systems. Our method has a major advantage over classical
associative classification methods and decision trees in
that it does not greedily grow the model.},
Key = {fds330622}
}
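%% Below is a minimal sketch (ours, not the BRS inference code) of how a
%% learned rule set classifies: Y = 1 iff the instance satisfies every
%% condition of at least one rule, i.e., the restricted disjunctive
%% normal form described in the abstract above. The example conditions
%% are hypothetical.

def predict_rule_set(x, rule_set):
    """Return 1 iff x satisfies all conditions of at least one rule."""
    return int(any(all(cond(x) for cond in rule) for rule in rule_set))

# (condition A AND condition B) OR (condition C), all hypothetical.
rule_set = [
    [lambda x: x["season"] == "winter", lambda x: x["has_passenger"]],
    [lambda x: x["distance_km"] < 2.0],
]
print(predict_rule_set({"season": "summer", "has_passenger": True, "distance_km": 1.5}, rule_set))  # 1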
@article{fds330623,
Author = {Letham, B and Letham, PA and Rudin, C and Browne,
EP},
Title = {Erratum: "Prediction uncertainty and optimal experimental
design for learning dynamical systems" [Chaos 26, 063110
(2016)].},
Journal = {Chaos (Woodbury, N.Y.)},
Volume = {27},
Number = {6},
Pages = {069901},
Year = {2017},
Month = {June},
url = {http://dx.doi.org/10.1063/1.4986799},
Doi = {10.1063/1.4986799},
Key = {fds330623}
}
@article{fds330624,
Author = {Zeng, J and Ustun, B and Rudin, C},
Title = {Interpretable classification models for recidivism
prediction},
Journal = {Journal of the Royal Statistical Society. Series A:
Statistics in Society},
Volume = {180},
Number = {3},
Pages = {689-722},
Publisher = {WILEY},
Year = {2017},
Month = {June},
url = {http://dx.doi.org/10.1111/rssa.12227},
Abstract = {We investigate a long-debated question, which is how to
create predictive models of recidivism that are sufficiently
accurate, transparent and interpretable to use for decision
making. This question is complicated as these models are
used to support different decisions, from sentencing, to
determining release on probation to allocating preventative
social services. Each case might have an objective other
than classification accuracy, such as a desired true
positive rate TPR or false positive rate FPR. Each (TPR,
FPR) pair is a point on the receiver operator characteristic
(ROC) curve. We use popular machine learning methods to
create models along the full ROC curve on a wide range of
recidivism prediction problems. We show that many methods
(support vector machines, stochastic gradient boosting and
ridge regression) produce equally accurate models along the
full ROC curve. However, methods that are designed for
interpretability (classification and regression trees and
C5.0) cannot be tuned to produce models that are accurate
and/or interpretable. To handle this shortcoming, we use a
recent method called supersparse linear integer models to
produce accurate, transparent and interpretable scoring
systems along the full ROC curve. These scoring systems can
be used for decision making for many different use cases,
since they are just as accurate as the most powerful black
box machine learning models for many applications, but
completely transparent, and highly interpretable.},
Doi = {10.1111/rssa.12227},
Key = {fds330624}
}
@article{fds330625,
Author = {Ustun, B and Adler, LA and Rudin, C and Faraone, SV and Spencer, TJ and Berglund, P and Gruber, MJ and Kessler, RC},
Title = {The World Health Organization Adult Attention-Deficit/Hyperactivity
Disorder Self-Report Screening Scale for
DSM-5.},
Journal = {JAMA psychiatry},
Volume = {74},
Number = {5},
Pages = {520-527},
Year = {2017},
Month = {May},
url = {http://dx.doi.org/10.1001/jamapsychiatry.2017.0298},
Abstract = {IMPORTANCE: Recognition that adult
attention-deficit/hyperactivity disorder (ADHD) is common,
seriously impairing, and usually undiagnosed has led to the
development of adult ADHD screening scales for use in
community, workplace, and primary care settings. However,
these scales are all calibrated to DSM-IV criteria, which
are narrower than the recently developed DSM-5
criteria. OBJECTIVES: To update for DSM-5 criteria and
improve the operating characteristics of the widely used
World Health Organization Adult ADHD Self-Report Scale
(ASRS) for screening. DESIGN, SETTING, AND PARTICIPANTS:
Probability subsamples of participants in 2
general population surveys (2001-2003 household survey
[n = 119] and 2004-2005 managed care subscriber survey
[n = 218]) who completed the full 29-question
self-report ASRS, with both subsamples over-sampling
ASRS-screened positives, were blindly administered a
semistructured research diagnostic interview for DSM-5 adult
ADHD. In 2016, the Risk-Calibrated Supersparse Linear
Integer Model, a novel machine-learning algorithm designed
to create screening scales with optimal integer weights and
limited numbers of screening questions, was applied to the
pooled data to create a DSM-5 version of the ASRS screening
scale. The accuracy of the new scale was then confirmed in
an independent 2011-2012 clinical sample of patients seeking
evaluation at the New York University Langone Medical Center
Adult ADHD Program (NYU Langone) and 2015-2016 primary care
controls (n = 300). Data analysis was conducted from
April 4, 2016, to September 22, 2016. MAIN OUTCOMES AND
MEASURES: The sensitivity, specificity, area under the
curve (AUC), and positive predictive value (PPV) of the
revised ASRS. RESULTS: Of the total 637 participants,
44 (37.0%) household survey respondents, 51 (23.4%) managed
care respondents, and 173 (57.7%) NYU Langone respondents
met DSM-5 criteria for adult ADHD in the semistructured
diagnostic interview. Of the respondents who met DSM-5
criteria for adult ADHD, 123 were male (45.9%); mean (SD)
age was 33.1 (11.4) years. A 6-question screening scale was
found to be optimal in distinguishing cases from noncases in
the first 2 samples. Operating characteristics were
excellent at the diagnostic threshold in the weighted (to
the 8.2% DSM-5/Adult ADHD Clinical Diagnostic Scale
population prevalence) data (sensitivity, 91.4%;
specificity, 96.0%; AUC, 0.94; PPV, 67.3%). Operating
characteristics were similar despite a much higher
prevalence (57.7%) when the scale was applied to the NYU
Langone clinical sample (sensitivity, 91.9%; specificity,
74.0%; AUC, 0.83; PPV, 82.8%). CONCLUSIONS AND RELEVANCE:
The new ADHD screening scale is short, easily
scored, detects the vast majority of general population
cases at a threshold that also has high specificity and PPV,
and could be used as a screening tool in specialty treatment
settings.},
Doi = {10.1001/jamapsychiatry.2017.0298},
Key = {fds330625}
}
@article{fds336348,
Author = {Yang, H and Rudin, C and Seltzer, M},
Title = {Scalable Bayesian rule lists},
Journal = {34th International Conference on Machine Learning, ICML
2017},
Volume = {8},
Pages = {5971-5980},
Year = {2017},
Month = {January},
ISBN = {9781510855144},
Abstract = {We present an algorithm for building probabilistic rule
lists that is two orders of magnitude faster than previous
work. Rule list algorithms are competitors for decision tree
algorithms. They are associative classifiers, in that they
are built from pre-mined association rules. They have a
logical structure that is a sequence of IF-THEN rules,
identical to a decision list or one-sided decision tree.
Instead of using greedy splitting and pruning like decision
tree algorithms, we aim to fully optimize over rule lists,
striking a practical balance between accuracy,
interpretability, and computational speed. The algorithm
presented here uses a mixture of theoretical bounds (tight
enough to have practical implications as a screening or
bounding procedure), computational reuse, and highly tuned
language libraries to achieve computational efficiency.
Currently, for many practical problems, this method achieves
better accuracy and sparsity than decision trees, with
practical running times. The predictions in each leaf are
probabilistic.},
Key = {fds336348}
}
@article{fds349524,
Author = {Lakkaraju, H and Rudin, C},
Title = {Learning cost-effective and interpretable treatment
regimes},
Journal = {Proceedings of the 20th International Conference on
Artificial Intelligence and Statistics, AISTATS
2017},
Year = {2017},
Month = {January},
Abstract = {Decision makers, such as doctors and judges, make crucial
decisions such as recommending treatments to patients, and
granting bail to defendants on a daily basis. Such decisions
typically involve weighing the potential benefits of taking
an action against the costs involved. In this work, we aim
to automate this task of learning cost-effective,
interpretable and actionable treatment regimes. We formulate
this as a problem of learning a decision list – a sequence
of if-then-else rules – that maps characteristics of
subjects (eg., diagnostic test results of patients) to
treatments. This yields an end-to-end individualized policy
for tests and treatments. We propose a novel objective to
construct a decision list which maximizes outcomes for the
population, and minimizes overall costs. Since we do not
observe the outcomes corresponding to counterfactual
scenarios, we use techniques from causal inference
literature to infer them. We model the problem of learning
the decision list as a Markov Decision Process (MDP) and
employ a variant of the Upper Confidence Bound for Trees
(UCT) strategy which leverages customized checks for pruning
the search space effectively. Experimental results on real
world observational data capturing judicial bail decisions
and treatment recommendations for asthma patients
demonstrate the effectiveness of our approach.},
Key = {fds349524}
}
@article{fds330627,
Author = {Letham, B and Letham, LM and Rudin, C},
Title = {Bayesian inference of arrival rate and substitution behavior
from sales transaction data with stockouts},
Journal = {Proceedings of the ACM SIGKDD International Conference on
Knowledge Discovery and Data Mining},
Volume = {13-17-August-2016},
Pages = {1695-1704},
Publisher = {ACM Press},
Year = {2016},
Month = {August},
ISBN = {9781450342322},
url = {http://dx.doi.org/10.1145/2939672.2939810},
Abstract = {When an item goes out of stock, sales transaction data no
longer reflect the original customer demand, since some
customers leave with no purchase while others substitute
alternative products for the one that was out of stock. Here
we develop a Bayesian hierarchical model for inferring the
underlying customer arrival rate and choice model from sales
transaction data and the corresponding stock levels. The
model uses a nonhomogeneous Poisson process to allow the
arrival rate to vary throughout the day, and allows for a
variety of choice models. Model parameters are inferred
using a stochastic gradient MCMC algorithm that can scale to
large transaction databases. We fit the model to data from a
local bakery and show that it is able to make accurate
out-of-sample predictions, and to provide actionable insight
into lost cookie sales.},
Doi = {10.1145/2939672.2939810},
Key = {fds330627}
}
@article{fds330626,
Author = {Wang, T and Rudin, C and Velez-Doshi, F and Liu, Y and Klampfl, E and Macneille, P},
Title = {Bayesian rule sets for interpretable classification},
Journal = {Proceedings - IEEE International Conference on Data Mining,
ICDM},
Pages = {1269-1274},
Year = {2016},
Month = {July},
ISBN = {9781509054725},
url = {http://dx.doi.org/10.1109/ICDM.2016.130},
Abstract = {A Rule Set model consists of a small number of short rules
for interpretable classification, where an instance is
classified as positive if it satisfies at least one of the
rules. The rule set provides reasons for predictions, and
also descriptions of a particular class. We present a
Bayesian framework for learning Rule Set models, with prior
parameters that the user can set to encourage the model to
have a desired size and shape in order to conform with a
domain-specific definition of interpretability. We use an
efficient inference approach for searching for the MAP
solution and provide theoretical bounds to reduce
computation. We apply Rule Set models to ten UCI data sets
and compare the performance with other interpretable and
non-interpretable models.},
Doi = {10.1109/ICDM.2016.130},
Key = {fds330626}
}
@article{fds330628,
Author = {Moghaddass, R and Rudin, C and Madigan, D},
Title = {The factorized self-controlled case series method: An
approach for estimating the effects of many drugs on many
outcomes},
Journal = {Journal of Machine Learning Research},
Volume = {17},
Year = {2016},
Month = {June},
Abstract = {We provide a hierarchical Bayesian model for estimating the
effects of transient drug exposures on a collection of
health outcomes, where the effects of all drugs on all
outcomes are estimated simultaneously. The method possesses
properties that allow it to handle important challenges of
dealing with large-scale longitudinal observational
databases. In particular, this model is a generalization of
the self-controlled case series (SCCS) method, meaning that
certain patient specific baseline rates never need to be
estimated. Further, this model is formulated with layers of
latent factors, which substantially reduces the number of
parameters and helps with interpretability by illuminating
latent classes of drugs and outcomes. We believe our work is
the first to consider multivariate SCCS (in the sense of
multiple outcomes) and is the first to couple latent factor
analysis with SCCS. We demonstrate the approach by
estimating the effects of various time-sensitive insulin
treatments for diabetes.},
Key = {fds330628}
}
@article{fds330629,
Author = {Letham, B and Letham, PA and Rudin, C and Browne,
EP},
Title = {Prediction uncertainty and optimal experimental design for
learning dynamical systems.},
Journal = {Chaos (Woodbury, N.Y.)},
Volume = {26},
Number = {6},
Pages = {063110},
Year = {2016},
Month = {June},
url = {http://dx.doi.org/10.1063/1.4953795},
Abstract = {Dynamical systems are frequently used to model biological
systems. When these models are fit to data, it is necessary
to ascertain the uncertainty in the model fit. Here, we
present prediction deviation, a metric of uncertainty that
determines the extent to which observed data have
constrained the model's predictions. This is accomplished by
solving an optimization problem that searches for a pair of
models that each provides a good fit for the observed data,
yet has maximally different predictions. We develop a method
for estimating a priori the impact that additional
experiments would have on the prediction deviation, allowing
the experimenter to design a set of experiments that would
most reduce uncertainty. We use prediction deviation to
assess uncertainty in a model of interferon-alpha inhibition
of viral infection, and to select a sequence of experiments
that reduces this uncertainty. Finally, we prove a
theoretical result which shows that prediction deviation
provides bounds on the trajectories of the underlying true
model. These results show that prediction deviation is a
meaningful metric of uncertainty that can be used for
optimal experimental design.},
Doi = {10.1063/1.4953795},
Key = {fds330629}
}
@article{fds330630,
Author = {Souillard-Mandar, W and Davis, R and Rudin, C and Au, R and Libon, DJ and Swenson, R and Price, CC and Lamar, M and Penney,
DL},
Title = {Learning classification models of cognitive conditions from
subtle behaviors in the digital Clock Drawing
Test},
Journal = {Machine Learning},
Volume = {102},
Number = {3},
Pages = {393-441},
Publisher = {Springer Nature},
Year = {2016},
Month = {March},
url = {http://dx.doi.org/10.1007/s10994-015-5529-5},
Abstract = {The Clock Drawing Test—a simple pencil and paper
test—has been used for more than 50 years as a screening
tool to differentiate normal individuals from those with
cognitive impairment, and has proven useful in helping to
diagnose cognitive dysfunction associated with neurological
disorders such as Alzheimer’s disease, Parkinson’s
disease, and other dementias and conditions. We have been
administering the test using a digitizing ballpoint pen that
reports its position with considerable spatial and temporal
precision, making available far more detailed data about the
subject’s performance. Using pen stroke data from these
drawings categorized by our software, we designed and
computed a large collection of features, then explored the
tradeoffs in performance and interpretability in classifiers
built using a number of different subsets of these features
and a variety of different machine learning techniques. We
used traditional machine learning methods to build
prediction models that achieve high accuracy. We
operationalized widely used manual scoring systems so that
we could use them as benchmarks for our models. We worked
with clinicians to define guidelines for model
interpretability, and constructed sparse linear models and
rule lists designed to be as easy to use as scoring systems
currently used by clinicians, but more accurate. While our
models will require additional testing for validation, they
offer the possibility of substantial improvement in
detecting cognitive impairment earlier than currently
possible, a development with considerable potential impact
in practice.},
Doi = {10.1007/s10994-015-5529-5},
Key = {fds330630}
}
@article{fds330631,
Author = {Ustun, B and Rudin, C},
Title = {Supersparse linear integer models for optimized medical
scoring systems},
Journal = {Machine Learning},
Volume = {102},
Number = {3},
Pages = {349-391},
Publisher = {Springer Nature},
Year = {2016},
Month = {March},
url = {http://dx.doi.org/10.1007/s10994-015-5528-6},
Abstract = {Scoring systems are linear classification models that only
require users to add, subtract and multiply a few small
numbers in order to make a prediction. These models are in
widespread use by the medical community, but are difficult
to learn from data because they need to be accurate and
sparse, have coprime integer coefficients, and satisfy
multiple operational constraints. We present a new method
for creating data-driven scoring systems called a
Supersparse Linear Integer Model (SLIM). SLIM scoring
systems are built by using an integer programming problem
that directly encodes measures of accuracy (the 0–1 loss)
and sparsity (the $\ell_0$-seminorm) while
restricting coefficients to coprime integers. SLIM can
seamlessly incorporate a wide range of operational
constraints related to accuracy and sparsity, and can
produce acceptable models without parameter tuning because
of the direct control provided over these quantities. We
provide bounds on the testing and training accuracy of SLIM
scoring systems, and present a new data reduction technique
that can improve scalability by eliminating a portion of the
training data beforehand. Our paper includes results from a
collaboration with the Massachusetts General Hospital Sleep
Laboratory, where SLIM is being used to create a highly
tailored scoring system for sleep apnea screening.},
Doi = {10.1007/s10994-015-5528-6},
Key = {fds330631}
}
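%% Below is a schematic LaTeX rendering (ours, not copied from the
%% paper) of the SLIM training problem described in the abstract above:
%% minimize the 0-1 loss plus an $\ell_0$ sparsity penalty over a set
%% $\mathcal{L}$ of small coprime integer coefficient vectors, where
%% $C_0$ trades off accuracy against sparsity.

\begin{equation*}
  \min_{\lambda \in \mathcal{L}} \;
  \frac{1}{n} \sum_{i=1}^{n} \mathbb{1}\!\left[ y_i \, \lambda^{\top} x_i \le 0 \right]
  \; + \; C_0 \, \lVert \lambda \rVert_0 ,
  \qquad \mathcal{L} \subset \mathbb{Z}^{d+1} \ \text{(coprime integer coefficients)} .
\end{equation*}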
@article{fds330632,
Author = {Ustun, B and Westover, MB and Rudin, C and Bianchi,
MT},
Title = {Clinical Prediction Models for Sleep Apnea: The Importance
of Medical History over Symptoms.},
Journal = {Journal of clinical sleep medicine : JCSM : official
publication of the American Academy of Sleep
Medicine},
Volume = {12},
Number = {2},
Pages = {161-168},
Year = {2016},
Month = {February},
url = {http://dx.doi.org/10.5664/jcsm.5476},
Abstract = {STUDY OBJECTIVE: Obstructive sleep apnea (OSA) is a
treatable contributor to morbidity and mortality. However,
most patients with OSA remain undiagnosed. We used a new
machine learning method known as SLIM (Supersparse Linear
Integer Models) to test the hypothesis that a diagnostic
screening tool based on routinely available medical
information would be superior to one based solely on
patient-reported sleep-related symptoms. METHODS: We
analyzed polysomnography (PSG) and self-reported clinical
information from 1,922 patients tested in our clinical sleep
laboratory. We used SLIM and 7 state-of-the-art
classification methods to produce predictive models for OSA
screening using features from: (i) self-reported symptoms;
(ii) self-reported medical information that could, in
principle, be extracted from electronic health records
(demographics, comorbidities), or (iii) both. RESULTS: For
diagnosing OSA, we found that model performance using only
medical history features was superior to model performance
using symptoms alone, and similar to model performance using
all features. Performance was similar to that reported for
other widely used tools: sensitivity 64.2% and specificity
77%. SLIM accuracy was similar to state-of-the-art
classification models applied to this dataset, but with the
benefit of full transparency, allowing for hands-on
prediction using yes/no answers to a small number of
clinical queries. CONCLUSION: To predict OSA, variables such
as age, sex, BMI, and medical history are superior to the
symptom variables we examined. SLIM produces an actionable
clinical tool that can be
applied to data that is routinely available in modern
electronic health records, which may facilitate automated,
rather than manual, OSA screening.},
Doi = {10.5664/jcsm.5476},
Key = {fds330632}
}
@article{fds330633,
Author = {Browne, EP and Letham, B and Rudin, C},
Title = {A Computational Model of Inhibition of HIV-1 by
Interferon-Alpha.},
Journal = {PloS one},
Volume = {11},
Number = {3},
Pages = {e0152316},
Year = {2016},
Month = {January},
url = {http://dx.doi.org/10.1371/journal.pone.0152316},
Abstract = {Type 1 interferons such as interferon-alpha (IFNα) inhibit
replication of Human immunodeficiency virus (HIV-1) by
upregulating the expression of genes that interfere with
specific steps in the viral life cycle. This pathway thus
represents a potential target for immune-based therapies
that can alter the dynamics of host-virus interactions to
benefit the host. To obtain a deeper mechanistic
understanding of how IFNα impacts spreading HIV-1
infection, we modeled the interaction of HIV-1 with CD4 T
cells and IFNα as a dynamical system. This model was then
tested using experimental data from a cell culture model of
spreading HIV-1 infection. We found that a model in which
IFNα induces reversible cellular states that block both
early and late stages of HIV-1 infection, combined with a
saturating rate of conversion to these states, was able to
successfully fit the experimental dataset. Sensitivity
analysis showed that the potency of inhibition by IFNα was
particularly dependent on specific network parameters and
rate constants. This model will be useful for designing new
therapies targeting the IFNα network in HIV-1-infected
individuals, as well as potentially serving as a template
for understanding the interaction of IFNα with other
viruses.},
Doi = {10.1371/journal.pone.0152316},
Key = {fds330633}
}
@article{fds344611,
Author = {Garg, VK and Rudin, C and Jaakkola, T},
Title = {CRAFT: ClusteR-specific Assorted Feature
selecTion},
Journal = {Proceedings of the 19th International Conference on
Artificial Intelligence and Statistics, AISTATS
2016},
Pages = {305-313},
Year = {2016},
Month = {January},
Abstract = {We present a hierarchical Bayesian framework for clustering
with cluster-specific feature selection. We derive a
simplified model, CRAFT, by analyzing the asymptotic
behavior of the log posterior formulations in a
nonparametric MAP-based clustering setting in this
framework. CRAFT handles assorted data, i.e., both numeric
and categorical data, and the underlying objective functions
are intuitively appealing. The resulting algorithm is simple
to implement and scales nicely, requires minimal parameter
tuning, obviates the need to specify the number of clusters
a priori, and compares favorably with other state-of-the-art
methods on several datasets. We provide empirical evidence
on carefully designed synthetic data sets to highlight the
robustness of the algorithm to recover the underlying
feature subspaces, even when the average dimensionality of
the features across clusters is misspecified. Moreover, the
framework seamlessly allows for multiple views of clustering
by interpolating between the two extremes of
cluster-specific feature selection and global selection, and
recovers the DP-means objective [14] under the degenerate
setting of clustering without feature selection.},
Key = {fds344611}
}
@article{fds330634,
Author = {Ertekin, Ş and Rudin, C},
Title = {A bayesian approach to learning scoring systems},
Journal = {Big Data},
Volume = {3},
Number = {4},
Pages = {267-276},
Publisher = {MARY ANN LIEBERT, INC},
Year = {2015},
Month = {December},
url = {http://dx.doi.org/10.1089/big.2015.0033},
Abstract = {We present a Bayesian method for building scoring systems,
which are linear models with coefficients that have very few
significant digits. Usually the construction of scoring
systems involve manual effort - humans invent the full
scoring system without using data, or they choose how
logistic regression coefficients should be scaled and
rounded to produce a scoring system. These kinds of
heuristics lead to suboptimal solutions. Our approach is
different in that humans need only specify the prior over
what the coefficients should look like, and the scoring
system is learned from data. For this approach, we provide a
Metropolis-Hastings sampler that tends to pull the
coefficient values toward their "natural scale."
Empirically, the proposed method achieves a high degree of
interpretability of the models while maintaining competitive
generalization performances.},
Doi = {10.1089/big.2015.0033},
Key = {fds330634}
}
@article{fds330635,
Author = {Moghaddass, R and Rudin, C},
Title = {The latent state hazard model, with application to wind
turbine reliability},
Journal = {Annals of Applied Statistics},
Volume = {9},
Number = {4},
Pages = {1823-1863},
Publisher = {Institute of Mathematical Statistics},
Year = {2015},
Month = {December},
url = {http://dx.doi.org/10.1214/15-AOAS859},
Abstract = {We present a new model for reliability analysis that is able
to distinguish the latent internal vulnerability state of
the equipment from the vulnerability caused by temporary
external sources. Consider a wind farm where each turbine is
running under the external effects of temperature, wind
speed and direction, etc. The turbine might fail because of
the external effects of a spike in temperature. If it does
not fail during the temperature spike, it could still fail
due to internal degradation, and the spike could cause (or
be an indication of) this degradation. The ability to
identify the underlying latent state can help better
understand the effects of external sources and thus lead to
more robust decision-making. We present an experimental
study using SCADA sensor measurements from wind turbines in
Italy.},
Doi = {10.1214/15-AOAS859},
Key = {fds330635}
}
@article{fds330637,
Author = {Tulabandhula, T and Rudin, C},
Title = {Generalization bounds for learning with linear, polygonal,
quadratic and conic side knowledge},
Journal = {Machine Learning},
Volume = {100},
Number = {2-3},
Pages = {183-216},
Publisher = {Springer Nature},
Year = {2015},
Month = {September},
url = {http://dx.doi.org/10.1007/s10994-014-5478-4},
Abstract = {In this paper, we consider a supervised learning setting
where side knowledge is provided about the labels of
unlabeled examples. The side knowledge has the effect of
reducing the hypothesis space, leading to tighter
generalization bounds, and thus possibly better
generalization. We consider several types of side knowledge,
the first leading to linear and polygonal constraints on the
hypothesis space, the second leading to quadratic
constraints, and the last leading to conic constraints. We
show how different types of domain knowledge can lead
directly to these kinds of side knowledge. We prove bounds
on complexity measures of the hypothesis space for quadratic
and conic side knowledge, and show that these bounds are
tight in a specific sense for the quadratic
case.},
Doi = {10.1007/s10994-014-5478-4},
Key = {fds330637}
}
@article{fds330636,
Author = {Letham, B and Rudin, C and McCormick, TH and Madigan,
D},
Title = {Interpretable classifiers using rules and bayesian analysis:
Building a better stroke prediction model},
Journal = {Annals of Applied Statistics},
Volume = {9},
Number = {3},
Pages = {1350-1371},
Publisher = {Institute of Mathematical Statistics},
Year = {2015},
Month = {September},
url = {http://dx.doi.org/10.1214/15-AOAS848},
Abstract = {We aim to produce predictive models that are not only
accurate, but are also interpretable to human experts. Our
models are decision lists, which consist of a series of
if...then... statements (e.g., if high blood pressure, then
stroke) that discretize a high-dimensional, multivariate
feature space into a series of simple, readily interpretable
decision statements. We introduce a generative model called
Bayesian Rule Lists that yields a posterior distribution
over possible decision lists. It employs a novel prior
structure to encourage sparsity. Our experiments show that
Bayesian Rule Lists has predictive accuracy on par with the
current top algorithms for prediction in machine learning.
Our method is motivated by recent developments in
personalized medicine, and can be used to produce highly
accurate and interpretable medical scoring systems. We
demonstrate this by producing an alternative to the CHADS2
score, actively used in clinical practice for estimating the
risk of stroke in patients that have atrial fibrillation.
Our model is as interpretable as CHADS2, but more
accurate.},
Doi = {10.1214/15-AOAS848},
Key = {fds330636}
}
@article{fds330638,
Author = {Wang, T and Rudin, C and Wagner, D and Sevieri, R},
Title = {Finding Patterns with a Rotten Core: Data Mining for Crime
Series with Cores},
Journal = {Big Data},
Volume = {3},
Number = {1},
Pages = {3-21},
Publisher = {MARY ANN LIEBERT, INC},
Year = {2015},
Month = {March},
url = {http://dx.doi.org/10.1089/big.2014.0021},
Abstract = {One of the most challenging problems facing crime analysts
is that of identifying crime series, which are sets of
crimes committed by the same individual or group. Detecting
crime series can be an important step in predictive
policing, as knowledge of a pattern can be of paramount
importance toward finding the offenders or stopping the
pattern. Currently, crime analysts detect crime series
manually; our goal is to assist them by providing automated
tools for discovering crime series from within a database of
crimes. Our approach relies on a key hypothesis that each
crime series possesses at least one core of crimes that are
very similar to each other, which can be used to
characterize the modus operandi (M.O.) of the criminal.
Based on this assumption, as long as we find all of the
cores in the database, we have found a piece of each crime
series. We propose a subspace clustering method, where the
subspace is the M.O. of the series. The method has three
steps: We first construct a similarity graph to link crimes
that are generally similar, second we find cores of crime
using an integer linear programming approach, and third we
construct the rest of the crime series by merging cores to
form the full crime series. To judge whether a set of crimes
is indeed a core, we consider both pattern-general
similarity, which can be learned from past crime series, and
pattern-specific similarity, which is specific to the M.O.
of the series and cannot be learned. Our method can be used
for general pattern detection beyond crime series detection,
as cores exist for patterns in many domains.},
Doi = {10.1089/big.2014.0021},
Key = {fds330638}
}
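%% Illustrative sketch (Python, not from the paper): the two-part
%% crime-to-crime similarity the abstract describes, mixing a
%% pattern-general component (weights learnable from past series)
%% with a pattern-specific component tied to a candidate core's
%% M.O. All features, weights, and the 50/50 mix are assumptions.
general_weights = {"geography": 0.5, "time_of_day": 0.3,
                   "means_of_entry": 0.2}

def pairwise_similarity(crime_a, crime_b, mo_features):
    # pattern-general: weighted feature agreement, weights shared
    # across all series
    general = sum(w * (crime_a[f] == crime_b[f])
                  for f, w in general_weights.items())
    # pattern-specific: agreement on the features defining this
    # particular series' modus operandi
    specific = sum(crime_a[f] == crime_b[f]
                   for f in mo_features) / len(mo_features)
    return 0.5 * general + 0.5 * specific

a = {"geography": "east", "time_of_day": "night",
     "means_of_entry": "window"}
b = {"geography": "east", "time_of_day": "night",
     "means_of_entry": "door"}
print(pairwise_similarity(a, b, mo_features=["means_of_entry"]))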
@article{fds330639,
Author = {Ertekin, Ş and Rudin, C and McCormick, TH},
Title = {Reactive point processes: A new approach to predicting power
failures in underground electrical systems},
Journal = {Annals of Applied Statistics},
Volume = {9},
Number = {1},
Pages = {122-144},
Publisher = {Institute of Mathematical Statistics},
Year = {2015},
Month = {January},
url = {http://dx.doi.org/10.1214/14-AOAS789},
Abstract = {Reactive point processes (RPPs) are a new statistical model
designed for predicting discrete events in time based on
past history. RPPs were developed to handle an important
problem within the domain of electrical grid reliability:
short-term prediction of electrical grid failures
(“manhole events”), including outages, fires, explosions
and smoking manholes, which can cause threats to public
safety and reliability of electrical service in cities. RPPs
incorporate self-exciting, self-regulating and saturating
components. The self-excitement occurs as a result of a past
event, which causes a temporary rise in vulnerability to
future events. The self-regulation occurs as a result of an
external inspection which temporarily lowers vulnerability
to future events. RPPs can saturate when too many events or
inspections occur close together, which ensures that the
probability of an event stays within a realistic range. Two
of the operational challenges for power companies are (i)
making continuous-time failure predictions, and (ii)
cost/benefit analysis for decision making and proactive
maintenance. RPPs are naturally suited for handling both of
these challenges. We use the model to predict power-grid
failures in Manhattan over a short-term horizon, and to
provide a cost/benefit analysis of different proactive
maintenance programs.},
Doi = {10.1214/14-AOAS789},
Key = {fds330639}
}
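%% Illustrative sketch (Python, not the paper's exact model): a
%% reactive-point-process-style conditional intensity in which past
%% events raise vulnerability, past inspections lower it, and a
%% saturating squash keeps the rate in a realistic range. The
%% exponential kernels, logistic saturation, and all parameter
%% values are assumptions made for this sketch.
import math

def intensity(t, event_times, inspection_times,
              base=0.1, excite=0.5, regulate=0.4, decay=1.0):
    # self-excitation: each past event adds a decaying bump
    rise = sum(excite * math.exp(-decay * (t - s))
               for s in event_times if s < t)
    # self-regulation: each past inspection subtracts a decaying bump
    drop = sum(regulate * math.exp(-decay * (t - s))
               for s in inspection_times if s < t)
    raw = base + rise - drop
    # saturation: output stays in (0, 2 * base) however many events
    # or inspections pile up close together
    return base * 2.0 / (1.0 + math.exp(-raw / base))

print(intensity(5.0, event_times=[3.0, 4.5], inspection_times=[4.0]))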
@article{fds330640,
Author = {Wang, F and Rudin, C},
Title = {Falling rule lists},
Journal = {Journal of Machine Learning Research},
Volume = {38},
Pages = {1013-1022},
Year = {2015},
Month = {January},
Abstract = {Falling rule lists are classification models consisting of
an ordered list of if-then rules, where (i) the order of
rules determines which example should be classified by each
rule, and (ii) the estimated probability of success
decreases monotonically down the list. These kinds of rule
lists are inspired by healthcare applications where patients
would be stratified into risk sets and the highest at-risk
patients should be considered first. We provide a Bayesian
framework for learning falling rule lists that does not rely
on traditional greedy decision tree learning
methods.},
Key = {fds330640}
}
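%% Illustrative sketch (Python, not from the paper): the defining
%% constraint of a falling rule list is that estimated risk
%% decreases monotonically down the ordered rules, so the most
%% at-risk stratum is read off first. Rules and numbers below are
%% hypothetical.
falling_rules = [
    ("prior_hospital_stays >= 2", 0.70),   # highest-risk stratum first
    ("age >= 75",                 0.40),
    ("diabetic",                  0.25),
    ("default",                   0.05),
]

probs = [p for _, p in falling_rules]
assert all(a >= b for a, b in zip(probs, probs[1:])), \
    "not a falling rule list: risks must be non-increasing"
print("monotone risk ordering verified")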
@article{fds330641,
Author = {Rudin, C},
Title = {Turning prediction tools into decision tools},
Journal = {Lecture Notes in Computer Science (including subseries
Lecture Notes in Artificial Intelligence and Lecture Notes
in Bioinformatics)},
Volume = {9355},
Year = {2015},
Month = {January},
ISBN = {9783319244853},
Abstract = {Arguably, the main stumbling block in getting machine
learning algorithms used in practice is the fact that people
do not trust them. There could be many reasons for this, for
instance, perhaps the models are not sparse or transparent,
or perhaps the models are not able to be customized to the
user’s specifications as to what a decision tool should
look like. I will discuss some recent work from the
Prediction Analysis Lab on how to build machine learning
models that have helpful decision-making properties. I will
show how these models are applied to problems in healthcare
and criminology.},
Key = {fds330641}
}
@article{fds330647,
Author = {Tulabandhula, T and Rudin, C},
Title = {Tire changes, fresh air, and yellow flags: Challenges in
predictive analytics for professional racing},
Journal = {Big Data},
Volume = {2},
Number = {2},
Pages = {97-112},
Publisher = {MARY ANN LIEBERT, INC},
Year = {2014},
Month = {June},
url = {http://dx.doi.org/10.1089/big.2014.0018},
Abstract = {Our goal is to design a prediction and decision system for
real-time use during a professional car race. In designing a
knowledge discovery process for racing, we faced several
challenges that were overcome only when domain knowledge of
racing was carefully infused within statistical modeling
techniques. In this article, we describe how we leveraged
expert knowledge of the domain to produce a real-time
decision system for tire changes within a race. Our
forecasts have the potential to impact how racing teams can
optimize strategy by making tire-change decisions to benefit
their rank position. Our work significantly expands previous
research on sports analytics, as it is the only work on
analytical methods for within-race prediction and decision
making for professional car racing.},
Doi = {10.1089/big.2014.0018},
Key = {fds330647}
}
@article{fds330644,
Author = {Ertekin, S and Rudin, C and Hirsh, H},
Title = {Approximating the crowd},
Journal = {Data Mining and Knowledge Discovery},
Volume = {28},
Number = {5-6},
Pages = {1189-1221},
Publisher = {Springer Nature},
Year = {2014},
Month = {January},
url = {http://dx.doi.org/10.1007/s10618-014-0354-1},
Abstract = {The problem of "approximating the crowd" is that of
estimating the crowd's majority opinion by querying only a
subset of it. Algorithms that approximate the crowd can
intelligently stretch a limited budget for a crowdsourcing
task. We present an algorithm, "CrowdSense," that works in
an online fashion where items come one at a time. CrowdSense
dynamically samples subsets of the crowd based on an
exploration/exploitation criterion. The algorithm produces a
weighted combination of the subset's votes that approximates
the crowd's opinion. We then introduce two variations of
CrowdSense that make various distributional approximations
to handle distinct crowd characteristics. In particular, the
first algorithm makes a statistical independence
approximation of the labelers for large crowds, whereas the
second algorithm finds a lower bound on how often the
current subcrowd agrees with the crowd's majority vote. Our
experiments on CrowdSense and several baselines demonstrate
that we can reliably approximate the entire crowd's vote by
collecting opinions from a representative subset of the
crowd. © 2014 The Author(s).},
Doi = {10.1007/s10618-014-0354-1},
Key = {fds330644}
}
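%% Illustrative sketch (Python): the weighted-vote aggregation step
%% of a CrowdSense-style approximator. For brevity the subset of
%% labelers is sampled uniformly and the weights are fixed; the
%% paper's algorithm chooses labelers by an exploration/exploitation
%% criterion and learns the weights online.
import random

def approximate_majority(votes, weights, budget, rng):
    # query only `budget` labelers out of the whole crowd
    queried = rng.sample(range(len(votes)), budget)
    score = sum(weights[i] * votes[i] for i in queried)  # votes in {-1, +1}
    return 1 if score >= 0 else -1

votes = [1, 1, -1, 1, -1, 1, 1]   # full crowd, never fully observed in practice
weights = [1.0] * len(votes)
print(approximate_majority(votes, weights, budget=3,
                           rng=random.Random(0)))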
@article{fds330645,
Author = {Kim, B and Rudin, C},
Title = {Learning about meetings},
Journal = {Data Mining and Knowledge Discovery},
Volume = {28},
Number = {5-6},
Pages = {1134-1157},
Publisher = {Springer Nature},
Year = {2014},
Month = {January},
url = {http://dx.doi.org/10.1007/s10618-014-0348-z},
Abstract = {Most people participate in meetings almost every day,
multiple times a day. The study of meetings is important,
but also challenging, as it requires an understanding of
social signals and complex interpersonal dynamics. Our aim
in this work is to use a data-driven approach to the science
of meetings. We provide tentative evidence that: (i) it is
possible to automatically detect when during the meeting a
key decision is taking place, from analyzing only the local
dialogue acts, (ii) there are common patterns in the way
social dialogue acts are interspersed throughout a meeting,
(iii) at the time key decisions are made, the amount of time
left in the meeting can be predicted from the amount of time
that has passed, (iv) it is often possible to predict
whether a proposal during a meeting will be accepted or
rejected based entirely on the language (the set of
persuasive words) used by the speaker. © 2014 The
Author(s).},
Doi = {10.1007/s10618-014-0348-z},
Key = {fds330645}
}
@article{fds330646,
Author = {Rudin, C and Ertekin, S and Passonneau, R and Radeva, A and Tomar, A and Xie, B and Lewis, S and Riddle, M and Pangsrivinij, D and McCormick,
T},
Title = {Analytics for power grid distribution reliability in New
York City},
Journal = {Interfaces},
Volume = {44},
Number = {4},
Pages = {364-382},
Publisher = {Institute for Operations Research and the Management
Sciences (INFORMS)},
Year = {2014},
Month = {January},
url = {http://dx.doi.org/10.1287/inte.2014.0748},
Abstract = {We summarize the first major effort to use analytics for
preemptive maintenance and repair of an electrical
distribution network. This is a large-scale multiyear effort
between scientists and students at Columbia University and
the Massachusetts Institute of Technology and engineers from
the Consolidated Edison Company of New York (Con Edison),
which operates the world's oldest and largest underground
electrical system. Con Edison's preemptive maintenance
programs are less than a decade old and are made more
effective with the use of analytics developing alongside
them. Some of the data we used for our projects are
historical records dating as far back as the 1880s, and some
of the data are free-text documents typed by Con Edison
dispatchers. The operational goals of this work are to
assist with Con Edison's preemptive inspection and repair
program and its vented-cover replacement program. This has a
continuing impact on the public safety, operating costs, and
reliability of electrical service in New York City. © 2014
INFORMS.},
Doi = {10.1287/inte.2014.0748},
Key = {fds330646}
}
@article{fds330648,
Author = {Rudin, C and Wagstaff, KL},
Title = {Machine learning for science and society},
Journal = {Machine Learning},
Volume = {95},
Number = {1},
Pages = {1-9},
Publisher = {Springer Nature},
Year = {2014},
Month = {January},
url = {http://dx.doi.org/10.1007/s10994-013-5425-9},
Abstract = {The special issue on "Machine Learning for Science and
Society" showcases machine learning work with influence on
our current and future society. These papers address several
key problems such as how we perform repairs on critical
infrastructure, how we predict severe weather and aviation
turbulence, how we conduct tax audits, whether we can detect
privacy breaches in access to healthcare data, and how we
link individuals across census data sets for new insights
into population changes. In this introduction, we discuss
the need for such a special issue within the context of our
field and its relationship to the broader world. In the era
of "big data," there is a need for machine learning to
address important large-scale applied problems, yet it is
difficult to find top venues in machine learning where such
work is encouraged. We discuss the ramifications of this
contradictory situation and encourage further discussion on
the best strategy that we as a field may adopt. We also
summarize key lessons learned from individual papers in the
special issue so that the community as a whole can benefit.
© 2013 The Author(s).},
Doi = {10.1007/s10994-013-5425-9},
Key = {fds330648}
}
@article{fds330643,
Author = {Tulabandhula, T and Rudin, C},
Title = {On combining machine learning with decision
making},
Journal = {Machine Learning},
Volume = {97},
Number = {1-2},
Pages = {33-64},
Publisher = {Springer Nature},
Year = {2014},
Month = {January},
url = {http://dx.doi.org/10.1007/s10994-014-5459-7},
Abstract = {We present a new application and covering number bound for
the framework of "Machine Learning with Operational Costs
(MLOC)," which is an exploratory form of decision theory.
The MLOC framework incorporates knowledge about how a
predictive model will be used for a subsequent task, thus
combining machine learning with the decision that is made
afterwards. In this work, we use the MLOC framework to study
a problem that has implications for power grid reliability
and maintenance, called the Machine Learning and Traveling
Repairman Problem (ML&TRP). The goal of the ML&TRP is to
determine a route for a "repair crew," which repairs nodes
on a graph. The repair crew aims to minimize the cost of
failures at the nodes, but as in many real situations, the
failure probabilities are not known and must be estimated.
The MLOC framework allows us to understand how this
uncertainty influences the repair route. We also present new
covering number generalization bounds for the MLOC
framework. © 2014 The Author(s).},
Doi = {10.1007/s10994-014-5459-7},
Key = {fds330643}
}
@article{fds330650,
Author = {Huggins, JH and Rudin, C},
Title = {A statistical learning theory framework for supervised
pattern discovery},
Journal = {SIAM International Conference on Data Mining 2014, SDM
2014},
Volume = {1},
Pages = {506-514},
Publisher = {Society for Industrial and Applied Mathematics},
Year = {2014},
Month = {January},
ISBN = {9781510811515},
url = {http://dx.doi.org/10.1137/1.9781611973440.58},
Abstract = {This paper formalizes a latent variable inference problem we
call supervised pattern discovery, the goal of which is to
find sets of observations that belong to a single "pattern."
We discuss two versions of the problem and prove uniform
risk bounds for both. In the first version, collections of
patterns can be generated in an arbitrary manner and the
data consist of multiple labeled collections. In the second
version, the patterns are assumed to be generated
independently by identically distributed processes. These
processes are allowed to take an arbitrary form, so
observations within a pattern are not in general independent
of each other. The bounds for the second version of the
problem are stated in terms of a new complexity measure, the
quasi-Rademacher complexity.},
Doi = {10.1137/1.9781611973440.58},
Key = {fds330650}
}
@article{fds330651,
Author = {Kim, B and Rudin, C and Shah, J},
Title = {The Bayesian case model: A generative approach for
case-based reasoning and prototype classification},
Journal = {Advances in Neural Information Processing
Systems},
Volume = {3},
Number = {January},
Pages = {1952-1960},
Year = {2014},
Month = {January},
Abstract = {We present the Bayesian Case Model (BCM), a general
framework for Bayesian case-based reasoning (CBR) and
prototype classification and clustering. BCM brings the
intuitive power of CBR to a Bayesian generative framework.
The BCM learns prototypes, the "quintessential" observations
that best represent clusters in a dataset, by performing
joint inference on cluster labels, prototypes and important
features. Simultaneously, BCM pursues sparsity by learning
subspaces, the sets of features that play important roles in
the characterization of the prototypes. The prototype and
subspace representation provides quantitative benefits in
interpretability while preserving classification accuracy.
Human subject experiments verify statistically significant
improvements to participants' understanding when using
explanations produced by BCM, compared to those given by
prior art.},
Key = {fds330651}
}
@article{fds330652,
Author = {Goh, ST and Rudin, C},
Title = {Box drawings for learning with imbalanced
data},
Journal = {Proceedings of the ACM SIGKDD International Conference on
Knowledge Discovery and Data Mining},
Pages = {333-342},
Publisher = {ACM Press},
Year = {2014},
Month = {January},
ISBN = {9781450329569},
url = {http://dx.doi.org/10.1145/2623330.2623648},
Abstract = {The vast majority of real world classification problems are
imbalanced, meaning there are far fewer data from the class
of interest (the positive class) than from other classes. We
propose two machine learning algorithms to handle highly
imbalanced classification problems. The classifiers are
disjunctions of conjunctions, and are created as unions of
parallel axis rectangles around the positive examples, and
thus have the benefit of being interpretable. The first
algorithm uses mixed integer programming to optimize a
weighted balance between positive and negative class
accuracies. Regularization is introduced to improve
generalization performance. The second method uses an
approximation in order to assist with scalability.
Specifically, it follows a \textit{characterize then
discriminate} approach, where the positive class is
characterized first by boxes, and then each box boundary
becomes a separate discriminative classifier. This method
has the computational advantages that it can be easily
parallelized, and considers only the relevant regions of
feature space. © 2014 ACM.},
Doi = {10.1145/2623330.2623648},
Key = {fds330652}
}
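%% Illustrative sketch (Python, not from the paper): prediction with
%% a union-of-boxes classifier of the kind the abstract describes.
%% The two axis-parallel boxes below are fixed by hand; the paper's
%% contribution is learning their boundaries, either exactly via
%% mixed integer programming or approximately by characterizing
%% then discriminating.
boxes = [
    {"x": (0.0, 1.0), "y": (2.0, 3.5)},   # hypothetical boxes drawn
    {"x": (4.0, 5.5), "y": (0.0, 1.0)},   # around the rare positive class
]

def predict(point):
    for box in boxes:
        if all(lo <= point[d] <= hi for d, (lo, hi) in box.items()):
            return 1        # inside some box: positive class
    return -1               # outside every box: negative class

print(predict({"x": 0.5, "y": 3.0}))   # -> 1
print(predict({"x": 2.0, "y": 2.0}))   # -> -1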
@article{fds330653,
Author = {Wang, D and Passonneau, RJ and Collins, M and Rudin,
C},
Title = {Modeling weather impact on a secondary electrical
grid},
Journal = {Procedia Computer Science},
Volume = {32},
Pages = {631-638},
Publisher = {Elsevier BV},
Year = {2014},
Month = {January},
url = {http://dx.doi.org/10.1016/j.procs.2014.05.470},
Abstract = {Weather can cause problems for underground electrical grids
by increasing the probability of serious "manhole events"
such as fires and explosions. In this work, we compare a
model that incorporates weather features associated with the
dates of serious events into a single logistic regression,
with a more complex approach that has three interdependent
log linear models for weather, baseline manhole
vulnerability, and vulnerability of manholes to weather. The
latter approach more naturally incorporates the dependencies
between the weather, structure properties, and structure
vulnerability. © 2014 Published by Elsevier
B.V.},
Doi = {10.1016/j.procs.2014.05.470},
Key = {fds330653}
}
@article{fds345400,
Author = {Tulabandhula, T and Rudin, C},
Title = {Robust optimization using machine learning for uncertainty
sets},
Journal = {International Symposium on Artificial Intelligence and
Mathematics, ISAIM 2014},
Year = {2014},
Month = {January},
Abstract = {Our goal is to build robust optimization problems
that make decisions about the future, and where complex data
from the past are used to model uncertainty. In robust
optimization (RO) generally, the goal is to create a policy
for decision-making that is robust to our uncertainty about
the future. In particular, we want our policy to best handle
the worst possible situation that could arise, out of an
uncertainty set of possible situations. Classically, the
uncertainty set is simply chosen by the user, or it might be
estimated in overly simplistic ways with strong assumptions;
whereas in this work, we learn the uncertainty set from
complex data from the past. The past data are drawn randomly
from an (unknown) possibly complicated high-dimensional
distribution. We propose a new uncertainty set design and
show how tools from statistical learning theory can be
employed to provide probabilistic guarantees on the
robustness of the policy.},
Key = {fds345400}
}
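%% Illustrative sketch (Python, not the paper's construction): the
%% contrast between a user-chosen uncertainty set and one learned
%% from past data. Here the "learned" set is just an empirical
%% quantile interval and the decision is a toy ordering problem;
%% both are stand-ins for the paper's statistical-learning-theoretic
%% design with its probabilistic guarantees.
past_demand = [7, 9, 8, 12, 10, 9, 11, 8, 10, 9]   # illustrative data

ordered = sorted(past_demand)
lo = ordered[int(0.1 * len(ordered))]        # learned interval endpoints
hi = ordered[int(0.9 * len(ordered)) - 1]

def profit(order, demand, price=3.0, cost=1.0):
    return price * min(order, demand) - cost * order

# Robust policy: maximize the worst case over the learned interval.
# Profit is monotone in demand, so the worst case sits at an endpoint.
best = max(range(20), key=lambda q: min(profit(q, d) for d in (lo, hi)))
print(f"learned uncertainty set = [{lo}, {hi}], robust order = {best}")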
@article{fds345401,
Author = {Tulabandhula, T and Rudin, C},
Title = {Generalization bounds for learning with linear and quadratic
side knowledge},
Journal = {International Symposium on Artificial Intelligence and
Mathematics, ISAIM 2014},
Year = {2014},
Month = {January},
Abstract = {In this paper, we consider a supervised learning
setting where side knowledge is provided about the labels of
unlabeled examples. The side knowledge has the effect of
reducing the hypothesis space, leading to tighter
generalization bounds, and thus possibly better
generalization. We consider two types of side knowledge, the
first leading to linear constraints on the hypothesis space,
and the second leading to quadratic constraints on the
hypothesis space. We show how different types of domain
knowledge can lead directly to these kinds of side
knowledge. We prove bounds on complexity measures of the
hypothesis space for quadratic side knowledge, and show that
these bounds are tight in a specific sense.},
Key = {fds345401}
}
@article{fds345402,
Author = {Huggins, J and Rudin, C},
Title = {Toward a theory of pattern discovery},
Journal = {International Symposium on Artificial Intelligence and
Mathematics, ISAIM 2014},
Year = {2014},
Month = {January},
Abstract = {This paper formalizes a latent variable inference
problem we call supervised pattern discovery, the goal of
which is to find sets of observations that belong to a
single “pattern.” We discuss two versions of the problem
and prove uniform risk bounds for both. In the first
version, collections of patterns can be generated in an
arbitrary manner and the data consist of multiple labeled
collections. In the second version, the patterns are assumed
to be generated independently by identically distributed
processes. These processes are allowed to take an arbitrary
form, so observations within a pattern are not in general
independent of each other. The bounds for the second version
of the problem are stated in terms of a new complexity
measure, the quasi-Rademacher complexity.},
Key = {fds345402}
}
@article{fds330654,
Author = {Letham, B and Rudin, C and Heller, KA},
Title = {Growing a list},
Journal = {Data Mining and Knowledge Discovery},
Volume = {27},
Number = {3},
Pages = {372-395},
Year = {2013},
Month = {December},
url = {http://dx.doi.org/10.1007/s10618-013-0329-7},
Abstract = {It is easy to find expert knowledge on the Internet on
almost any topic, but obtaining a complete overview of a
given topic is not always easy: information can be scattered
across many sources and must be aggregated to be useful. We
introduce a method for intelligently growing a list of
relevant items, starting from a small seed of examples. Our
algorithm takes advantage of the wisdom of the crowd, in the
sense that there are many experts who post lists of things
on the Internet. We use a collection of simple machine
learning components to find these experts and aggregate
their lists to produce a single complete and meaningful
list. We use experiments with gold standards and open-ended
experiments without gold standards to show that our method
significantly outperforms the state of the art. Our method
uses the ranking algorithm Bayesian Sets even when its
underlying independence assumption is violated, and we
provide a theoretical generalization bound to motivate its
use. © 2013 The Author(s).},
Doi = {10.1007/s10618-013-0329-7},
Key = {fds330654}
}
@article{fds330655,
Author = {Rudin, C and Letham, B and Madigan, D},
Title = {Learning theory analysis for association rules and
sequential event prediction},
Journal = {Journal of Machine Learning Research},
Volume = {14},
Pages = {3441-3492},
Year = {2013},
Month = {November},
Abstract = {We present a theoretical analysis for prediction algorithms
based on association rules. As part of this analysis, we
introduce a problem for which rules are particularly
natural, called "sequential event prediction." In sequential
event prediction, events in a sequence are revealed one by
one, and the goal is to determine which event will next be
revealed. The training set is a collection of past sequences
of events. An example application is to predict which item
will next be placed into a customer's online shopping cart,
given his/her past purchases. In the context of this
problem, algorithms based on association rules have distinct
advantages over classical statistical and machine learning
methods: they look at correlations based on subsets of
co-occurring past events (items a and b imply item c), they
can be applied to the sequential event prediction problem in
a natural way, they can potentially handle the "cold start"
problem where the training set is small, and they yield
interpretable predictions. In this work, we present two
algorithms that incorporate association rules. These
algorithms can be used both for sequential event prediction
and for supervised classification, and they are simple
enough that they can possibly be understood by users,
customers, patients, managers, etc. We provide
generalization guarantees on these algorithms based on
algorithmic stability analysis from statistical learning
theory. We include a discussion of the strict minimum
support threshold often used in association rule mining, and
introduce an "adjusted confidence" measure that provides a
weaker minimum support condition that has advantages over
the strict minimum support. The paper brings together ideas
from statistical learning theory, association rule mining
and Bayesian analysis. © 2013 Cynthia Rudin, Benjamin
Letham and David Madigan.},
Key = {fds330655}
}
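%% Illustrative sketch (Python): one simple reading of the "adjusted
%% confidence" idea in the abstract. Ordinary confidence divides the
%% joint count by the antecedent count; adding K to the denominator
%% softly penalizes rules with little support instead of imposing a
%% strict minimum support threshold. The counts and K below are
%% invented.
def adjusted_confidence(count_ab, count_a, K):
    # ordinary confidence is count_ab / count_a (the K = 0 case)
    return count_ab / (count_a + K)

# A rule seen 2 out of 2 times vs. one seen 80 out of 100 times:
print(adjusted_confidence(2, 2, K=10))     # ~0.167: tiny support, low score
print(adjusted_confidence(80, 100, K=10))  # ~0.727: well-supported rule wins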
@article{fds330656,
Author = {Letham, B and Rudin, C and Madigan, D},
Title = {Sequential event prediction},
Journal = {Machine Learning},
Volume = {93},
Number = {2-3},
Pages = {357-380},
Publisher = {Springer Nature},
Year = {2013},
Month = {November},
url = {http://dx.doi.org/10.1007/s10994-013-5356-5},
Abstract = {In sequential event prediction, we are given a "sequence
database" of past event sequences to learn from, and we aim
to predict the next event within a current event sequence.
We focus on applications where the set of the past events
has predictive power and not the specific order of those
past events. Such applications arise in recommender systems,
equipment maintenance, medical informatics, and in other
domains. Our formalization of sequential event prediction
draws on ideas from supervised ranking. We show how specific
choices within this approach lead to different sequential
event prediction problems and algorithms. In recommender
system applications, the observed sequence of events depends
on user choices, which may be influenced by the
recommendations, which are themselves tailored to the user's
choices. This leads to sequential event prediction
algorithms involving a non-convex optimization problem. We
apply our approach to an online grocery store recommender
system, email recipient recommendation, and a novel
application in the health event prediction domain. © 2013
The Author(s).},
Doi = {10.1007/s10994-013-5356-5},
Key = {fds330656}
}
@article{fds330658,
Author = {Wang, T and Rudin, C and Wagner, D and Sevieri, R},
Title = {Learning to detect patterns of crime},
Journal = {Lecture Notes in Computer Science (including subseries
Lecture Notes in Artificial Intelligence and Lecture Notes
in Bioinformatics)},
Volume = {8190 LNAI},
Number = {PART 3},
Pages = {515-530},
Publisher = {Springer Berlin Heidelberg},
Year = {2013},
Month = {October},
ISBN = {9783642409936},
url = {http://dx.doi.org/10.1007/978-3-642-40994-3_33},
Abstract = {Our goal is to automatically detect patterns of crime. Among
a large set of crimes that happen every year in a major
city, it is challenging, time-consuming, and labor-intensive
for crime analysts to determine which ones may have been
committed by the same individual(s). If automated,
data-driven tools for crime pattern detection are made
available to assist analysts, these tools could help police
to better understand patterns of crime, leading to more
precise attribution of past crimes, and the apprehension of
suspects. To do this, we propose a pattern detection
algorithm called Series Finder, which grows a pattern of
discovered crimes from within a database, starting from a
"seed" of a few crimes. Series Finder incorporates both the
common characteristics of all patterns and the unique
aspects of each specific pattern, and has had promising
results on a decade's worth of crime pattern data collected
by the Crime Analysis Unit of the Cambridge Police
Department. © 2013 Springer-Verlag.},
Doi = {10.1007/978-3-642-40994-3_33},
Key = {fds330658}
}
@article{fds330659,
Author = {Mukherjee, I and Rudin, C and Schapire, RE},
Title = {The rate of convergence of AdaBoost},
Journal = {Journal of Machine Learning Research},
Volume = {14},
Pages = {2315-2347},
Year = {2013},
Month = {August},
Abstract = {The AdaBoost algorithm was designed to combine many "weak"
hypotheses that perform slightly better than random guessing
into a "strong" hypothesis that has very low error. We study
the rate at which AdaBoost iteratively converges to the
minimum of the "exponential loss." Unlike previous work, our
proofs do not require a weak-learning assumption, nor do
they require that minimizers of the exponential loss are
finite. Our first result shows that the exponential loss of
AdaBoost's computed parameter vector will be at most ε more
than that of any parameter vector of ℓ1-norm bounded by B in
a number of rounds that is at most a polynomial in B and
1/ε. We also provide lower bounds showing that a polynomial
dependence is necessary. Our second result is that within
C/ε iterations, AdaBoost achieves a value of the
exponential loss that is at most ε more than the best
possible value, where C depends on the data set. We show
that this dependence of the rate on ε is optimal up to
constant factors, that is, at least Ω(1/ε) rounds are
necessary to achieve within ε of the optimal exponential
loss. © 2013 Indraneel Mukherjee, Cynthia Rudin and Robert
E. Schapire.},
Key = {fds330659}
}
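%% Illustrative sketch (Python): textbook AdaBoost with threshold
%% stumps on a toy 1-D dataset, printing the exponential loss whose
%% convergence rate the paper analyzes. The dataset and stump pool
%% are invented; the update rule is the standard one.
import math

X = [0.1, 0.3, 0.4, 0.6, 0.8, 0.9]
y = [-1, -1, 1, -1, 1, 1]

def stump(theta, sign):
    return lambda x: sign * (1 if x > theta else -1)

stumps = [stump(t, s) for t in (0.2, 0.35, 0.5, 0.7, 0.85)
          for s in (1, -1)]

F = [0.0] * len(X)                    # combined scores so far
for t in range(10):
    w = [math.exp(-yi * fi) for yi, fi in zip(y, F)]
    def werr(h):                      # weighted error of a stump
        return sum(wi for wi, xi, yi in zip(w, X, y)
                   if h(xi) != yi) / sum(w)
    h = min(stumps, key=werr)
    eps = werr(h)
    if eps == 0 or eps >= 0.5:        # no useful weak hypothesis left
        break
    alpha = 0.5 * math.log((1 - eps) / eps)
    F = [fi + alpha * h(xi) for fi, xi in zip(F, X)]
    exp_loss = sum(math.exp(-yi * fi)
                   for yi, fi in zip(y, F)) / len(X)
    print(f"round {t}: exponential loss = {exp_loss:.4f}")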
@article{fds330660,
Author = {Tulabandhula, T and Rudin, C},
Title = {Machine learning with operational costs},
Journal = {Journal of Machine Learning Research},
Volume = {14},
Pages = {1989-2028},
Year = {2013},
Month = {June},
Abstract = {This work proposes a way to align statistical modeling with
decision making. We provide a method that propagates the
uncertainty in predictive modeling to the uncertainty in
operational cost, where operational cost is the amount spent
by the practitioner in solving the problem. The method
allows us to explore the range of operational costs
associated with the set of reasonable statistical models, so
as to provide a useful way for practitioners to understand
uncertainty. To do this, the operational cost is cast as a
regularization term in a learning algorithm's objective
function, allowing either an optimistic or pessimistic view
of possible costs, depending on the regularization
parameter. From another perspective, if we have prior
knowledge about the operational cost, for instance that it
should be low, this knowledge can help to restrict the
hypothesis space, and can help with generalization. We
provide a theoretical generalization bound for this
scenario. We also show that learning with operational costs
is related to robust optimization. © 2013 Theja
Tulabandhula and Cynthia Rudin.},
Key = {fds330660}
}
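%% Illustrative sketch (Python, toy version of the idea): the
%% operational cost of acting on a model enters the training
%% objective as a regularization term, loss + C * cost, so the sign
%% and size of C give the pessimistic or optimistic view. The data,
%% the one-parameter model, and the cost structure are all invented.
data = [(1.0, 1.2), (2.0, 1.9), (3.0, 3.2)]    # (x, y) pairs

def training_loss(w):
    return sum((y - w * x) ** 2 for x, y in data)

def operational_cost(w):
    # hypothetical cost of the policy implied by model w, e.g. the
    # spend triggered by its prediction at a fixed future point
    return 5.0 * max(0.0, w * 4.0 - 3.0)

def fit(C):
    grid = [i / 100 for i in range(201)]
    return min(grid, key=lambda w: training_loss(w)
               + C * operational_cost(w))

print(fit(0.0), fit(0.5))   # the cost term pulls the estimate down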
@article{fds330661,
Author = {Ertekin, S and Rudin, C and McCormick, TH},
Title = {Predicting power failures with reactive point
processes},
Journal = {AAAI Workshop - Technical Report},
Volume = {WS-13-17},
Pages = {23-25},
Year = {2013},
Month = {January},
ISBN = {9781577356288},
Key = {fds330661}
}
@article{fds330662,
Author = {Kim, B and Rudin, C},
Title = {Machine learning for meeting analysis},
Journal = {AAAI Workshop - Technical Report},
Volume = {WS-13-17},
Pages = {59-61},
Year = {2013},
Month = {January},
ISBN = {9781577356288},
Abstract = {Most people participate in meetings almost every day,
multiple times a day. The study of meetings is important,
but also challenging, as it requires an understanding of
social signals and complex interpersonal dynamics. Our aim
in this work is to use a data-driven approach to the science of
meetings. We provide tentative evidence that: i) there are
common macro-patterns in the way social dialogue acts are
interspersed throughout a meeting, and ii) it is often
possible to predict whether a proposal during a meeting will
be accepted or rejected based entirely on the language (the
set of persuasive words) used by the speaker. Copyright ©
2013, Association for the Advancement of Artificial
Intelligence. All rights reserved.},
Key = {fds330662}
}
@article{fds330663,
Author = {Letham, B and Rudin, C and McCormick, TH and Madigan,
D},
Title = {An interpretable stroke prediction model using rules and
Bayesian analysis},
Journal = {AAAI Workshop - Technical Report},
Volume = {WS-13-17},
Pages = {65-67},
Year = {2013},
Month = {January},
ISBN = {9781577356288},
Abstract = {We aim to produce predictive models that are not only
accurate, but are also interpretable to human experts. We
introduce a generative model called the Bayesian List
Machine for fitting decision lists, a type of interpretable
classifier, to data. We use the model to predict stroke in
atrial fibrillation patients, and produce predictive models
that are simple enough to be understood by patients yet
significantly outperform the medical scoring systems
currently in use.},
Key = {fds330663}
}
@article{fds330664,
Author = {Wang, T and Rudin, C and Wagner, D and Sevieri, R},
Title = {Detecting patterns of crime with Series Finder},
Journal = {AAAI Workshop - Technical Report},
Volume = {WS-13-17},
Pages = {140-142},
Year = {2013},
Month = {January},
ISBN = {9781577356288},
Abstract = {Many crimes can happen every day in a major city, and
figuring out which ones are committed by the same individual
or group is an important and difficult data mining
challenge. To do this, we propose a pattern detection
algorithm called Series Finder, which grows a pattern of
discovered crimes from within a database, starting from a
"seed" of a few crimes. Series Finder incorporates both the
common characteristics of all patterns and the unique
aspects of each specific pattern. We compared Series Finder
with classic clustering and classification models applied to
crime analysis. It has promising results on a decade's worth
of crime pattern data from the Cambridge Police Department.
Copyright © 2013, Association for the Advancement of
Artificial Intelligence. All rights reserved.},
Key = {fds330664}
}
@article{fds330665,
Author = {Ustun, B and Tracà, S and Rudin, C},
Title = {Supersparse linear integer models for predictive scoring
systems},
Journal = {AAAI Workshop - Technical Report},
Volume = {WS-13-17},
Pages = {128-130},
Year = {2013},
Month = {January},
ISBN = {9781577356288},
Key = {fds330665}
}
@article{fds330666,
Author = {Tulabandhula, T and Rudin, C},
Title = {The influence of operational cost on estimation},
Journal = {International Symposium on Artificial Intelligence and
Mathematics, ISAIM 2012},
Year = {2012},
Month = {December},
Abstract = {This work concerns the way that statistical models are used
to make decisions. In particular, we aim to merge the way
estimation algorithms are designed with how they are used
for a subsequent task. Our methodology considers the
operational cost of carrying out a policy, based on a
predictive model. The operational cost becomes a
regularization term in the learning algorithm's objective
function, allowing either an optimistic or pessimistic view
of possible costs. Limiting the operational cost reduces the
hypothesis space for the predictive model, and can thus
improve generalization. We show that different types of
operational problems can lead to the same type of
restriction on the hypothesis space, namely the restriction
to an intersection of an ℓq ball with a halfspace. We
bound the complexity of such hypothesis spaces by proposing
a technique that involves counting integer points in
polyhedrons.},
Key = {fds330666}
}
@article{fds330667,
Author = {Bertsimas, D and Chang, A and Rudin, C},
Title = {An integer optimization approach to associative
classification},
Journal = {Advances in Neural Information Processing
Systems},
Volume = {4},
Pages = {3302-3310},
Year = {2012},
Month = {December},
ISBN = {9781627480031},
Abstract = {We aim to design classifiers that have the interpretability
of association rules yet have predictive power on par with
the top machine learning algorithms for classification. We
propose a novel mixed integer optimization (MIO) approach
called Ordered Rules for Classification (ORC) for this task.
Our method has two parts. The first part mines a particular
frontier of solutions in the space of rules, and we show
that this frontier contains the best rules according to a
variety of interestingness measures. The second part learns
an optimal ranking for the rules to build a decision list
classifier that is simple and insightful. We report
empirical evidence using several different datasets to
demonstrate the performance of this method.},
Key = {fds330667}
}
@article{fds330668,
Author = {Ertekin, S and Hirsh, H and Rudin, C},
Title = {Selective sampling of labelers for approximating the
crowd},
Journal = {AAAI Fall Symposium - Technical Report},
Volume = {FS-12-06},
Pages = {7-13},
Year = {2012},
Month = {December},
Abstract = {In this paper, we present CrowdSense, an algorithm for
estimating the crowd's majority opinion by querying only a
subset of it. CrowdSense works in an online fashion where
examples come one at a time and it dynamically samples
subsets of labelers based on an exploration/exploitation
criterion. The algorithm produces a weighted combination of
a subset of the labelers' votes that approximates the
crowd's opinion. We also present two probabilistic variants
of CrowdSense that are based on different assumptions on the
joint probability distribution between the labelers' votes
and the majority vote. Our experiments demonstrate that we
can reliably approximate the entire crowd's vote by
collecting opinions from a representative subset of the
crowd. Copyright © 2012, Association for the Advancement of
Artificial Intelligence (www.aaai.org). All rights
reserved.},
Key = {fds330668}
}
@article{fds330669,
Author = {Chang, A and Rudin, C and Cavaretta, M and Thomas, R and Chou, G},
Title = {How to reverse-engineer quality rankings},
Journal = {Machine Learning},
Volume = {88},
Number = {3},
Pages = {369-398},
Publisher = {Springer Nature},
Year = {2012},
Month = {September},
url = {http://dx.doi.org/10.1007/s10994-012-5295-6},
Abstract = {A good or bad product quality rating can make or break an
organization. However, the notion of "quality" is often
defined by an independent rating company that does not make
the formula for determining the rank of a product publicly
available. In order to invest wisely in product development,
organizations are starting to use intelligent approaches for
determining how funding for product development should be
allocated. A critical step in this process is to
"reverse-engineer" a rating company's proprietary model as
closely as possible. In this work, we provide a machine
learning approach for this task, which optimizes a certain
rank statistic that encodes preference information specific
to quality rating data. We present experiments on data from
a major quality rating company, and provide new methods for
evaluating the solution. In addition, we provide an approach
to use the reverse-engineered model to achieve a top ranked
product in a cost-effective way. © The Author(s)
2012.},
Doi = {10.1007/s10994-012-5295-6},
Key = {fds330669}
}
@article{fds330670,
Author = {McCormick, TH and Rudin, C and Madigan, D},
Title = {Bayesian hierarchical rule modeling for predicting medical
conditions},
Journal = {Annals of Applied Statistics},
Volume = {6},
Number = {2},
Pages = {652-668},
Publisher = {Institute of Mathematical Statistics},
Year = {2012},
Month = {June},
url = {http://dx.doi.org/10.1214/11-AOAS522},
Abstract = {We propose a statistical modeling technique, called the
Hierarchical Association Rule Model (HARM), that predicts a
patient's possible future medical conditions given the
patient's current and past history of reported conditions.
The core of our technique is a Bayesian hierarchical model
for selecting predictive association rules (such as
"condition 1 and condition 2 → condition 3") from a large
set of candidate rules. Because this method "borrows
strength" using the conditions of many similar patients, it
is able to provide predictions specialized to any given
patient, even when little information about the patient's
history of conditions is available. © Institute of
Mathematical Statistics, 2012.},
Doi = {10.1214/11-AOAS522},
Key = {fds330670}
}
@article{fds330671,
Author = {Rudin, C and Waltz, D and Anderson, R and Boulanger, A and Salleb-Aouissi, A and Chow, M and Dutta, H and Gross, P and Huang, B and Ierome, S and Isaac, DF and Kressner, A and Passonneau, RJ and Radeva,
A and Wu, L},
Title = {Machine learning for the New York City power
grid},
Journal = {IEEE Transactions on Pattern Analysis and Machine
Intelligence},
Volume = {34},
Number = {2},
Pages = {328-345},
Publisher = {Institute of Electrical and Electronics Engineers
(IEEE)},
Year = {2012},
Month = {January},
url = {http://dx.doi.org/10.1109/TPAMI.2011.108},
Abstract = {Power companies can benefit from the use of knowledge
discovery methods and statistical machine learning for
preventive maintenance. We introduce a general process for
transforming historical electrical grid data into models
that aim to predict the risk of failures for components and
systems. These models can be used directly by power
companies to assist with prioritization of maintenance and
repair work. Specialized versions of this process are used
to produce 1) feeder failure rankings, 2) cable, joint,
terminator, and transformer rankings, 3) feeder Mean Time
Between Failure (MTBF) estimates, and 4) manhole events
vulnerability rankings. The process in its most general form
can handle diverse, noisy sources that are historical
(static), semi-real-time, or real-time, incorporates
state-of-the-art machine learning algorithms for
prioritization (supervised ranking or MTBF), and includes an
evaluation of results via cross-validation and blind test.
Above and beyond the ranked lists and MTBF estimates are
business management interfaces that allow the prediction
capability to be integrated directly into corporate planning
and decision support; such interfaces rely on several
important properties of our general modeling approach: that
machine learning features are meaningful to domain experts,
that the processing of data is transparent, and that
prediction results are accurate enough to support sound
decision making. We discuss the challenges in working with
historical electrical grid data that were not designed for
predictive purposes. The rawness of these data contrasts
with the accuracy of the statistical models that can be
obtained from the process; these models are sufficiently
accurate to assist in maintaining New York City's electrical
grid. © 2011 IEEE.},
Doi = {10.1109/TPAMI.2011.108},
Key = {fds330671}
}
@article{fds330672,
Author = {Tulabandhula, T and Rudin, C and Jaillet, P},
Title = {The machine learning and traveling repairman
problem},
Journal = {Lecture Notes in Computer Science (including subseries
Lecture Notes in Artificial Intelligence and Lecture Notes
in Bioinformatics)},
Volume = {6992 LNAI},
Pages = {262-276},
Publisher = {Springer Berlin Heidelberg},
Year = {2011},
Month = {October},
ISBN = {9783642248726},
url = {http://dx.doi.org/10.1007/978-3-642-24873-3_20},
Abstract = {The goal of the Machine Learning and Traveling Repairman
Problem (ML&TRP) is to determine a route for a "repair
crew," which repairs nodes on a graph. The repair crew aims
to minimize the cost of failures at the nodes, but the
failure probabilities are not known and must be estimated.
If there is uncertainty in the failure probability
estimates, we take this uncertainty into account in an
unusual way; from the set of acceptable models, we choose
the model that has the lowest cost of applying it to the
subsequent routing task. In a sense, this procedure agrees
with a managerial goal, which is to show that the data can
support choosing a low-cost solution. © 2011
Springer-Verlag.},
Doi = {10.1007/978-3-642-24873-3_20},
Key = {fds330672}
}
@article{fds330673,
Author = {Ertekin, S and Rudin, C},
Title = {On equivalence relationships between classification and
ranking algorithms},
Journal = {Journal of Machine Learning Research},
Volume = {12},
Pages = {2905-2929},
Year = {2011},
Month = {October},
Abstract = {We demonstrate that there are machine learning algorithms
that can achieve success for two separate tasks
simultaneously, namely the tasks of classification and
bipartite ranking. This means that advantages gained from
solving one task can be carried over to the other task, such
as the ability to obtain conditional density estimates, and
an order-of-magnitude reduction in computational time for
training the algorithm. It also means that some algorithms
are robust to the choice of evaluation metric used; they can
theoretically perform well when performance is measured
either by a misclassification error or by a statistic of the
ROC curve (such as the area under the curve). Specifically,
we provide such an equivalence relationship between a
generalization of Freund et al.'s RankBoost algorithm,
called the "P-Norm Push," and a particular cost-sensitive
classification algorithm that generalizes AdaBoost, which we
call "P-Classification." We discuss and validate the
potential benefits of this equivalence relationship, and
perform controlled experiments to understand
P-Classification's empirical performance. There is no
established equivalence relationship for logistic regression
and its ranking counterpart, so we introduce a
logistic-regression-style algorithm that aims in between
classification and ranking, and has promising experimental
performance with respect to both tasks. © 2011 Şeyda
Ertekin and Cynthia Rudin.},
Key = {fds330673}
}
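%% Illustrative sketch (Python): the two performance measures the
%% equivalence result concerns. One scoring function can be read as
%% a classifier (threshold at zero, misclassification error) or as a
%% ranker (AUC, the fraction of correctly ordered positive-negative
%% pairs). Scores and labels below are invented.
scores = [2.1, 0.1, 0.3, 1.2, -1.5, -0.2]
labels = [1, 1, -1, 1, -1, -1]

# classification view: sign of the score vs. the label
mis = sum((s >= 0) != (yi == 1)
          for s, yi in zip(scores, labels)) / len(labels)

# ranking view: pairwise ordering of positives above negatives
pos = [s for s, yi in zip(scores, labels) if yi == 1]
neg = [s for s, yi in zip(scores, labels) if yi == -1]
auc = sum(p > n for p in pos for n in neg) / (len(pos) * len(neg))

print(f"misclassification error = {mis:.3f}, AUC = {auc:.3f}")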
@article{fds330674,
Author = {Wu, L and Kaiser, G and Rudin, C and Anderson, R},
Title = {Data quality assurance and performance measurement of data
mining for preventive maintenance of power
grid},
Journal = {Proceedings of the 1st International Workshop on Data Mining
for Service and Maintenance, KDD4Service 2011 - Held in
Conjunction with SIGKDD'11},
Pages = {28-32},
Publisher = {ACM Press},
Year = {2011},
Month = {September},
ISBN = {9781450308427},
url = {http://dx.doi.org/10.1145/2018673.2018679},
Abstract = {Ensuring reliability as the electrical grid morphs into the
"smart grid" will require innovations in how we assess the
state of the grid, for the purpose of proactive maintenance,
rather than reactive maintenance; in the future, we will not
only react to failures, but also try to anticipate and avoid
them using predictive modeling (machine learning and data
mining) techniques. To help in meeting this challenge, we
present the Neutral Online Visualization-aided Autonomic
evaluation framework (NOVA) for evaluating machine learning
and data mining algorithms for preventive maintenance on the
electrical grid. NOVA has three stages provided through a
unified user interface: evaluation of input data quality,
evaluation of machine learning and data mining results, and
evaluation of the reliability improvement of the power grid.
A prototype version of NOVA has been deployed for the power
grid in New York City, and it is able to evaluate machine
learning and data mining systems effectively and
efficiently. Copyright 2011 ACM.},
Doi = {10.1145/2018673.2018679},
Key = {fds330674}
}
@article{fds330675,
Author = {Wu, L and Teravainen, T and Kaiser, G and Anderson, R and Boulanger, A and Rudin, C},
Title = {Estimation of system reliability using a semiparametric
model},
Journal = {IEEE 2011 EnergyTech, ENERGYTECH 2011},
Publisher = {IEEE},
Year = {2011},
Month = {August},
ISBN = {9781457707773},
url = {http://dx.doi.org/10.1109/EnergyTech.2011.5948537},
Abstract = {An important problem in reliability engineering is to
predict the failure rate, that is, the frequency with which
an engineered system or component fails. This paper presents
a new method of estimating failure rate using a
semiparametric model with Gaussian process smoothing. The
method is able to provide accurate estimation based on
historical data and it does not make strong a priori
assumptions of failure rate pattern (e.g., constant or
monotonic). Our experiments of applying this method in power
system failure data compared with other models show its
efficacy and accuracy. This method can be used in estimating
reliability for many other systems, such as software systems
or components. © 2011 IEEE.},
Doi = {10.1109/EnergyTech.2011.5948537},
Key = {fds330675}
}
@article{fds330676,
Author = {Rudin, C and Letham, B and Kogan, E and Madigan, D},
Title = {A Learning Theory Framework for Association Rules and
Sequential Events},
Year = {2011},
Month = {June},
Key = {fds330676}
}
@article{fds330677,
Author = {Rudin, C and Passonneau, RJ and Radeva, A and Ierome, S and Isaac,
DF},
Title = {21st-century data miners meet 19th-century electrical
cables},
Journal = {Computer},
Volume = {44},
Number = {6},
Pages = {103-105},
Publisher = {Institute of Electrical and Electronics Engineers
(IEEE)},
Year = {2011},
Month = {June},
url = {http://dx.doi.org/10.1109/MC.2011.164},
Abstract = {Researchers can repurpose even extremely raw historical data
for use in prediction. © 2006 IEEE.},
Doi = {10.1109/MC.2011.164},
Key = {fds330677}
}
@article{fds330678,
Author = {McCormick, T and Rudin, C and Madigan, D},
Title = {A Hierarchical Model for Association Rule Mining of
Sequential Events: An Approach to Automated Medical Symptom
Prediction},
Year = {2011},
Month = {January},
Key = {fds330678}
}
@article{fds330679,
Author = {Rudin, C and Letham, B and Salleb-Aouissi, A and Kogan, E and Madigan,
D},
Title = {Sequential event prediction with association
rules},
Journal = {Journal of Machine Learning Research},
Volume = {19},
Pages = {615-634},
Year = {2011},
Month = {January},
Abstract = {We consider a supervised learning problem in which data are
revealed sequentially and the goal is to determine what will
next be revealed. In the context of this problem, algorithms
based on association rules have a distinct advantage over
classical statistical and machine learning methods; however,
there has not previously been a theoretical foundation
established for using association rules in supervised
learning. We present two simple algorithms that incorporate
association rules, and provide generalization guarantees on
these algorithms based on algorithmic stability analysis
from statistical learning theory. We include a discussion of
the strict minimum support threshold often used in
association rule mining, and introduce an "adjusted
confidence" measure that provides a weaker minimum support
condition that has advantages over the strict minimum
support. The paper brings together ideas from statistical
learning theory, association rule mining and Bayesian
analysis. © 2011 C. Rudin, B. Letham, A. Salleb-Aouissi, E.
Kogan & D. Madigan.},
Key = {fds330679}
}
@article{fds330680,
Author = {Mukherjee, I and Rudin, C and Schapire, RE},
Title = {The rate of convergence of AdaBoost},
Journal = {Journal of Machine Learning Research},
Volume = {19},
Pages = {537-557},
Year = {2011},
Month = {January},
Abstract = {The AdaBoost algorithm of Freund and Schapire (1997) was
designed to combine many "weak" hypotheses that perform
slightly better than a random guess into a "strong"
hypothesis that has very low error. We study the rate at
which AdaBoost iteratively converges to the minimum of the
"exponential loss." Our
proofs do not require a weak-learning assumption, nor do
they require that minimizers of the exponential loss are
finite. Specifically, our first result shows that at
iteration t, the exponential loss of AdaBoost 's computed
parameter vector will be at most ε more than that of any
parameter vector of ℓ1-norm bounded by B in a number of
rounds that is bounded by a polynomial in B and 1/ε. We
also provide rate lower bound examples showing that a polynomial
dependence on these parameters is necessary. Our second
result is that within C/ε iterations, AdaBoost achieves a
value of the exponential loss that is at most ε more than
the best possible value, where C depends on the dataset. We
show that this dependence of the rate on ε is optimal up to
constant factors, i.e., at least Ω(1/ε) rounds are
necessary to achieve within ε of the optimal exponential
loss. © 2011 I. Mukherjee, C. Rudin & R.E.
Schapire.},
Key = {fds330680}
}
@article{fds330681,
Author = {Rudin, C and Passonneau, RJ and Radeva, A and Dutta, H and Ierome, S and Isaac, D},
Title = {A process for predicting manhole events in
Manhattan},
Journal = {Machine Learning},
Volume = {80},
Number = {1},
Pages = {1-31},
Publisher = {Springer Nature},
Year = {2010},
Month = {July},
url = {http://dx.doi.org/10.1007/s10994-009-5166-y},
Abstract = {We present a knowledge discovery and data mining process
developed as part of the Columbia/Con Edison project on
manhole event prediction. This process can assist with
real-world prioritization problems that involve raw data in
the form of noisy documents requiring significant amounts of
pre-processing. The documents are linked to a set of
instances to be ranked according to prediction criteria. In
the case of manhole event prediction, which is a new
application for machine learning, the goal is to rank the
electrical grid structures in Manhattan (manholes and
service boxes) according to their vulnerability to serious
manhole events such as fires, explosions and smoking
manholes. Our ranking results are currently being used to
help prioritize repair work on the Manhattan electrical
grid. © 2010 The Author(s).},
Doi = {10.1007/s10994-009-5166-y},
Key = {fds330681}
}
@article{fds330682,
Author = {Pelossof, R and Jones, M and Vovsha, I and Rudin,
C},
Title = {Online coordinate boosting},
Journal = {2009 IEEE 12th International Conference on Computer Vision
Workshops, ICCV Workshops 2009},
Pages = {1354-1361},
Year = {2009},
Month = {December},
ISBN = {9781424444427},
url = {http://dx.doi.org/10.1109/ICCVW.2009.5457454},
Abstract = {We present a new online boosting algorithm for updating the
weights of a boosted classifier, which yields a closer
approximation to the edges found by Freund and Schapire's
AdaBoost algorithm than previous online boosting algorithms.
We contribute a new way of deriving the online algorithm
that ties together previous online boosting work. The online
algorithm is derived by minimizing AdaBoost's loss as a
single example is added to the training set. The equations
show that the optimization is computationally expensive.
However, a fast online approximation is possible. We compare
approximation error to edges found by batch AdaBoost on
synthetic datasets and generalization error on face datasets
and the MNIST dataset. ©2009 IEEE.},
Doi = {10.1109/ICCVW.2009.5457454},
Key = {fds330682}
}
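%% A small sketch of the batch quantity the online method
%% approximates -- a weak classifier's "edge" under AdaBoost's
%% exponential-loss example weights -- and the full recomputation
%% a new example triggers. Illustrative only; the paper's actual
%% online update is derived by minimizing AdaBoost's loss as
%% each example is added, which this does not reproduce.
import numpy as np

def edge(h, y, d):
    """Weighted correlation of weak-classifier outputs with labels."""
    return float(np.sum(d * y * h))

y = np.array([+1, -1, +1, +1])        # labels seen so far
h = np.array([+1, +1, +1, -1])        # weak classifier outputs
f = np.array([0.8, -0.3, 0.1, 0.5])   # combined score per example
d = np.exp(-y * f); d /= d.sum()      # AdaBoost's example weights
print("edge before:", edge(h, y, d))

# One new example arrives; recomputing all weights from scratch is
# the expensive batch step an online approximation tries to avoid.
y = np.append(y, -1); h = np.append(h, -1); f = np.append(f, 0.0)
d = np.exp(-y * f); d /= d.sum()
print("edge after :", edge(h, y, d))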
@article{fds330683,
Author = {Radeva, A and Rudin, C and Passonneau, R and Isaac,
D},
Title = {Report cards for manholes: Eliciting expert feedback for a
learning task},
Journal = {8th International Conference on Machine Learning and
Applications, ICMLA 2009},
Pages = {719-724},
Publisher = {IEEE},
Year = {2009},
Month = {December},
ISBN = {9780769539263},
url = {http://dx.doi.org/10.1109/ICMLA.2009.72},
Abstract = {We present a manhole profiling tool, developed as part of
the Columbia/Con Edison machine learning project on manhole
event prediction, and discuss its role in evaluating our
machine learning model in three important ways: elimination
of outliers, elimination of falsely predictive features, and
assessment of the quality of the model. The model produces a
ranked list of tens of thousands of manholes in Manhattan,
where the ranking criterion is vulnerability to serious
events such as fires, explosions and smoking manholes. Con
Edison set two goals for the model, namely accuracy and
intuitiveness, and this tool made it possible for us to
address both of these goals. The tool automatically
assembles a "report card" or "profile" highlighting data
associated with a given manhole. Prior to the processing
work that underlies the profiling tool, case studies of a
single manhole took several days and resulted in an
incomplete study; locating manholes such as those we present
in this work would have been extremely difficult. The model
is currently assisting Con Edison in determining repair
priorities for the secondary electrical grid. © 2009
IEEE.},
Doi = {10.1109/ICMLA.2009.72},
Key = {fds330683}
}
@article{fds330684,
Author = {Rudin, C and Schapire, RE},
Title = {Margin-based ranking and an equivalence between AdaBoost and
RankBoost},
Journal = {Journal of Machine Learning Research},
Volume = {10},
Pages = {2193-2232},
Year = {2009},
Month = {November},
Abstract = {We study boosting algorithms for learning to rank. We give a
general margin-based bound for ranking based on covering
numbers for the hypothesis space. Our bound suggests that
algorithms that maximize the ranking margin will generalize
well. We then describe a new algorithm, smooth margin
ranking, that precisely converges to a maximum
ranking-margin solution. The algorithm is a modification of
RankBoost, analogous to "approximate coordinate ascent
boosting." Finally, we prove that AdaBoost and RankBoost are
equally good for the problems of bipartite ranking and
classification in terms of their asymptotic behavior on the
training set. Under natural conditions, AdaBoost achieves an
area under the ROC curve that is equally as good as
RankBoost's; furthermore, RankBoost, when given a specific
intercept, achieves a misclassification error that is as
good as AdaBoost's. This may help to explain the empirical
observations made by Cortes and Mohri, and Caruana and
Niculescu-Mizil, about the excellent performance of AdaBoost
as a bipartite ranking algorithm, as measured by the area
under the ROC curve. © 2009 Cynthia Rudin and Robert E.
Schapire.},
Key = {fds330684}
}
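%% The AUC in the abstract above is the pairwise ranking
%% statistic sketched below: the fraction of (positive, negative)
%% pairs a scoring function orders correctly, ties counted as
%% half. A minimal sketch, not tied to either boosting algorithm.
import numpy as np

def auc(scores, y):
    """Area under the ROC curve as a pairwise ordering statistic."""
    diffs = scores[y == +1][:, None] - scores[y == -1][None, :]
    return float(np.mean(diffs > 0) + 0.5 * np.mean(diffs == 0))

scores = np.array([2.3, 0.1, 1.7, -0.4, 0.9])
labels = np.array([+1, -1, +1, -1, -1])
print(auc(scores, labels))  # 1.0: every positive outscores every negative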
@article{fds330685,
Author = {Rudin, C},
Title = {The P-norm push: A simple convex ranking algorithm that
concentrates at the top of the list},
Journal = {Journal of Machine Learning Research},
Volume = {10},
Pages = {2233-2271},
Year = {2009},
Month = {November},
Abstract = {We are interested in supervised ranking algorithms that
perform especially well near the top of the ranked list, and
are only required to perform sufficiently well on the rest
of the list. In this work, we provide a general form of
convex objective that gives high-scoring examples more
importance. This "push" near the top of the list can be
chosen arbitrarily large or small, based on the preference
of the user. We choose ℓp-norms to provide a specific type
of push; if the user sets p larger, the objective
concentrates harder on the top of the list. We derive a
generalization bound based on the p-norm objective, working
around the natural asymmetry of the problem. We then derive
a boosting-style algorithm for the problem of ranking with a
push at the top. The usefulness of the algorithm is
illustrated through experiments on repository data. We prove
that the minimizer of the algorithm's objective is unique in
a specific sense. Furthermore, we illustrate how our
objective is related to quality measurements for information
retrieval. © 2009 Cynthia Rudin.},
Key = {fds330685}
}
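%% A sketch of a p-norm-push-style objective, assuming the form
%% suggested by the abstract: each negative example contributes
%% the p-th power of its total pairwise exponential loss against
%% the positives, so larger p penalizes high-scoring negatives --
%% the "push" at the top of the list. The paper's exact objective
%% may differ in details.
import numpy as np

def pnorm_push(pos_scores, neg_scores, p=4.0):
    # Pairwise losses exp(-(f(x_i) - f(x_k))), positive i vs. negative k.
    pairwise = np.exp(-(pos_scores[:, None] - neg_scores[None, :]))
    return float(np.sum(pairwise.sum(axis=0) ** p))

pos = np.array([1.5, 1.0, 0.8])
neg_low = np.array([-1.0, -0.9, 0.2])   # negatives safely down the list
neg_top = np.array([-1.0, -0.9, 1.2])   # one negative near the very top
print(pnorm_push(pos, neg_low, p=4))
print(pnorm_push(pos, neg_top, p=4))    # much larger: the top dominates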
@article{fds330686,
Author = {Passonneau, RJ and Rudin, C and Radeva, A and Liu,
ZA},
Title = {Reducing noise in labels and features for a real world
dataset: Application of NLP corpus annotation
methods},
Journal = {Lecture Notes in Computer Science (including subseries
Lecture Notes in Artificial Intelligence and Lecture Notes
in Bioinformatics)},
Volume = {5449 LNCS},
Pages = {86-97},
Publisher = {Springer Berlin Heidelberg},
Year = {2009},
Month = {July},
ISBN = {9783642003813},
url = {http://dx.doi.org/10.1007/978-3-642-00382-0_7},
Abstract = {This paper illustrates how a combination of information
extraction, machine learning, and NLP corpus annotation
practice was applied to a problem of ranking vulnerability
of structures (service boxes, manholes) in the Manhattan
electrical grid. By adapting NLP corpus annotation methods
to the task of knowledge transfer from domain experts, we
compensated for the lack of operational definitions of
components of the model, such as serious event. The machine
learning depended on the ticket classes, but it was not the
end goal. Rather, our rule-based document classification
determines both the labels of examples and their feature
representations. Changes in our classification of events led
to improvements in our model, as reflected in the AUC scores
for the full ranked list of over 51K structures. The
improvements for the very top of the ranked list, which is
of most importance for prioritizing work on the electrical
grid, affected one in every four or five structures. ©
Springer-Verlag Berlin Heidelberg 2009.},
Doi = {10.1007/978-3-642-00382-0_7},
Key = {fds330686}
}
@article{fds330687,
Author = {Roth, R and Rambow, O and Habash, N and Diab, M and Rudin,
C},
Title = {Arabic morphological tagging, diacritization, and
lemmatization using lexeme models and feature
ranking},
Journal = {ACL-08: HLT - 46th Annual Meeting of the Association for
Computational Linguistics: Human Language Technologies,
Proceedings of the Conference},
Pages = {117-120},
Year = {2008},
Month = {January},
ISBN = {9781932432046},
url = {http://dx.doi.org/10.3115/1557690.1557721},
Abstract = {We investigate the tasks of general morphological tagging,
diacritization, and lemmatization for Arabic. We show that
for all tasks we consider, both modeling the lexeme
explicitly, and retuning the weights of individual
classifiers for the specific task, improve the performance.
© 2008 Association for Computational Linguistics.},
Doi = {10.3115/1557690.1557721},
Key = {fds330687}
}
@article{fds330688,
Author = {Rudin, C and Schapire, RE and Daubechies, I},
Title = {Analysis of boosting algorithms using the smooth margin
function},
Journal = {Annals of Statistics},
Volume = {35},
Number = {6},
Pages = {2723-2768},
Publisher = {Institute of Mathematical Statistics},
Year = {2007},
Month = {December},
url = {http://dx.doi.org/10.1214/009053607000000785},
Abstract = {We introduce a useful tool for analyzing boosting algorithms
called the "smooth margin function," a differentiable
approximation of the usual margin for boosting algorithms.
We present two boosting algorithms based on this smooth
margin, "coordinate ascent boosting" and "approximate
coordinate ascent boosting," which are similar to Freund and
Schapire's AdaBoost algorithm and Breiman's arc-gv
algorithm. We give convergence rates to the maximum margin
solution for both of our algorithms and for arc-gv. We then
study AdaBoost's convergence properties using the smooth
margin function. We precisely bound the margin attained by
AdaBoost when the edges of the weak classifiers fall within
a specified range. This shows that a previous bound proved
by Rätsch and Warmuth is exactly tight. Furthermore, we use
the smooth margin to capture explicit properties of AdaBoost
in cases where cyclic behavior occurs. © Institute of
Mathematical Statistics, 2007.},
Doi = {10.1214/009053607000000785},
Key = {fds330688}
}
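%% The smooth margin function, in notation commonly used in this
%% line of work (assumed here, not quoted from the paper):
%% M_{ij} = y_i h_j(x_i) is the margin matrix and lambda >= 0 the
%% coefficient vector over the weak classifiers.
\[
  G(\lambda) \;=\; \frac{-\ln F(\lambda)}{\sum_{j} \lambda_j},
  \qquad
  F(\lambda) \;=\; \sum_{i=1}^{m} e^{-(M\lambda)_i}.
\]
%% Since F(\lambda) >= exp(-min_i (M\lambda)_i), G is a smooth
%% lower bound on the usual margin min_i (M\lambda)_i / \|\lambda\|_1
%% and approaches it as the exponential loss F(\lambda) tends to 0.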
@article{fds330689,
Author = {Rudin, C},
Title = {Ranking with a P-norm push},
Journal = {Lecture Notes in Computer Science (including subseries
Lecture Notes in Artificial Intelligence and Lecture Notes
in Bioinformatics)},
Volume = {4005 LNAI},
Pages = {589-604},
Year = {2006},
Month = {January},
ISBN = {9783540352945},
url = {http://dx.doi.org/10.1007/11776420_43},
Abstract = {We are interested in supervised ranking with the following
twist: our goal is to design algorithms that perform
especially well near the top of the ranked list, and are
only required to perform sufficiently well on the rest of
the list. Towards this goal, we provide a general form of
convex objective that gives high-scoring examples more
importance. This "push" near the top of the list can be
chosen to be arbitrarily large or small. We choose
ℓp-norms to provide a specific type of push; as p becomes
large, the algorithm concentrates harder near the top of the
list. We derive a generalization bound based on the p-norm
objective. We then derive a corresponding boosting-style
algorithm, and illustrate the usefulness of the algorithm
through experiments on UCI data. We prove that the minimizer
of the objective is unique in a specific sense. ©
Springer-Verlag Berlin Heidelberg 2006.},
Doi = {10.1007/11776420_43},
Key = {fds330689}
}
@article{fds376879,
Author = {Ji, H and Rudin, C and Grishman, R},
Title = {Re-Ranking Algorithms for Name Tagging},
Journal = {HLT-NAACL 2006 - Computationally Hard Problems and Joint
Inference in Speech and Language Processing, Proceedings of
the Workshop},
Pages = {49-56},
Year = {2006},
Month = {January},
Abstract = {Integrating information from different stages of an NLP
processing pipeline can yield significant error reduction.
We demonstrate how re-ranking can improve name tagging in a
Chinese information extraction system by incorporating
information from relation extraction, event extraction, and
coreference. We evaluate three state-of-the-art re-ranking
algorithms (MaxEnt-Rank, SVMRank, and p-Norm Push Ranking),
and show the benefit of multi-stage re-ranking for
cross-sentence and cross-document inference.},
Key = {fds376879}
}
@article{fds330690,
Author = {Rudin, C and Cortes, C and Mohri, M and Schapire,
RE},
Title = {Margin-based ranking meets boosting in the
middle},
Journal = {Lecture Notes in Computer Science (including subseries
Lecture Notes in Artificial Intelligence and Lecture Notes
in Bioinformatics)},
Volume = {3559 LNAI},
Pages = {63-78},
Publisher = {Springer Berlin Heidelberg},
Year = {2005},
Month = {January},
ISBN = {9783540265566},
url = {http://dx.doi.org/10.1007/11503415_5},
Abstract = {We present several results related to ranking. We give a
general margin-based bound for ranking based on the L∞
covering number of the hypothesis space. Our bound suggests
that algorithms that maximize the ranking margin generalize
well. We then describe a new algorithm, Smooth Margin
Ranking, that precisely converges to a maximum
ranking-margin solution. The algorithm is a modification of
RankBoost, analogous to Approximate Coordinate Ascent
Boosting. We also prove a remarkable property of AdaBoost:
under very natural conditions, AdaBoost maximizes the
exponentiated loss associated with the AUC and achieves the
same AUC as RankBoost. This explains the empirical
observations made by Cortes and Mohri, and Caruana and
Niculescu-Mizil, about the excellent performance of AdaBoost
as a ranking algorithm, as measured by the AUC. ©
Springer-Verlag Berlin Heidelberg 2005.},
Doi = {10.1007/11503415_5},
Key = {fds330690}
}
@article{fds330692,
Author = {Rudin, C and Daubechies, I and Schapire, RE},
Title = {The dynamics of AdaBoost: Cyclic behavior and convergence of
margins},
Journal = {Journal of Machine Learning Research},
Volume = {5},
Pages = {1557-1595},
Year = {2004},
Month = {December},
Abstract = {In order to study the convergence properties of the AdaBoost
algorithm, we reduce AdaBoost to a nonlinear iterated map
and study the evolution of its weight vectors. This
dynamical systems approach allows us to understand
AdaBoost's convergence properties completely in certain
cases; for these cases we find stable cycles, allowing us to
explicitly solve for AdaBoost's output. Using this unusual
technique, we are able to show that AdaBoost does not always
converge to a maximum margin combined classifier, answering
an open question. In addition, we show that "nonoptimal"
AdaBoost (where the weak learning algorithm does not
necessarily choose the best weak classifier at each
iteration) may fail to converge to a maximum margin
classifier, even if "optimal" AdaBoost produces a maximum
margin. Also, we show that if AdaBoost cycles, it cycles
among "support vectors", i.e., examples that achieve the
same smallest margin.},
Key = {fds330692}
}
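%% A toy simulation in the spirit of the paper's reduction of
%% AdaBoost to an iterated map on its weight vector (the reduced
%% map there is derived analytically; this simply iterates the
%% standard update). Three examples and three weak classifiers,
%% with classifier j misclassifying only example j -- a symmetric
%% setting in which cycling can be observed.
import numpy as np

M = np.array([[-1,  1,  1],
              [ 1, -1,  1],
              [ 1,  1, -1]], dtype=float)  # M[i, j] = y_i * h_j(x_i)

d = np.array([0.5, 0.3, 0.2])              # initial (non-uniform) weights
for t in range(12):
    r = d.dot(M)                           # edges of the three classifiers
    j = int(np.argmax(r))                  # "optimal" AdaBoost: best edge
    alpha = 0.5 * np.log((1 + r[j]) / (1 - r[j]))
    d = d * np.exp(-alpha * M[:, j])
    d /= d.sum()
    print(f"t={t+1}: chose h_{j+1}, weights = {np.round(d, 4)}")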
@article{fds330691,
Author = {Rudin, C and Schapire, RE and Daubechies, I},
Title = {Boosting based on a smooth margin},
Journal = {Lecture Notes in Artificial Intelligence (Subseries of
Lecture Notes in Computer Science)},
Volume = {3120},
Pages = {502-517},
Publisher = {Springer Berlin Heidelberg},
Year = {2004},
Month = {January},
url = {http://dx.doi.org/10.1007/978-3-540-27819-1_35},
Abstract = {We study two boosting algorithms, Coordinate Ascent Boosting
and Approximate Coordinate Ascent Boosting, which are
explicitly designed to produce maximum margins. To derive
these algorithms, we introduce a smooth approximation of the
margin that one can maximize in order to produce a maximum
margin classifier. Our first algorithm is simply coordinate
ascent on this function, involving a line search at each
step. We then make a simple approximation of this line
search to reveal our second algorithm. These algorithms are
proven to asymptotically achieve maximum margins, and we
provide two convergence rate calculations. The second
calculation yields a faster rate of convergence than the
first, although the first gives a more explicit (still fast)
rate. These algorithms are very similar to AdaBoost in that
they are based on coordinate ascent, easy to implement, and
empirically tend to converge faster than other boosting
algorithms. Finally, we attempt to understand AdaBoost in
terms of our smooth margin, focusing on cases where AdaBoost
exhibits cyclic behavior.},
Doi = {10.1007/978-3-540-27819-1_35},
Key = {fds330691}
}
@article{fds330693,
Author = {Rudin, C and Daubechies, I and Schapire, RE},
Title = {On the dynamics of boosting},
Journal = {Advances in Neural Information Processing
Systems},
Pages = {1101-1108},
Publisher = {M I T PRESS},
Editor = {Thrun, S and Saul, LK and Schölkopf, B},
Year = {2004},
Month = {January},
ISBN = {9780262201520},
Abstract = {In order to understand AdaBoost's dynamics, especially its
ability to maximize margins, we derive an associated
simplified nonlinear iterated map and analyze its behavior
in low-dimensional cases. We find stable cycles for these
cases, which can explicitly be used to solve for AdaBoost's
output. By considering AdaBoost as a dynamical
system, we are able to prove Rätsch and Warmuth's
conjecture that AdaBoost may fail to converge to a
maximal-margin combined classifier when given a 'nonoptimal'
weak learning algorithm. AdaBoost is known to be a
coordinate descent method, but other known algorithms that
explicitly aim to maximize the margin (such as AdaBoost* and
arc-gv) are not. We consider a differentiable function for
which coordinate ascent will yield a maximum margin
solution. We then make a simple approximation to derive a
new boosting algorithm whose updates are slightly more
aggressive than those of arc-gv.},
Key = {fds330693}
}