<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id><journal-id journal-id-type="publisher-id">medinform</journal-id><journal-id journal-id-type="index">7</journal-id><journal-title>JMIR Medical Informatics</journal-title><abbrev-journal-title>JMIR Med Inform</abbrev-journal-title><issn pub-type="epub">2291-9694</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v13i1e66200</article-id><article-id pub-id-type="doi">10.2196/66200</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>A Responsible Framework for Assessing, Selecting, and Explaining Machine Learning Models in Cardiovascular Disease Outcomes Among People With Type 2 Diabetes: Methodology and Validation Study</article-title></title-group><contrib-group><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Yang</surname><given-names>Yang</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Liao</surname><given-names>Che-Yi</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Keyvanshokooh</surname><given-names>Esmaeil</given-names></name><degrees>MS, PhD</degrees><xref 
ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Shao</surname><given-names>Hui</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Weber</surname><given-names>Mary Beth</given-names></name><degrees>MPH, PhD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Pasquel</surname><given-names>Francisco J</given-names></name><degrees>MPH, MD</degrees><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Garcia</surname><given-names>Gian-Gabriel P</given-names></name><degrees>MS, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>H. Milton Stewart School of Industrial and Systems Engineering, Georgia Institute of Technology</institution><addr-line>765 Ferst Dr NW</addr-line><addr-line>Atlanta</addr-line><addr-line>GA</addr-line><country>United States</country></aff><aff id="aff2"><institution>Department of Information and Operations Management, Mays Business School, Texas A&#x0026;M University</institution><addr-line>College Station</addr-line><addr-line>TX</addr-line><country>United States</country></aff><aff id="aff3"><institution>Hubert Department of Global Health, Rollins School of Public Health, Emory University</institution><addr-line>Atlanta</addr-line><addr-line>GA</addr-line><country>United States</country></aff><aff id="aff4"><institution>Division of Endocrinology, Metabolism, and Lipids, Department of Medicine, Emory University School of Medicine</institution><addr-line>Atlanta</addr-line><addr-line>GA</addr-line><country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name 
name-style="western"><surname>Benis</surname><given-names>Arriel</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Militello</surname><given-names>Carmelo</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Wiberg</surname><given-names>Holly</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Gian-Gabriel P Garcia, MS, PhD, H. Milton Stewart School of Industrial and Systems Engineering, Georgia Institute of Technology, 765 Ferst Dr NW, Atlanta, GA, 30332-0001, United States, 1 404-385-3140; <email>giangarcia@gatech.edu</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>these authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>27</day><month>6</month><year>2025</year></pub-date><volume>13</volume><elocation-id>e66200</elocation-id><history><date date-type="received"><day>10</day><month>09</month><year>2024</year></date><date date-type="rev-recd"><day>21</day><month>04</month><year>2025</year></date><date date-type="accepted"><day>26</day><month>04</month><year>2025</year></date></history><copyright-statement>&#x00A9; Yang Yang, Che-Yi Liao, Esmaeil Keyvanshokooh, Hui Shao, Mary Beth Weber, Francisco J Pasquel, Gian-Gabriel P Garcia. Originally published in JMIR Medical Informatics (<ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org">https://medinform.jmir.org</ext-link>), 27.6.2025. 
</copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org/">https://medinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://medinform.jmir.org/2025/1/e66200"/><abstract><sec><title>Background</title><p>Building machine learning models that are interpretable, explainable, and fair is critical for their trustworthiness in clinical practice. Interpretability, which refers to how easily a human can comprehend the mechanism by which a model makes predictions, is often seen as a primary consideration when adopting a machine learning model in health care. However, interpretability alone does not necessarily guarantee explainability, which offers stakeholders insights into a model&#x2019;s predicted outputs. Moreover, many existing frameworks for model evaluation focus primarily on maximizing predictive accuracy, overlooking the broader need for interpretability, fairness, and explainability.</p></sec><sec><title>Objective</title><p>This study proposes a 3-stage machine learning framework for responsible model development through model assessment, selection, and explanation. 
We demonstrate the application of this framework for predicting cardiovascular disease (CVD) outcomes, specifically myocardial infarction (MI) and stroke, among people with type 2 diabetes (T2D).</p></sec><sec sec-type="methods"><title>Methods</title><p>We extracted participant data comprising people with T2D from the ACCORD (Action to Control Cardiovascular Risk in Diabetes) dataset (N=9635), including demographic, clinical, and biomarker records. Then, we applied hold-out cross-validation to develop several interpretable machine learning models (linear, tree-based, and ensemble) to predict the risks of MI and stroke among patients with diabetes. Our 3-stage framework first assesses these models via predictive accuracy and fairness metrics. Then, in the model selection stage, we quantify the trade-off between accuracy and fairness using area under the curve (AUC) and the Relative Performance Parity Score (RPPS), wherein RPPS measures the greatest deviation of all subpopulations compared with the population-wide AUC. Finally, we quantify the explainability of the chosen models using methods such as SHAP (Shapley Additive Explanations) and partial dependence plots to investigate the relationship between features and model outputs.</p></sec><sec sec-type="results"><title>Results</title><p>Our proposed framework demonstrates that the GLMnet model offers the best balance between predictive performance and fairness for both MI and stroke. For MI, GLMnet achieves the highest RPPS (0.979 for gender and 0.967 for race), indicating minimal performance disparities, while maintaining a high AUC of 0.705. For stroke, GLMnet has a relatively high AUC of 0.705 and the second-highest RPPS (0.961 for gender and 0.979 for race), suggesting it is effective across both subgroups. 
Our model explanation method further highlights that the history of CVD and age are the key predictors of MI, while HbA<sub>1c</sub> and systolic blood pressure significantly influence stroke classification.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>This study establishes a responsible framework for assessing, selecting, and explaining machine learning models, emphasizing accuracy-fairness trade-offs in predictive modeling. Key insights include: (1) simple models perform comparably to complex ensembles; (2) models with strong accuracy may harbor substantial differences in accuracy across demographic groups; and (3) explanation methods reveal the relationships between features and risk for MI and stroke. Our results underscore the need for holistic approaches that consider accuracy, fairness, and explainability in interpretable model design and selection, potentially enhancing health care technology adoption.</p></sec></abstract><kwd-group><kwd>interpretable machine learning</kwd><kwd>explainability</kwd><kwd>fairness</kwd><kwd>type 2 diabetes</kwd><kwd>cardiovascular disease</kwd><kwd>responsible framework</kwd><kwd>cardiovascular</kwd><kwd>cardiology</kwd><kwd>machine learning</kwd><kwd>diabetes</kwd><kwd>clinical practice</kwd><kwd>myocardial infarction</kwd><kwd>MI</kwd><kwd>stroke</kwd><kwd>prediction</kwd><kwd>T2D</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Building trustworthy machine learning models for clinical practice requires consideration of interpretability, explainability, as well as fairness. 
Interpretability&#x2014;which refers to how easily a human can comprehend the mechanism by which a model makes predictions&#x2014;is important in health care settings because of the need for clinicians and patients to understand and trust the Artificial Intelligence (AI)-involved decisions that directly impact patient care [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. It also facilitates regulatory compliance and ethical considerations in medical AI applications, ensuring these systems are not only effective but also justifiable and accountable [<xref ref-type="bibr" rid="ref3">3</xref>]. In addressing this pressing need, experts in computer science, operations research, and medical informatics have significantly progressed the field of interpretable machine learning models, laying the foundation for the development of AI [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>]. As of today, cutting-edge interpretable machine learning models are available through many open-source software packages, including RiskSLIM and Interpretable AI (Interpretable AI), among others [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref6">6</xref><xref ref-type="bibr" rid="ref7">7</xref>]. Nonetheless, interpretability alone is not sufficient for trustworthy AI in health care.</p><p>Beyond interpretability, the machine learning community has also begun to emphasize the need for explainability [<xref ref-type="bibr" rid="ref8">8</xref>] which focuses on conveying understandable reasons behind AI-driven decisions. While interpretability helps users grasp how a model arrives at a conclusion, explainability provides the why, offering justifications in human terms. This reasoning is crucial in health care, where clinicians and patients must not only understand but also trust the rationale of AI suggestions [<xref ref-type="bibr" rid="ref9">9</xref>]. 
Therefore, explainability builds trust, enhances decision-making quality by providing insights into AI reasoning, and ensures compliance with ethical and legal standards [<xref ref-type="bibr" rid="ref10">10</xref>]. In other words, explainability plays a crucial role in making machine learning algorithms and AI not just transparent but also relatable and trustworthy in clinical settings.</p><p>Meanwhile, there has been an increasing worry about the possibility of machine learning models leading to biased decisions [<xref ref-type="bibr" rid="ref11">11</xref>]. Examples include models displaying racial or gender biases in predicting patient outcomes [<xref ref-type="bibr" rid="ref12">12</xref>], or algorithms that disproportionately favor certain demographics in resource allocation [<xref ref-type="bibr" rid="ref13">13</xref>]. Such biased decision-making tools may result in unfair evaluations in clinical settings, ultimately harming patients who require care [<xref ref-type="bibr" rid="ref14">14</xref>]. However, efforts to quantitatively evaluate fairness in prediction models for clinical practice are still scarce [<xref ref-type="bibr" rid="ref15">15</xref>].</p><p>A model with high predictive accuracy does not guarantee the best clinical usage, as it might display unfavorable biases [<xref ref-type="bibr" rid="ref16">16</xref>]. As a result, it is important to understand and quantify the trade-offs between accuracy and fairness in model selection.</p><p>Overall, the combined exploration and consideration of these aspects in a machine learning framework-based environment has not been thoroughly investigated in the literature [<xref ref-type="bibr" rid="ref17">17</xref>]. Therefore, to address these issues systematically, we propose a 3-stage machine learning framework for model assessment, selection, and explainability that integrates interpretability, fairness, as well as explainability in health care decision-making. 
Specifically, in the first stage, we develop and assess a range of models based on predictive accuracy and fairness. Next, we select the model that best balances accuracy and fairness using a novel trade-off curve. Finally, we explain the chosen model, aiming to provide deeper insights into its predictions for informed clinical decision-making. As a proof of concept, we apply our framework to predict cardiovascular disease (CVD) outcomes, myocardial infarction (MI), and stroke, among people with type 2 diabetes (T2D). With CVD being a leading cause of death in the United States, and patients with T2D being at elevated risk of CVD, it is urgent to develop accurate and fair predictive models that generate clinically reasonable predictions [<xref ref-type="bibr" rid="ref16">16</xref>-<xref ref-type="bibr" rid="ref19">19</xref>]. This study not only contributes to the advancement of AI in health care but also sets a precedent for future research in incorporating interpretability, fairness, and explainability into the machine learning model development framework, paving the way for more ethical, trustworthy, and effective solutions in medical informatics.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Overview</title><p>In this section, we first detail our study data and model development approach. Then, we describe our responsible framework for model assessment, selection, and explanation. This study is based on a secondary analysis of individual-level participant data from the Action to Control Cardiovascular Risk in Diabetes study (ACCORD 2001&#x2010;09, NCT00000620) by the National Heart, Lung, and Blood Institute (NHLBI) of United States [<xref ref-type="bibr" rid="ref15">15</xref>]. 
Our code repository is included in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec><sec id="s2-2"><title>Data</title><p>ACCORD (Action to Control Cardiovascular Risk in Diabetes) was a randomized, multicenter, double 2&#x00D7;2 factorial design study conducted at 77 clinical sites in North America. Participants were aged between 40 and 79 years, had T2D with a hemoglobin A1c (HbA<sub>1c</sub>) &#x2265; 7.5% (57 mmol/mol), and had previous evidence of CVD or cardiovascular risk factors (eg, dyslipidemia, hypertension, smoking, or obesity). The primary outcome of ACCORD was determined based on the first instance of a significant CVD event, which was characterized by a combination of nonfatal MI, nonfatal stroke, or cardiovascular death. We extracted demographic, clinical, and biomarker data collected at baseline (study entry) from individual participants across trials for model development.</p></sec><sec id="s2-3"><title>Data Preprocessing</title><p>We focused on 2 primary CVD events, MI and stroke, as our study outcomes, and used patients&#x2019; demographic data, clinical risk factors, medication history, and pertinent biomarkers as predictors. All predictor variables were collected at the time of study enrollment, ensuring that our models use information available at the point of care. We prepared the study outcomes as binary variables: patients either experienced MI or stroke within the 5-year period, or they did not. In other words, the interpretable machine learning models act as classification tools to identify if a patient is at risk of experiencing these CVD events in the next 5 years.</p><p>We sourced candidate predictor variables for fatal or nonfatal MI and fatal or nonfatal stroke outcomes from eligibility screening or clinical examination data in ACCORD after applying the inclusion criteria. These predictors encompass demographic characteristics, clinical factors, medication history, and relevant biomarkers. 
Complete case models were constructed using all predictor variables, without using imputation, as only 616 out of 10,251 observations (6%) in the dataset had missing values across any of the predictors. We dropped the records with missing data and applied one-hot encoding for categorical predictors to obtain the final dataset for model development [<xref ref-type="bibr" rid="ref20">20</xref>]. Specifically, creating dummy variables involved transforming categorical variables, such as treatment type (eg, intensive vs standard glycemic therapy) and medication history (eg, blood pressure- or lipid-lowering treatments), into binary indicators via one-hot encoding [<xref ref-type="bibr" rid="ref21">21</xref>]. This step ensured that the categorical data was appropriately formatted for model development. This data-preprocessing pipeline is outlined in <xref ref-type="fig" rid="figure1">Figure 1</xref>.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Data-preprocessing steps before model development.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e66200_fig01.png"/></fig></sec><sec id="s2-4"><title>Machine Learning Model Development</title><p>We developed several machine learning models to demonstrate our proposed framework for fairness-aware model assessment and selection. Our framework is specifically designed for scenarios where model development, selection, and deployment are treated as separate processes. The machine learning models we consider include interpretable models (eg, linear and tree-based), semi-interpretable models (eg, random forest), and common statistical and machine learning models (eg, na&#x00EF;ve Bayes), for binary classification of MI and stroke. 
In the following paragraphs, we provide details of the models developed in this research and describe the model-tuning procedure.</p><p>We evaluated a range of machine learning models, including linear models (GLMnet and OFS), tree-based models (CART and OCT), ensemble models (random forest and XGBoost), and other traditional machine learning approaches (na&#x00EF;ve Bayes and SVM). Linear and tree-based models are generally considered interpretable due to their structure and parameterization [<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref23">23</xref>], while ensemble models and other methods can achieve strong predictive performance but may be less directly interpretable. OFS formulates the logistic regression with L2 penalties into a binary convex optimization problem and solves it to optimality. Within the OFS framework, there are 2 key parameters: the regularization parameter (balancing model complexity against accuracy) and the sparsity parameter (enhancing interpretability by controlling feature count) [<xref ref-type="bibr" rid="ref24">24</xref>]. OCT derives the tree by optimizing the tree structure (size) and decision rules simultaneously via mixed integer optimization [<xref ref-type="bibr" rid="ref25">25</xref>]. The main hyperparameters in OCT include the maximum depth of a tree, the minimum leaf size, and the complexity parameter, playing a crucial role in preventing overfitting, ensuring stability, and fostering interpretability. We limited the max depth of an OCT to 3 and 4 to enhance interpretability by simplifying the decision structure. We remark that OCT exactly recovers the optimal tree (given fixed hyperparameters) at the cost of additional computational complexity, whereas CART uses heuristic splitting rules in branching nodes to generate a decision tree quickly. 
Detailed descriptions of each model&#x2019;s structure, key hyperparameters, and training procedure are provided in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>.</p><p>To build our models, we randomly divided our data using a 70&#x2010;30 train-test split, using 6745 out of 9635 for training and the remaining 2890 out of 9635 for testing. Then, with the training data, we applied 10-fold cross-validation for hyperparameter tuning. We also found that our data were class-imbalanced, and there were very few occurrences of CVD events. To address this class imbalance, we adjusted the weight assigned to each label during our hyperparameter tuning. Each model&#x2019;s performance was estimated using both cross-validation on the training set and out-of-sample validation on the testing set.</p></sec><sec id="s2-5"><title>Responsible Framework for Model Assessment, Selection, and Explanation</title><p>After building the machine learning models, it is desired to select a suitable model and investigate the relationship between predictors and the outcomes. In this section, we detail our responsible framework for model assessment, selection, and explanation. We outline this framework in <xref ref-type="fig" rid="figure2">Figure 2</xref>. Our proposed responsible framework consists of three main modules after model development: (1) model assessment, (2) model selection, and (3) model explanation. In the model assessment module, we assess each model&#x2019;s performance in their predictive capabilities and fairness. Then, a sensitivity analysis of the trade-off between these performance metrics is carried out to aid model selection. Finally, for model explanation, we use a unified approach that combines multiple methods to explain the best-performing models. 
We developed our proposed responsible framework with R language (v4.3.1, R Foundation) and the following R libraries: Interpretable AI (v3.2.1, Interpretable AI), survival (v3.5.5, Mayo Clinic), GLMnet (v4.1.8, Stanford University), rpart (v4.1.19, Mayo Clinic), naivebayes (v1.0.0), and kernlab (v0.9.32, TU Wien).</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>The proposed responsible framework for model assessment, selection, and explanation.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e66200_fig02.png"/></fig><sec id="s2-5-1"><title>Model Assessment</title><p>In this module, we describe the model assessment procedure for the developed interpretable models according to their predictive performance and fairness. To evaluate predictive accuracy, we considered four metrics: (1) area under the receiver operating characteristic curve (AUC), (2) sensitivity, (3) specificity, and (4) accuracy. AUC evaluates a model&#x2019;s ability to distinguish between positive and negative classes, with higher values indicating better discriminative performance. It is particularly reliable for imbalanced data due to its threshold-independence. Sensitivity (true positive rate) measures the correct identification of actual positives (ie, patients developed MI or stroke), while specificity (true negative rate) assesses the correct identification of negatives. Accuracy quantifies the proportion of correct classification of the patients in the total cases examined. Sensitivity, specificity, and accuracy are threshold-dependent for models that output continuous scores (eg, predicted probabilities). Importantly, it is common to carefully select this classification threshold to optimize the model&#x2019;s predictive performance, with respect to these performance metrics for specific applications. 
To address this challenge, we set a threshold which maximizes a weighted metric combining sensitivity and specificity, that is, <inline-formula><mml:math id="ieqn1"><mml:mi>u</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>s</mml:mi><mml:mi>e</mml:mi><mml:mi>n</mml:mi><mml:mi>s</mml:mi><mml:mi>i</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi><mml:mi>v</mml:mi><mml:mi>i</mml:mi><mml:mi>t</mml:mi><mml:mi>y</mml:mi><mml:mo>+</mml:mo><mml:mfenced separators="|"><mml:mrow><mml:mn>1</mml:mn><mml:mo>-</mml:mo><mml:mi>u</mml:mi></mml:mrow></mml:mfenced><mml:mo>&#x00D7;</mml:mo><mml:mi>s</mml:mi><mml:mi>p</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi><mml:mi>i</mml:mi><mml:mi>f</mml:mi><mml:mi>i</mml:mi><mml:mi>c</mml:mi><mml:mi>i</mml:mi><mml:mi>t</mml:mi><mml:mi>y</mml:mi></mml:math></inline-formula>. Here, the weight parameter <inline-formula><mml:math id="ieqn2"><mml:mi>u</mml:mi></mml:math></inline-formula> is bounded between 0 and 1, and higher (resp., lower) values of <inline-formula><mml:math id="ieqn3"><mml:mi>u</mml:mi></mml:math></inline-formula> indicate a preference towards thresholds that emphasize sensitivity (resp., specificity). This approach has also been used in previous research [<xref ref-type="bibr" rid="ref26">26</xref>] . In CVD management, both sensitivity and specificity play important roles in correctly identifying patients who are at risk of MI or stroke, along with those who are at low risk of these outcomes. Because correctly identifying those at high risk of MI or stroke is critical to initiating clinical interventions, we specifically use a value of <inline-formula><mml:math id="ieqn4"><mml:mi>u</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>2</mml:mn></mml:mrow><mml:mrow><mml:mn>3</mml:mn></mml:mrow></mml:mfrac></mml:math></inline-formula> in our analysis &#x2013; leaning slightly toward higher sensitivity over specificity. 
This approach ensures that we assess a balanced performance in identifying both patients at risk and with low risk while accounting for clinical priorities.</p><p>To evaluate the predictive fairness of the machine learning models, we consider a fairness metric, the relative performance parity score (RPPS)<italic>,</italic> which is calculated by</p><disp-formula id="E1"><mml:math id="eqn1"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mi>R</mml:mi><mml:mi>P</mml:mi><mml:mi>P</mml:mi><mml:mi>S</mml:mi><mml:mo>:=</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mi>m</mml:mi><mml:mi>a</mml:mi><mml:mi>x</mml:mi><mml:munder><mml:mrow/><mml:munder><mml:mrow/><mml:mi>s</mml:mi></mml:munder></mml:munder><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:mfrac><mml:mrow><mml:mi>A</mml:mi><mml:mi>U</mml:mi><mml:msub><mml:mi>C</mml:mi><mml:mi>s</mml:mi></mml:msub><mml:mo>&#x2212;</mml:mo><mml:mi>A</mml:mi><mml:mi>U</mml:mi><mml:mi>C</mml:mi></mml:mrow><mml:mrow><mml:mi>A</mml:mi><mml:mi>U</mml:mi><mml:mi>C</mml:mi></mml:mrow></mml:mfrac><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mstyle></mml:math></disp-formula><p>where <inline-formula><mml:math id="ieqn5"><mml:mi>s</mml:mi></mml:math></inline-formula> is a subpopulation in a protected attribute, eg, female in gender, and <inline-formula><mml:math id="ieqn6"><mml:mi>A</mml:mi><mml:mi>U</mml:mi><mml:msub><mml:mrow><mml:mi>C</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> represents the conditional AUC conditioned on this subpopulation <inline-formula><mml:math id="ieqn7"><mml:mi>s</mml:mi></mml:math></inline-formula>. Notably, RPPS can be small when any subpopulation has disproportionally high or low AUC, compared with the overall AUC. 
On the other hand, if all subpopulations have AUC performance close to the overall AUC, then the RPPS will be large. We chose AUC as the primary metric because it is threshold-agnostic, providing a more comprehensive measure of model performance across different decision thresholds. Since this metric is a relative measure specific to each model&#x2019;s output, it is suitable for a fair comparison across models, which befits our purpose in the model assessment module of the proposed responsible model selection framework. Moreover, the RPPS is flexible and can accommodate other commonly used performance metrics, such as accuracy or sensitivity, depending on the specific goals of the fairness evaluation.</p></sec><sec id="s2-5-2"><title>Model Selection</title><p>After evaluating the models, selection could be based on either predictive performance or fairness. Ideally, one would choose a model that excels in both dimensions. However, this selection process becomes challenging when no such model exists within the considered model options. To address this, we propose a sensitivity analysis-based approach to enhance model selection. 
Specifically, we evaluated the weighted sum of accuracy <italic>and RPPS</italic>, with the weight ranging from 0 to 1, that is, <inline-formula><mml:math id="ieqn8"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mi>w</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mi>A</mml:mi><mml:mi>c</mml:mi><mml:mi>c</mml:mi><mml:mi>u</mml:mi><mml:mi>r</mml:mi><mml:mi>a</mml:mi><mml:mi>c</mml:mi><mml:mi>y</mml:mi></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>+</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mi>w</mml:mi></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mi>R</mml:mi><mml:mi>P</mml:mi><mml:mi>P</mml:mi><mml:mi>S</mml:mi></mml:mrow></mml:mstyle></mml:math></inline-formula>,</p><p>where <inline-formula><mml:math id="ieqn9"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mi>w</mml:mi></mml:mrow></mml:mstyle></mml:math></inline-formula> is the prespecified weight between 0 and 1. Notably, when the weight is 1, this weighted sum simplifies to accuracy; conversely, when the weight is 0, it becomes the RPPS. This approach enables model selection according to the trade-off between predictive performance and fairness.</p></sec><sec id="s2-5-3"><title>Model Explanation</title><p>To investigate the relationship between the predictors and outcomes, we developed a synergistic model explanation approach that combines models&#x2019; permutation variable importance, the SHAP method, and partial dependence plots. Notably, this explanation module does not establish clear causal relationships between features and adverse health outcomes; however, it helps clarify how the ML algorithms function for decision-making. Permutation variable importance assesses model explainability through feature significance [<xref ref-type="bibr" rid="ref27">27</xref>]. That is, it measures the impact of each feature on a model&#x2019;s predictive performance by shuffling the values of a feature while keeping others constant. 
We can then determine that feature&#x2019;s importance based on the resulting performance decline, as measured by AUC. To enhance the reliability of our estimates, we bootstrapped 100 iterations: in each, we sampled the training data, trained a model, computed a baseline AUC, and determined permutation importance scores. This yielded multiple score sets for each bootstrapped sample. We then averaged the feature importance and provided 95% CIs. Features were ranked by mean importance and variability. A high mean importance means the model heavily depends on that feature, whereas high variability indicates inconsistent significance. Therefore, features with high variability warrant further examination, while those with high importance and low variability are consistently crucial.</p><p>Another component in our model explanation is the SHAP method. Essentially, SHAP assigns an importance measure, known as the Shapley value, to each feature. This Shapley value is calculated by averaging the differences in the model&#x2019;s predictions with and without the feature across all possible subsets of features, which can be viewed as the expected effects of the feature on the prediction. Importantly, the sum of all Shapley values equals the prediction value to ensure consistency across all features. The Shapley value effectively captures the average marginal contribution of each feature, providing a comprehensive explanation of the model&#x2019;s behavior. For interpretability, we proposed to use the relative Shapley value (the marginal contribution of each feature relative to the prediction value) in our analysis.</p><p>To further visualize and understand the relationship between features and predicted outcomes, we also consider Partial Dependence plots as another explanation method. Partial Dependence plots are widely used and show how changing a feature value affects model outputs, by fixing all other features [<xref ref-type="bibr" rid="ref28">28</xref>]. 
This explanation method is chosen over the SHAP&#x2019;s built-in dependence plot function because Partial Dependence plots tend to be more intuitive in clinical settings [<xref ref-type="bibr" rid="ref29">29</xref>]. Specifically, when creating a partial dependence plot, we replace the value of a feature with values in its range to compute average model outputs on this feature&#x2019;s range across data samples. In our analysis, in addition to risk predictions, we used log odds as model outcomes to provide a better interpretation of features and outcomes. In essence, a positive log-odds value indicates a higher likelihood of the event occurring, a negative value indicates a lower likelihood, and a value of zero represents a 50% probability. We made the necessary adjustments for categorical and continuous features and included data distribution information to bolster the reliability of our analysis. This approach enables us to pinpoint areas where the model&#x2019;s output is robustly supported by data, as well as identify regions where predictions may be less dependable due to data scarcity.</p><p>We stress that both permutation variable importance and the SHAP method can provide broader insights into the overall effects of specific features on model outputs. Partial dependence plots, on the other hand, provide a detailed visualization to uncover the direct relationship between each feature and the predicted outcome. By integrating permutation variable importance, the SHAP method, and partial dependence plots, we deliver a holistic model explanation for users. This synergy enhances interpretability and trust in the model&#x2019;s predictions, making the analysis more actionable in clinical settings.</p></sec></sec><sec id="s2-6"><title>Ethical Consideration</title><p>This study was approved by the Institutional Review Board at the Georgia Institute of Technology under Protocol No. H22333. All participants in the ACCORD trial provided written informed consent. 
The ACCORD contained de-identified data only. Participants in the ACCORD trial were not paid for their participation.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Study Sample</title><p>Our study data included 9635 participants, with 616 (6% of the total 10,251) excluded due to missing data on predictor variables (<xref ref-type="table" rid="table1">Table 1</xref>). The mean (SD) age was 62.8 (6.7) years. Women made up 3,662 (38%) of the sample. The racial and ethnic make-up of our study data included 1834 (19%) non-Hispanic Black participants, 678 (7%) Hispanic or Latino participants, and 7123 (74%) non-Hispanic White participants. In addition, 3437 (36%) of participants had a history of CVD. The mean (SD) body mass index was 32.2 (5.4) kg/m<sup>2</sup>, systolic blood pressure was 136.5 (17.1) mm Hg, and diastolic blood pressure was 74.9 (10.7) mm Hg. Of the sample, 880 (9.1%) experienced an MI, and 197 (2%) had a stroke during the follow-up period.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Summary of study sample characteristics.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Variable</td><td align="left" valign="bottom">Value (n=9635)</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="2">Demographics</td></tr><tr><td align="left" valign="top">Age, years, mean (SD)</td><td align="left" valign="top">62.8 (6.66)</td></tr><tr><td align="left" valign="top">Aged 75 years or older, n (%)</td><td align="left" valign="top">521 (5.4)</td></tr><tr><td align="left" valign="top">Gender, n (%)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top">&#x2003;Women</td><td align="left" valign="top">3662 (38)</td></tr><tr><td align="left" valign="top">&#x2003;Men</td><td align="left" valign="top">5973 (62)</td></tr><tr><td align="left" valign="top">Race and ethnicity, n (%)</td><td align="left" 
valign="top"/></tr><tr><td align="left" valign="top">&#x2003;Non-Hispanic Black</td><td align="left" valign="top">1834 (19)</td></tr><tr><td align="left" valign="top">&#x2003;Hispanic or Latino</td><td align="left" valign="top">678 (7)</td></tr><tr><td align="left" valign="top">&#x2003;Non-Hispanic White</td><td align="left" valign="top">7123 (74)</td></tr><tr><td align="left" valign="top">Tobacco usage, current, n (%)</td><td align="left" valign="top">1179 (12)</td></tr><tr><td align="left" valign="top">BMI, kg/m<sup>2</sup>, mean (SD)</td><td align="left" valign="top">32.2 (5.4)</td></tr><tr><td align="left" valign="top">Blood pressure, mean (SD)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top">&#x2003;Systolic, mm Hg</td><td align="left" valign="top">136.5 (17.1)</td></tr><tr><td align="left" valign="top">&#x2003;Diastolic, mm Hg</td><td align="left" valign="top">74.9 (10.7)</td></tr><tr><td align="left" valign="top">Heart rate, bpm, mean (SD)</td><td align="left" valign="top">72.7 (11.8)</td></tr><tr><td align="left" valign="top">History of CVD<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup>, n (%)</td><td align="left" valign="top">3437 (36)</td></tr><tr><td align="left" valign="top">Drug usage, n (%)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top">&#x2003;Blood pressure-lowering drugs</td><td align="left" valign="top">8109 (94)</td></tr><tr><td align="left" valign="top">&#x2003;Oral diabetes drugs (including metformin)</td><td align="left" valign="top">8024 (83)</td></tr><tr><td align="left" valign="top">&#x2003;Insulin treatment</td><td align="left" valign="top">3403 (35)</td></tr><tr><td align="left" valign="top">&#x2003;Statins</td><td align="left" valign="top">6148 (64)</td></tr><tr><td align="left" valign="top">&#x2003;Fibrates</td><td align="left" valign="top">601 (6)</td></tr><tr><td align="left" valign="top">&#x2003;Anticoagulant use</td><td align="left" valign="top">303 
(3)</td></tr><tr><td align="left" valign="top">&#x2003;Nonsteroidal anti-inflammatory use</td><td align="left" valign="top">851 (9)</td></tr><tr><td align="left" valign="top">&#x2003;Platelet aggregate inhibitor use</td><td align="left" valign="top">466 (5)</td></tr><tr><td align="left" valign="top">&#x2003;Daily aspirin use</td><td align="left" valign="top">5274 (55)</td></tr><tr><td align="left" valign="top">Biomarkers, mean (SD)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top">&#x2003;HbA<sub>1c</sub><sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup>, %</td><td align="left" valign="top">8.3 (1.1)</td></tr><tr><td align="left" valign="top">&#x2003;HbA<sub>1c</sub>, mmol/mol</td><td align="left" valign="top">67 (9)</td></tr><tr><td align="left" valign="top">&#x2003;Total cholesterol, mg/dL</td><td align="left" valign="top">183.2 (41.7)</td></tr><tr><td align="left" valign="top">&#x2003;HDL cholesterol, mg/dL</td><td align="left" valign="top">41.8 (11.6)</td></tr><tr><td align="left" valign="top">&#x2003;LDL cholesterol, mg/dL</td><td align="left" valign="top">104.7 (33.8)</td></tr><tr><td align="left" valign="top">&#x2003;Triglycerides, mg/dL</td><td align="left" valign="top">190.7 (145.8)</td></tr><tr><td align="left" valign="top">&#x2003;Fasting plasma glucose, mg/dL</td><td align="left" valign="top">175.3 (55.8)</td></tr><tr><td align="left" valign="top">&#x2003;Alanine aminotransferase, IU/L</td><td align="left" valign="top">27.5 (16.0)</td></tr><tr><td align="left" valign="top">&#x2003;Creatine phosphokinase, IU/L</td><td align="left" valign="top">140.3 (130.2)</td></tr><tr><td align="left" valign="top">&#x2003;Serum potassium, mmol/L</td><td align="left" valign="top">4.5 (0.5)</td></tr><tr><td align="left" valign="top">&#x2003;Serum creatinine, mg/dL</td><td align="left" valign="top">0.9 (0.2)</td></tr><tr><td align="left" valign="top">&#x2003;Estimated glomerular filtration rate, mL/min/1.73 m<sup>2</sup></td><td 
align="left" valign="top">90.9 (27.3)</td></tr><tr><td align="left" valign="top">&#x2003;Urine albumin, mg/dL</td><td align="left" valign="top">10.7 (37.3)</td></tr><tr><td align="left" valign="top">&#x2003;Urine creatinine, mg/dL</td><td align="left" valign="top">127.3 (65.4)</td></tr><tr><td align="left" valign="top">&#x2003;Urine albumin: creatinine ratio, mg/g</td><td align="left" valign="top">99.2 (359.4)</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>CVD: cardiovascular disease.</p></fn><fn id="table1fn2"><p><sup>b</sup>HbA<sub>1c</sub>: Hemoglobin A1c.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-2"><title>Predictive Performance</title><p>The predictive accuracy of our machine learning models is summarized in <xref ref-type="table" rid="table2">Table 2</xref>.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Summary of predictive performance for machine learning models predicting risk of myocardial infarction and stroke, optimizing thresholds for <inline-formula><mml:math id="ieqn10"><mml:mfrac><mml:mrow><mml:mn>2</mml:mn></mml:mrow><mml:mrow><mml:mn>3</mml:mn></mml:mrow></mml:mfrac><mml:mo>&#x00D7;</mml:mo><mml:mi mathvariant="normal">S</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">n</mml:mi><mml:mi mathvariant="normal">s</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">t</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">v</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">t</mml:mi><mml:mi mathvariant="normal">y</mml:mi><mml:mo>+</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mn>3</mml:mn></mml:mrow></mml:mfrac><mml:mo>&#x00D7;</mml:mo><mml:mi mathvariant="normal">S</mml:mi><mml:mi mathvariant="normal">p</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">c</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi 
mathvariant="normal">f</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">c</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">t</mml:mi><mml:mi mathvariant="normal">y</mml:mi></mml:math></inline-formula>.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top">Machine learning model</td><td align="left" valign="top">AUC</td><td align="left" valign="top">Sensitivity</td><td align="left" valign="top">Specificity</td><td align="left" valign="top">Accuracy</td></tr></thead><tbody><tr><td align="left" valign="top">MI<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>OCT</td><td align="left" valign="top">0.687</td><td align="left" valign="top">0.782<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" valign="top">0.554</td><td align="left" valign="top">0.565</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Na&#x00EF;ve Bayes</td><td align="left" valign="top">0.694</td><td align="left" valign="top">0.563</td><td align="left" valign="top">0.768<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" valign="top">0.420</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Random forest</td><td align="left" valign="top">0.716<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" valign="top">0.746</td><td align="left" valign="top">0.640</td><td align="left" valign="top">0.645</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>SVM</td><td align="left" valign="top">0.581</td><td align="left" valign="top">0.711</td><td align="left" valign="top">0.436</td><td align="left" valign="top">0.450</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>XGBoost</td><td align="left" valign="top">0.695</td><td align="left" valign="top">0.782</td><td align="left" valign="top">0.554</td><td align="left" valign="top">0.565</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>GLMnet</td><td align="left" valign="top">0.704</td><td align="left" valign="top">0.629</td><td align="left" valign="top">0.666</td><td align="left" valign="top">0.664<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>OFS</td><td align="left" valign="top">0.705</td><td align="left" valign="top">0.671</td><td align="left" valign="top">0.642</td><td align="left" valign="top">0.644</td></tr><tr><td align="left" valign="top">Stroke</td><td align="left" valign="top" colspan="4"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>OCT</td><td align="left" valign="top">0.625</td><td align="left" valign="top">0.771</td><td align="left" valign="top">0.546</td><td align="left" valign="top">0.550</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Na&#x00EF;ve Bayes</td><td align="left" valign="top">0.703</td><td align="left" valign="top">0.694</td><td align="left" valign="top">0.708</td><td align="left" valign="top">0.708</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Random forest</td><td align="left" valign="top">0.716</td><td align="left" valign="top">0.510</td><td align="left" valign="top">0.839<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" valign="top">0.833<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>SVM</td><td align="left" valign="top">0.624</td><td align="left" valign="top">0.816<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" valign="top">0.426</td><td align="left" valign="top">0.568</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>XGBoost</td><td align="left" valign="top">0.714</td><td align="left" valign="top">0.653</td><td align="left" valign="top">0.735</td><td align="left" valign="top">0.734</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>GLMnet</td><td align="left" valign="top">0.700</td><td align="left" valign="top">0.625</td><td align="left" valign="top">0.715</td><td align="left" valign="top">0.714</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>OFS</td><td align="left" valign="top">0.731<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" valign="top">0.646</td><td align="left" valign="top">0.716</td><td align="left" valign="top">0.715</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>MI: myocardial infarction.</p></fn><fn id="table2fn2"><p><sup>b</sup>The best-performing model for each performance metric.</p></fn></table-wrap-foot></table-wrap><p>For classification of MI, the OCT model achieved the highest sensitivity at 0.782 with a 
specificity of 0.554 and AUC of 0.687. Random forest achieved the highest AUC at 0.716, with a sensitivity of 0.746, moderate specificity (0.640), and accuracy (0.645). XGBoost shows similar characteristics, with an AUC of 0.695, sensitivity matching OCT at 0.782, but lower specificity (0.554) and accuracy (0.565). GLMnet, despite its lower sensitivity (0.629), has a high AUC of 0.704 and the best specificity (0.666) and accuracy (0.664) among the models. Na&#x00EF;ve Bayes, while exhibiting lower sensitivity (0.563), has the highest specificity (0.768), though an accuracy of 0.420.</p><p>For stroke classification, our results indicate that na&#x00EF;ve Bayes provides the most balanced results across AUC (0.703), sensitivity (0.694), specificity (0.708), and accuracy (0.708). On the other hand, the Random forest model, despite having a lower sensitivity, achieves a high AUC (0.716) and the highest specificity (0.839), along with an accuracy of 0.833 &#x2013; indicating its strong performance in correctly identifying non-stroke cases at the expense of missing some stroke cases. Similarly, XGBoost performs exceptionally well on AUC (0.714), accuracy (0.734), and specificity (0.735), with a low sensitivity (0.653). In contrast, the OCT model has the second-lowest AUC (0.625) and accuracy (0.550), indicating it may be the least effective model for stroke prediction. This lower performance suggests that OCT might not be suitable for accurate stroke prediction compared with the other models considered.</p></sec><sec id="s3-3"><title>Fairness</title><p>In <xref ref-type="table" rid="table3">Table 3</xref>, we present AUC stratified by gender and race across all of the developed interpretable machine learning models for MI and stroke classification, along with the RPPS scores for gender and race. 
The conditional AUC for each subgroup is calculated from the classification results in the test set.
valign="top">0.605</td><td align="left" valign="top">0.591</td><td align="left" valign="top">0.959</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Random forest</td><td align="left" valign="top">0.673</td><td align="left" valign="top">0.760</td><td align="left" valign="top">0.939</td><td align="left" valign="top">0.770</td><td align="left" valign="top">0.704</td><td align="left" valign="top">0.925</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>XGBoost</td><td align="left" valign="top">0.655</td><td align="left" valign="top">0.753</td><td align="left" valign="top">0.917</td><td align="left" valign="top">0.720</td><td align="left" valign="top">0.690</td><td align="left" valign="top">0.964</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>GLMnet</td><td align="left" valign="top">0.689</td><td align="left" valign="top">0.702</td><td align="left" valign="top">0.979<sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup></td><td align="left" valign="top">0.681</td><td align="left" valign="top">0.706</td><td align="left" valign="top">0.967<sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>OCT</td><td align="left" valign="top">0.642</td><td align="left" valign="top">0.763</td><td align="left" valign="top">0.889</td><td align="left" valign="top">0.710</td><td align="left" valign="top">0.682</td><td align="left" valign="top">0.967<sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>OFS</td><td align="left" valign="top">0.688</td><td align="left" 
valign="top">0.716</td><td align="left" valign="top">0.976</td><td align="left" valign="top">0.668</td><td align="left" valign="top">0.708</td><td align="left" valign="top">0.948</td></tr><tr><td align="left" valign="top">Stroke</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Na&#x00EF;ve Bayes</td><td align="left" valign="top">0.679</td><td align="left" valign="top">0.662</td><td align="left" valign="top">0.942</td><td align="left" valign="top">0.733</td><td align="left" valign="top">0.629</td><td align="left" valign="top">0.895</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>SVM</td><td align="left" valign="top">0.622</td><td align="left" valign="top">0.615</td><td align="left" valign="top">0.986<sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup></td><td align="left" valign="top">0.600</td><td align="left" valign="top">0.609</td><td align="left" valign="top">0.962</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Random forest</td><td align="left" valign="top">0.783</td><td align="left" valign="top">0.673</td><td align="left" valign="top">0.906</td><td align="left" valign="top">0.741</td><td align="left" valign="top">0.696</td><td align="left" valign="top">0.965</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>XGBoost</td><td align="left" valign="top">0.773</td><td align="left" valign="top">0.645</td><td align="left" valign="top">0.903</td><td align="left" valign="top">0.718</td><td align="left" valign="top">0.702</td><td align="left" 
valign="top">0.983<sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>GLMnet</td><td align="left" valign="top">0.714</td><td align="left" valign="top">0.727</td><td align="left" valign="top">0.961</td><td align="left" valign="top">0.685</td><td align="left" valign="top">0.698</td><td align="left" valign="top">0.979</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>OCT</td><td align="left" valign="top">0.595</td><td align="left" valign="top">0.670</td><td align="left" valign="top">0.928</td><td align="left" valign="top">0.701</td><td align="left" valign="top">0.582</td><td align="left" valign="top">0.878</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>OFS</td><td align="left" valign="top">0.687</td><td align="left" valign="top">0.787</td><td align="left" valign="top">0.923</td><td align="left" valign="top">0.692</td><td align="left" valign="top">0.743</td><td align="left" valign="top">0.947</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>AUC: area under the curve.</p></fn><fn id="table3fn2"><p><sup>b</sup>RPPS: Relative Parity of Performance Scores.</p></fn><fn id="table3fn3"><p><sup>c</sup>MI: myocardial infarction.</p></fn><fn id="table3fn4"><p><sup>d</sup>The best-performing model for each performance metric.</p></fn></table-wrap-foot></table-wrap><p>For MI classification, besides the SVM, which has the lowest RPPS (0.838), the random forest model shows a substantial disparity in AUC between men (0.673) and women (0.760), resulting in a lower RPPS (0.939), which indicates higher gender disparities. 
Similarly, XGBoost exhibits a high AUC for women (0.753) compared with men (0.655), leading to an RPPS of 0.917, further highlighting the model&#x2019;s performance gaps across genders. On the other hand, the GLMnet and OCT model demonstrate the highest RPPS (0.979), suggesting minimal performance disparities between Black (0.681) and White (0.706) subgroups for GLMnet, and Black (0.710) and White (0.682) subgroups for OCT. In contrast, the random forest model shows greater differences in performance by race with an AUC of 0.770 for non-Hispanic Black people and 0.704 for non-Hispanic White people, resulting in a lower RPPS (0.925). Importantly, for most models, the RPPS scores indicate fairer results among race groups, compared with gender, for predicting MI events.</p><p>For stroke classification, the XGBoost model exhibits a significant difference in AUC between men (0.773) and women (0.645), resulting in a lower RPPS (0.903), highlighting pronounced differences in predictive accuracy by gender. Similarly, the random forest model, with a high AUC for men (0.783) compared with women (0.673), leads to an RPPS of 0.906. Conversely, the XGBoost model shows a relatively high RPPS (0.983), implying minimal performance differences between non-Hispanic Black (0.718) and non-Hispanic White (0.702) people. However, the na&#x00EF;ve Bayes model demonstrates greater differences by race with an AUC of 0.733 for non-Hispanic Black people and 0.629 for non-Hispanic White people, resulting in a lower RPPS (0.895). Importantly, in contrast to our findings in MI classification tasks, the RPPS scores reveal better performance parity among gender groups compared with racial groups in classifying stroke events.</p><p>We also analyzed the coefficients of the one-hot encoded gender and race variable for GLMnet. 
For the variable &#x201C;female=1&#x201D;, coefficient values were &#x2212;0.324 in the model for MI and &#x2212;0.250 in the model for stroke, suggesting that women are associated with a lower predicted risk of MI and stroke compared with men. For race, the variable &#x201C;black=1&#x201D; had a coefficient value of &#x2212;0.192 in the model for MI, which indicates a lower predicted risk of MI for non-Hispanic Black people compared with non-Hispanic White people.
At lower weights (0 to 0.2), the OCT model showed the highest values due to its strong Gender RPPS. As the weight increases, the GLMnet model starts to dominate from the weight is 0.25 to 0.6, maintaining a good balance between accuracy and Gender RPPS. Moving to higher weights (0.65 onwards), the random forest model outperformed others, with its accuracy dominating the performance at around 0.788. These results indicated that, for stroke classification, when users prioritize model fairness, the OCT model is the best. For users who are neutral regarding the tradeoff between fairness and accuracy, GLMnet emerges as a suitable option. Lastly, random forest is recommended for users with a strong preference for model accuracy. Accordingly, we assumed users want to balance accuracy and fairness and proceed to analyze the relationship between variables and outcomes using GLMnet for both MI and stroke classification.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Weighted sum of model accuracy and gender Relative Parity of Performance Scores for myocardial infarction and stroke. MI: myocardial infarction; RPPS: Relative Parity of Performance Scores.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e66200_fig03.png"/></fig></sec><sec id="s3-5"><title>Model Explanation</title><p>We analyzed the relationship between features and outcomes for predicting MI and stroke using GLMnet, with our synergistic model explanation approach that integrates the permutation variable importance, the SHAP method, and the partial dependence plots. <xref ref-type="fig" rid="figure4">Figures 4</xref> and <xref ref-type="fig" rid="figure5">5</xref> display the permutation variable importance measures, along with their 95% CIs, and the SHAP method for the GLMnet model in predicting MI and stroke, respectively. 
In addition, partial dependence plots for GLMnet for MI and stroke are shown in <xref ref-type="fig" rid="figure6">Figures 6</xref> and <xref ref-type="fig" rid="figure7">7</xref>, respectively.</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Permutation variable importance and Shapley Additive Explanations method for GLMnet on myocardial infarction classification. AUC: area under the curve; MI: myocardial infarction; SHAP: Shapley Additive Explanations.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e66200_fig04.png"/></fig><fig position="float" id="figure5"><label>Figure 5.</label><caption><p>Permutation variable importance and Shapley Additive Explanations method for GLMnet on stroke classification. CVD: cardiovascular disease; HbA<sub>1c</sub>: hemoglobin A1c.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e66200_fig05.png"/></fig><fig position="float" id="figure6"><label>Figure 6.</label><caption><p>Partial dependence plots for GLMnet on myocardial infarction classification. AUC: area under the curve; SHAP: Shapley Additive Explanations.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e66200_fig06.png"/></fig><fig position="float" id="figure7"><label>Figure 7.</label><caption><p>Partial dependence plots for GLMnet on stroke classification.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e66200_fig07.png"/></fig><p>From <xref ref-type="fig" rid="figure4">Figure 4</xref>, we observe that the permutation variable importance technique highlights the history of CVD (cvd_hx_baseline) as the most critical predictor for GLMnet to accurately predict MI risks. 
Notably, its mean importance value (0.078) is nearly twelve times greater than that of the second most important variable, age (baseline_age), which has a mean importance value of 0.006. Moreover, while the 95% CI of the permutation importance for CVD history is (0.045, 0.107), which is among the widest in the feature space, its lower confidence interval is still higher than the upper confidence interval of any other feature. These results imply the significant role of people&#x2019;s CVD history in making accurate classifications using GLMnet. In addition, BMI and insulin treatment (insulinrx) also rank highly in this importance metric, with mean importance values of 0.002 and 0.0013, respectively. Some variables show negative values in permutation importance, which could be related to overfitting. Since the negative values are small in magnitude in our case, it could be that the model has relied too heavily on noise rather than true signal for those predictors. Now, from the SHAP method, we observed that the history of CVD and age emerge as the 2 most influential features in driving GLMnet&#x2019;s MI classification, as indicated by their average relative Shapley values of 0.45 and 0.14, respectively. This suggests that these 2 features have a general tendency to positively contribute to the prediction value. However, the individual relative Shapley values for the features can widely range from negative to positive, reflecting the varying marginal contributions of the features across individuals. For example, it appears that the distribution of the relative Shapley values for CVD history is a mixture of 2 highly separated distributions, which again implies that the GLMnet is substantially sensitive to CVD history. Furthermore, our findings indicate that although insulin treatment is highlighted as one of the top features in permutation variable importance, its mean relative Shapley value is relatively low compared with other features. 
Conversely, HbA<sub>1c</sub> ranks highly using the SHAP method. Since both insulin treatment and HbA<sub>1c</sub> are indicative of an individual&#x2019;s diabetic status, this suggests that accurately predicting MI also heavily depends on the diabetes status of the individual. Consequently, we next analyze the partial dependence plots for CVD history, age, and HbA<sub>1c</sub> to draw actionable insights.</p><p>The partial dependence plots in <xref ref-type="fig" rid="figure6">Figure 6</xref> illustrate that individuals with a history of CVD can have log-odds of 0.87 (risk of 0.7), while those without CVD can have log-odds as low as &#x2212;1.94 (risk of 0.13). In addition, the plots show that 44-year-old individuals in the test set have log-odds of &#x2212;0.44 (risk of 0.4) for developing MI. As age increases to 79 years, the risk gradually rises to log-odds of &#x2212;0.33 (risk of 0.42). Furthermore, the plots reveal that HbA<sub>1c</sub> levels significantly impact risk: with HbA<sub>1c</sub> as low as 4.6%, the log-odds are &#x2212;0.94 (risk of 0.3), but as HbA<sub>1c</sub> rises to 12.7%, the log-odds increase drastically to 0.55 (risk of 0.61). These results highlight the importance of considering both CVD history and key biomarkers such as HbA<sub>1c</sub> in assessing MI risk. Notably, although partial dependence plots do not necessarily reveal causation between features and risks, the steep increase in risk associated with higher HbA<sub>1c</sub> levels underscores the critical role of diabetes management in preventing MI.</p><p>Next, we conduct a model explanation analysis for GLMnet in predicting stroke. As shown in <xref ref-type="fig" rid="figure5">Figure 5</xref>, HbA<sub>1c</sub>, systolic blood pressure (SBP), and heart rate (HR) have the highest mean permutation importance values (0.08, 0.06, and 0.01, respectively). Other significant features include serum creatinine (screat), age, and BMI. 
These features exhibit wide confidence intervals for their permutation importance values, suggesting that their influence on the model can vary depending on the patient cohort. Moreover, HbA<sub>1c</sub> and SBP consistently show the highest positive marginal contributions to risk predictions, with average relative Shapley values of 0.28 and 0.26, respectively. Heart rate, with a relative Shapley value of 0.05, is also a significant predictor for stroke using GLMnet. Although there are instances where these features negatively contribute to risk prediction, their relative Shapley values are predominantly positive. This distribution indicates a general tendency for these features to positively drive the predicted values, highlighting their crucial role in the model&#x2019;s stroke risk predictions. Since these features are essential for accurately assessing stroke risk, we now derive actionable insights from the partial dependence plots (<xref ref-type="fig" rid="figure7">Figure 7</xref>).</p><p>The partial dependence plots in <xref ref-type="fig" rid="figure7">Figure 7</xref> provide visualizations of how HbA<sub>1c</sub>, systolic blood pressure, and heart rate influence stroke classification using the GLMnet model. We noticed that all 3 features exhibit a clear positive relationship between them and the predicted risk of stroke. For HbA<sub>1c</sub>, as levels increase from 5.5% to 13%, there is a noticeable rise in both log-odds and predicted risk, moving from around &#x2212;0.95 to 0.94 in log-odds and from 0.28 to 0.71 in predicted risk. For systolic blood pressure, the log-odds of developing stroke can reach 1.01 from &#x2212;1.05 and from 0.26 to 0.725 in predicted risk as systolic blood pressure increases from 88 mm Hg to 212 mm Hg. Notably, the rate of increase in stroke risk with rising systolic blood pressure is less steep compared with the rate observed with increasing HbA<sub>1c</sub> levels. 
Finally, the log-odds of developing stroke can reach 0.41 from &#x2212;0.61 and from 0.35 to 0.59 in predicted risk as heart rate increases from 39 bpm to 132 bpm. Comparatively, the rate of increase in risk with heart rate is the most moderate among these features. This indicates that managing HbA<sub>1c</sub> and systolic blood pressure could be more effective in preventing stroke. Overall, our model explanation analyses highlight the importance of managing blood glucose levels, blood pressure, and heart rate to mitigate the risk of stroke.</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings and Comparison With Previous Works</title><p>In this study, we designed a responsible framework that evaluates various machine learning models by comparing these models&#x2019; predictive accuracy and fairness metrics, while also providing model explanation. We then applied this framework for classification of MI and stroke, demonstrating its effectiveness in highlighting the importance of in-depth analyses of interpretable machine learning models across these 3 dimensions.</p><p>Our results demonstrate that complex models are not necessarily always better than simple, interpretable models&#x2014;especially for high-stakes decisions such as those encountered in medicine [<xref ref-type="bibr" rid="ref30">30</xref>]. Importantly, while the investigation of predictive accuracy has been a focus of several previous studies in health care AI, the concurrent evaluation of fairness in machine learning models is inconsistent and lacking [<xref ref-type="bibr" rid="ref12">12</xref>]. The fairness analysis of our predictive models reveals both strengths and areas of concern in terms of gender and racial bias. In MI prediction, while some models like the SVM and XGBoost demonstrate gender biases, others such as GLMnet and OFS models show more balanced performance across gender and racial groups. 
This indicates that the most accurate models are not necessarily the fairest, and vice versa. These trade-offs highlight the importance of carefully selecting and tuning models to balance accuracy and fairness in medical decision-making. For stroke prediction, while some models, that is, GLMnet and SVM, exhibit balanced performance for both gender and race, other models, that is, XGBoost, present bias for one gender. This finding highlights the complex nature of fairness in such models, where different types of biases may manifest depending on the outcome being predicted and the model used. These findings underscore the importance of continuously monitoring and evaluating models for fairness. Our study provides a comprehensive analysis of fairness across different models and conditions, paving the way for more equitable AI applications in health care.</p><p>While there are multiple important criteria for model selection, such as predictive performance and fairness, a unified approach to guide this selection process is lacking. To address this, our analysis presents a sensitivity analysis-based selection procedure based on users&#x2019; preferences over the selection criteria. Our findings suggest that in some prediction tasks, users&#x2019; preferences have minimal effect on the best-performing model (eg, GLMnet for MI prediction). However, in other tasks, for example, stroke prediction, users&#x2019; preferences can significantly influence the selected model. Our study highlights the necessity of considering user preferences in model selection to ensure optimal outcomes for different prediction tasks.</p><p>Finally, our study demonstrates the potential role that our integrated explanation method (ie, the combination of permutation variable importance, the SHAP method, and partial dependence plots) can play in enhancing clinicians&#x2019; understanding and trust of model-based predictions. 
For instance, the permutation variable importance measures, the Shapley values, and partial dependence plots provide a clear visual representation of how key features like HbA<sub>1c</sub> and systolic blood pressure influence the model&#x2019;s risk prediction for MI and stroke. Such visual explanations can provide actionable insights and be augmented with existing clinical knowledge to help validate the quality of model-generated risk estimates [<xref ref-type="bibr" rid="ref31">31</xref>]. These visualizations can also help clinical experts explain the unknown complex relationships between various risk factors and adverse outcomes [<xref ref-type="bibr" rid="ref32">32</xref>].</p><p>This study has some limitations. The proposed framework was tested solely on CVD classification using the ACCORD dataset. While it has shown promise in this context, its effectiveness in other disease areas needs further investigation. Moreover, we relied on baseline information collected at the start of the study for model development and did not fully account for how things can change over time in the real world. Looking ahead, we would like to make sure our models stay accurate and up to date as treatment strategies and clinical guidelines evolve. One way to do this is by applying our framework on more recent datasets or on data that captures changes over time. These steps could give us a better understanding of how changing clinical practices might affect prediction modeling.</p></sec><sec id="s4-2"><title>Conclusions</title><p>In this research, we proposed a 3-stage responsible framework for developing, selecting, and explaining machine learning models, emphasizing the trade-off between predictive accuracy and fairness in health care applications. By quantifying this trade-off using AUC and RPPS, we provided a structured approach to responsible model selection. 
After selecting the final model, we proposed an integrated explanation method to offer insights into the relationships between features and outcomes. Applying this framework to predict MI and stroke among people with T2D, we demonstrated its effectiveness and potential to improve the development and evaluation of machine learning models for clinical practice. We anticipate that our framework is generalizable and can be applied to other clinical prediction tasks, potentially increasing the trustworthiness and acceptance of machine learning models among clinicians and patients.</p><p>Our framework highlights the importance of combining interpretability, explainability, and fairness in building, selecting, and explaining machine learning models. This integration is crucial not only for enhancing model performance [<xref ref-type="bibr" rid="ref33">33</xref>], but also for addressing ethical and legal considerations [<xref ref-type="bibr" rid="ref34">34</xref>]. These principles both help verify model results against clinical literature [<xref ref-type="bibr" rid="ref35">35</xref>] and foster acceptance and trust among health care stakeholders [<xref ref-type="bibr" rid="ref36">36</xref>,<xref ref-type="bibr" rid="ref37">37</xref>]. By fully embracing these aspects, our framework paves the way for more responsible, ethical, and transparent AI applications in health care.</p></sec></sec></body><back><ack><p>This work was supported by the AIM-AHEAD Coordinating Center, funded by NIH. This manuscript was prepared using the ACCORD Clinical Research Materials obtained from the NHLBI Biologic Specimen and Data Repository Information Coordinating Center and does not necessarily reflect the opinions or views of the ACCORD or the NHLBI.</p><p>The authors would like to thank Dr. 
James Washington from the Morehouse School of Medicine for his insightful feedback on earlier versions of the manuscript.</p><p>This research was further supported by Georgia Institute of Technology, Texas A&#x0026;M University, Emory University, AIM-AHEAD Coordinating Center, award number OTA-21-017, and was partially funded by the National Institutes of Health Agreement No. 1OT2OD032581.</p><p>MBW and FJP received funding from the Georgia Center for Diabetes Translation Research under Award No. NIH/NIDDK P30DK111024. YY received funding from the Georgia Tech Presidential Undergraduate Research Award.</p></ack><notes><sec><title>Data Availability</title><p>The datasets analyzed during this study are available in the BioLINCC repository under Accession Number: HLB01041317a, ClinicalTrials.gov Identifier: NCT00000620</p></sec></notes><fn-group><fn fn-type="con"><p>GPG and EK conceptualized the research in this manuscript and supervised its execution. YY and CL led the data curation and formal analysis with input on data sources from FJP, MBW, and HS. GPG, EK, and HS led the funding acquisition. GPG directed the methodological approach with significant input from EK, CL, and HS. GPG, EK, and HS were responsible for project administration and resources. YY and CL developed and implemented the code used for model development and numerical validation. FJP, MBW, and HS provided significant input on the clinical interpretation of results. YY and CL generated all tables and figures in the manuscript. Manuscript writing was led by YY, CL, EK, and GPG with significant input from FJP, MBW, and HS. All authors reviewed the manuscript.</p></fn><fn fn-type="conflict"><p>FJP reports research support from Dexcom, Insulet, Novo Nordisk, Tandem and Ideal Medical Technologies and consulting for Dexcom and Medscape. 
All other authors have no competing interests to report.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">ACCORD</term><def><p>Action to Control Cardiovascular Risk in Diabetes study</p></def></def-item><def-item><term id="abb2">AI</term><def><p>Artificial Intelligence</p></def></def-item><def-item><term id="abb3">AUC</term><def><p>area under the curve</p></def></def-item><def-item><term id="abb4">CVD</term><def><p>cardiovascular disease</p></def></def-item><def-item><term id="abb5">HbA<sub>1c</sub></term><def><p>Hemoglobin A1c</p></def></def-item><def-item><term id="abb6">MI</term><def><p>myocardial infarction</p></def></def-item><def-item><term id="abb7">NHLBI</term><def><p>National Heart, Lung, and Blood Institute</p></def></def-item><def-item><term id="abb8">RPPS</term><def><p>Relative Parity of Performance Scores</p></def></def-item><def-item><term id="abb9">SHAP</term><def><p>Shapley Additive Explanations</p></def></def-item><def-item><term id="abb10">T2D</term><def><p>type 2 diabetes</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bertsimas</surname><given-names>D</given-names> </name><name name-style="western"><surname>Wiberg</surname><given-names>H</given-names> </name></person-group><article-title>Machine learning in Oncology: methods, applications, and challenges</article-title><source>JCO Clin Cancer Inform</source><year>2020</year><month>10</month><volume>4</volume><fpage>885</fpage><lpage>894</lpage><pub-id pub-id-type="doi">10.1200/CCI.20.00072</pub-id><pub-id pub-id-type="medline">33058693</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bravo</surname><given-names>F</given-names> </name><name 
name-style="western"><surname>Rudin</surname><given-names>C</given-names> </name><name name-style="western"><surname>Shaposhnik</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Interpretable prediction rules for congestion risk in intensive care units</article-title><source>Stochastic Systems</source><pub-id pub-id-type="doi">10.1287/stsy.2022.0018</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Holzinger</surname><given-names>A</given-names> </name><name name-style="western"><surname>Biemann</surname><given-names>C</given-names> </name><name name-style="western"><surname>Pattichis</surname><given-names>CS</given-names> </name><etal/></person-group><article-title>What do we need to build explainable AI systems for the medical domain?</article-title><source>arXiv</source><comment>Preprint posted online on  Dec 28, 2017</comment><pub-id pub-id-type="doi">10.48550/arXiv.1712.09923</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>SJ</given-names> </name><name name-style="western"><surname>Pandey</surname><given-names>HS</given-names> </name><name name-style="western"><surname>Garcia</surname><given-names>GGP</given-names> </name><etal/></person-group><article-title>Designing interpretable machine learning models using mixed integer programming</article-title><source>Encyclopedia of Optimization</source><year>2020</year><publisher-name>Springer International Publishing</publisher-name><fpage>1</fpage><lpage>8</lpage><pub-id pub-id-type="doi">10.1007/978-3-030-54621-2_867-1</pub-id><pub-id pub-id-type="other">978-3-030-54621-2</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Rudin</surname><given-names>C</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>C</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>Z</given-names> </name><etal/></person-group><article-title>Interpretable machine learning: fundamental principles and 10 grand challenges</article-title><source>Statist Surv</source><year>2022</year><month>01</month><volume>16</volume><issue>none</issue><fpage>1</fpage><lpage>85</lpage><pub-id pub-id-type="doi">10.1214/21-SS133</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="web"><article-title>Interpretable AI documentation</article-title><source>Interpretable AI L</source><year>2023</year><access-date>2025-05-20</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.interpretable.ai">https://www.interpretable.ai</ext-link></comment></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ustun</surname><given-names>B</given-names> </name><name name-style="western"><surname>Rudin</surname><given-names>C</given-names> </name></person-group><article-title>Learning optimized risk scores</article-title><source>J Mach Learn Res</source><year>2019</year><volume>20</volume><issue>150</issue><fpage>1</fpage><lpage>75</lpage></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Barredo Arrieta</surname><given-names>A</given-names> </name><name name-style="western"><surname>D&#x00ED;az-Rodr&#x00ED;guez</surname><given-names>N</given-names> </name><name name-style="western"><surname>Del Ser</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Explainable artificial intelligence (XAI): concepts, taxonomies, opportunities and challenges 
toward responsible AI</article-title><source>Information Fusion</source><year>2020</year><month>06</month><volume>58</volume><fpage>82</fpage><lpage>115</lpage><pub-id pub-id-type="doi">10.1016/j.inffus.2019.12.012</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Amann</surname><given-names>J</given-names> </name><name name-style="western"><surname>Blasimme</surname><given-names>A</given-names> </name><name name-style="western"><surname>Vayena</surname><given-names>E</given-names> </name><etal/></person-group><article-title>Explainability for artificial intelligence in healthcare: a multidisciplinary perspective</article-title><source>BMC Med Inform Decis Mak</source><year>2020</year><month>11</month><day>30</day><volume>20</volume><issue>1</issue><fpage>310</fpage><pub-id pub-id-type="doi">10.1186/s12911-020-01332-6</pub-id><pub-id pub-id-type="medline">33256715</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zihni</surname><given-names>E</given-names> </name><name name-style="western"><surname>Madai</surname><given-names>VI</given-names> </name><name name-style="western"><surname>Livne</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Opening the black box of artificial intelligence for clinical decision support: a study predicting stroke outcome</article-title><source>PLoS ONE</source><year>2020</year><volume>15</volume><issue>4</issue><fpage>e0231166</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0231166</pub-id><pub-id pub-id-type="medline">32251471</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mehrabi</surname><given-names>N</given-names> 
</name><name name-style="western"><surname>Morstatter</surname><given-names>F</given-names> </name><name name-style="western"><surname>Saxena</surname><given-names>N</given-names> </name><etal/></person-group><article-title>A survey on bias and fairness in machine learning</article-title><source>ACM Comput Surv</source><year>2022</year><month>07</month><day>31</day><volume>54</volume><issue>6</issue><fpage>1</fpage><lpage>35</lpage><pub-id pub-id-type="doi">10.1145/3457607</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Huang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Galal</surname><given-names>G</given-names> </name><name name-style="western"><surname>Etemadi</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Evaluation and mitigation of racial bias in clinical machine learning models: scoping review</article-title><source>JMIR Med Inform</source><year>2022</year><month>05</month><day>31</day><volume>10</volume><issue>5</issue><fpage>e36388</fpage><pub-id pub-id-type="doi">10.2196/36388</pub-id><pub-id pub-id-type="medline">35639450</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gianfrancesco</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Tamang</surname><given-names>S</given-names> </name><name name-style="western"><surname>Yazdany</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Potential biases in machine learning algorithms using electronic health record data</article-title><source>JAMA Intern Med</source><year>2018</year><month>11</month><day>1</day><volume>178</volume><issue>11</issue><fpage>1544</fpage><lpage>1547</lpage><pub-id 
pub-id-type="doi">10.1001/jamainternmed.2018.3763</pub-id><pub-id pub-id-type="medline">30128552</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>McCradden</surname><given-names>MD</given-names> </name><name name-style="western"><surname>Joshi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Mazwi</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Ethical limitations of algorithmic fairness solutions in health care machine learning</article-title><source>Lancet Digit Health</source><year>2020</year><month>05</month><volume>2</volume><issue>5</issue><fpage>e221</fpage><lpage>e223</lpage><pub-id pub-id-type="doi">10.1016/S2589-7500(20)30065-0</pub-id><pub-id pub-id-type="medline">33328054</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gerstein</surname><given-names>HC</given-names> </name><name name-style="western"><surname>Riddle</surname><given-names>MC</given-names> </name><name name-style="western"><surname>Kendall</surname><given-names>DM</given-names> </name><etal/></person-group><article-title>Glycemia treatment strategies in the action to control cardiovascular risk in diabetes (ACCORD) trial</article-title><source>Am J Cardiol</source><year>2007</year><month>06</month><day>18</day><volume>99</volume><issue>12A</issue><fpage>34i</fpage><lpage>43i</lpage><pub-id pub-id-type="doi">10.1016/j.amjcard.2007.03.004</pub-id><pub-id pub-id-type="medline">17599423</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Obermeyer</surname><given-names>Z</given-names> </name><name 
name-style="western"><surname>Powers</surname><given-names>B</given-names> </name><name name-style="western"><surname>Vogeli</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Dissecting racial bias in an algorithm used to manage the health of populations</article-title><source>Science</source><year>2019</year><month>10</month><day>25</day><volume>366</volume><issue>6464</issue><fpage>447</fpage><lpage>453</lpage><pub-id pub-id-type="doi">10.1126/science.aax2342</pub-id><pub-id pub-id-type="medline">31649194</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>I</given-names> </name><name name-style="western"><surname>Johansson</surname><given-names>FD</given-names> </name><name name-style="western"><surname>Sontag</surname><given-names>D</given-names> </name></person-group><article-title>Why is my classifier discriminatory?</article-title><year>2018</year><access-date>2025-06-13</access-date><conf-name>Advances in Neural Information Processing Systems 31 (NeurIPS 2018)</conf-name><conf-loc>Montreal, Canada</conf-loc><comment><ext-link ext-link-type="uri" xlink:href="https://papers.nips.cc/paper_files/paper/2018/hash/1f1baa5b8edac74eb4eaa329f14a0361-Abstract.html">https://papers.nips.cc/paper_files/paper/2018/hash/1f1baa5b8edac74eb4eaa329f14a0361-Abstract.html</ext-link></comment></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shameer</surname><given-names>K</given-names> </name><name name-style="western"><surname>Johnson</surname><given-names>KW</given-names> </name><name name-style="western"><surname>Glicksberg</surname><given-names>BS</given-names> </name><etal/></person-group><article-title>Machine learning in cardiovascular medicine: are we there 
yet?</article-title><source>Heart</source><year>2018</year><month>07</month><volume>104</volume><issue>14</issue><fpage>1156</fpage><lpage>1164</lpage><pub-id pub-id-type="doi">10.1136/heartjnl-2017-311198</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Feng</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Griffin</surname><given-names>P</given-names> </name><etal/></person-group><article-title>Care settings of transient ischemic attack in the United States: a cohort study from the TriNetX health research network</article-title><source>J Stroke Cerebrovasc Dis</source><year>2024</year><month>09</month><volume>33</volume><issue>9</issue><fpage>107888</fpage><pub-id pub-id-type="doi">10.1016/j.jstrokecerebrovasdis.2024.107888</pub-id><pub-id pub-id-type="medline">39067658</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Burke</surname><given-names>JF</given-names> </name><name name-style="western"><surname>Hayward</surname><given-names>RA</given-names> </name><name name-style="western"><surname>Nelson</surname><given-names>JP</given-names> </name><etal/></person-group><article-title>Using internally developed risk models to assess heterogeneity in treatment effects in clinical trials</article-title><source>Circ Cardiovasc Qual Outcomes</source><year>2014</year><month>01</month><volume>7</volume><issue>1</issue><fpage>163</fpage><lpage>169</lpage><pub-id pub-id-type="doi">10.1161/CIRCOUTCOMES.113.000497</pub-id><pub-id pub-id-type="medline">24425710</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name 
name-style="western"><surname>Hastie</surname><given-names>T</given-names> </name><name name-style="western"><surname>Friedman</surname><given-names>J</given-names> </name><name name-style="western"><surname>Tibshirani</surname><given-names>R</given-names> </name></person-group><source>The Elements of Statistical Learning</source><year>2001</year><publisher-name>Springer</publisher-name><pub-id pub-id-type="doi">10.1007/978-0-387-21606-5</pub-id><pub-id pub-id-type="other">978-1-4899-0519-2</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Linardatos</surname><given-names>P</given-names> </name><name name-style="western"><surname>Papastefanopoulos</surname><given-names>V</given-names> </name><name name-style="western"><surname>Kotsiantis</surname><given-names>S</given-names> </name></person-group><article-title>Explainable AI: a review of machine learning interpretability methods</article-title><source>Entropy (Basel)</source><year>2020</year><month>12</month><day>25</day><volume>23</volume><issue>1</issue><fpage>18</fpage><pub-id pub-id-type="doi">10.3390/e23010018</pub-id><pub-id pub-id-type="medline">33375658</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Lundberg</surname><given-names>SM</given-names> </name><name name-style="western"><surname>Erion</surname><given-names>G</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>H</given-names> </name><etal/></person-group><article-title>Explainable AI for trees: from local explanations to global understanding</article-title><comment>Preprint posted online in 2019</comment><pub-id pub-id-type="doi">10.48550/arXiv.1905.04610</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bertsimas</surname><given-names>D</given-names> </name><name name-style="western"><surname>Pauphilet</surname><given-names>J</given-names> </name><name name-style="western"><surname>Van Parys</surname><given-names>B</given-names> </name></person-group><article-title>Sparse classification: a scalable discrete optimization perspective</article-title><source>Mach Learn</source><year>2021</year><month>12</month><volume>110</volume><issue>11-12</issue><fpage>3177</fpage><lpage>3209</lpage><pub-id pub-id-type="doi">10.1007/s10994-021-06085-5</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bertsimas</surname><given-names>D</given-names> </name><name name-style="western"><surname>Dunn</surname><given-names>J</given-names> </name></person-group><article-title>Optimal classification trees</article-title><source>Mach Learn</source><year>2017</year><month>07</month><volume>106</volume><issue>7</issue><fpage>1039</fpage><lpage>1082</lpage><pub-id pub-id-type="doi">10.1007/s10994-017-5633-9</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Garcia</surname><given-names>GGP</given-names> </name><name name-style="western"><surname>Lavieri</surname><given-names>MS</given-names> </name><name name-style="western"><surname>Jiang</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Data-driven stochastic optimization approaches to determine decision thresholds for risk estimation models</article-title><source>IISE Transactions</source><year>2020</year><month>10</month><day>2</day><volume>52</volume><issue>10</issue><fpage>1098</fpage><lpage>1121</lpage><pub-id 
pub-id-type="doi">10.1080/24725854.2020.1725254</pub-id><pub-id pub-id-type="medline">29963653</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Breiman</surname><given-names>L</given-names> </name></person-group><article-title>Random forests</article-title><source>Mach Learn</source><year>2001</year><volume>45</volume><issue>1</issue><fpage>5</fpage><lpage>32</lpage><pub-id pub-id-type="doi">10.1023/A:1010933404324</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Goldstein</surname><given-names>A</given-names> </name><name name-style="western"><surname>Kapelner</surname><given-names>A</given-names> </name><name name-style="western"><surname>Bleich</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Peeking inside the black box: visualizing statistical learning with plots of individual conditional expectation</article-title><comment>Preprint posted online in 2014</comment><pub-id pub-id-type="doi">10.48550/arXiv.1309.6392</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Molnar</surname><given-names>C</given-names> </name></person-group><source>9.6 SHAP (SHapley Additive exPlanations) | Interpretable Machine Learning</source><access-date>2025-01-06</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://christophm.github.io/interpretable-ml-book/shap.html">https://christophm.github.io/interpretable-ml-book/shap.html</ext-link></comment></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rudin</surname><given-names>C</given-names> 
</name></person-group><article-title>Stop explaining black box machine learning models for high stakes decisions and use interpretable models instead</article-title><source>Nat Mach Intell</source><year>2019</year><volume>1</volume><issue>5</issue><fpage>206</fpage><lpage>215</lpage><pub-id pub-id-type="doi">10.1038/s42256-019-0048-x</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Schwartz</surname><given-names>JM</given-names> </name><name name-style="western"><surname>Moy</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Rossetti</surname><given-names>SC</given-names> </name><etal/></person-group><article-title>Clinician involvement in research on machine learning&#x2013;based predictive clinical decision support for the hospital setting: a scoping review</article-title><source>J Am Med Inform Assoc</source><year>2021</year><month>03</month><day>1</day><volume>28</volume><issue>3</issue><fpage>653</fpage><lpage>663</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocaa296</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Waljee</surname><given-names>AK</given-names> </name><name name-style="western"><surname>Wallace</surname><given-names>BI</given-names> </name><name name-style="western"><surname>Cohen-Mekelburg</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Development and validation of machine learning models in prediction of remission in patients with moderate to severe Crohn disease</article-title><source>JAMA Netw Open</source><year>2019</year><month>05</month><day>3</day><volume>2</volume><issue>5</issue><fpage>e193721</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2019.3721</pub-id><pub-id 
pub-id-type="medline">31074823</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bento</surname><given-names>V</given-names> </name><name name-style="western"><surname>Kohler</surname><given-names>M</given-names> </name><name name-style="western"><surname>Diaz</surname><given-names>P</given-names> </name><etal/></person-group><article-title>Improving deep learning performance by using explainable artificial intelligence (XAI) approaches</article-title><source>Discov Artif Intell</source><year>2021</year><month>12</month><volume>1</volume><issue>1</issue><fpage>9</fpage><pub-id pub-id-type="doi">10.1007/s44163-021-00008-y</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Goodman</surname><given-names>B</given-names> </name><name name-style="western"><surname>Flaxman</surname><given-names>S</given-names> </name></person-group><article-title>European Union regulations on algorithmic decision making and a &#x201C;Right to Explanation&#x201D;</article-title><source>AI Mag</source><year>2017</year><month>09</month><volume>38</volume><issue>3</issue><fpage>50</fpage><lpage>57</lpage><pub-id pub-id-type="doi">10.1609/aimag.v38i3.2741</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Militello</surname><given-names>C</given-names> </name><name name-style="western"><surname>Prinzi</surname><given-names>F</given-names> </name><name name-style="western"><surname>Sollami</surname><given-names>G</given-names> </name><etal/></person-group><article-title>CT radiomic features and clinical biomarkers for predicting coronary artery disease</article-title><source>Cogn 
Comput</source><year>2023</year><month>01</month><volume>15</volume><issue>1</issue><fpage>238</fpage><lpage>253</lpage><pub-id pub-id-type="doi">10.1007/s12559-023-10118-7</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lipton</surname><given-names>ZC</given-names> </name></person-group><article-title>The Mythos of Model Interpretability: In machine learning, the concept of interpretability is both important and slippery</article-title><source>Queue</source><year>2018</year><month>06</month><day>1</day><volume>16</volume><issue>3</issue><fpage>31</fpage><lpage>57</lpage><pub-id pub-id-type="doi">10.1145/3236386.3241340</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Keyvanshokooh</surname><given-names>E</given-names> </name><name name-style="western"><surname>Zhalechian</surname><given-names>M</given-names> </name><name name-style="western"><surname>Shi</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Contextual learning with online convex optimization: theory and application to medical decision-making</article-title><source>Manage Sci</source><year>2025</year><pub-id pub-id-type="doi">10.1287/mnsc.2019.03211</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Code repository.</p><media xlink:href="medinform_v13i1e66200_app1.zip" xlink:title="ZIP File, 1376 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Details of model descriptions and hyperparameters, along with explanation methods.</p><media xlink:href="medinform_v13i1e66200_app2.docx" xlink:title="DOCX File, 17 KB"/></supplementary-material></app-group></back></article>