<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id><journal-id journal-id-type="publisher-id">medinform</journal-id><journal-id journal-id-type="index">7</journal-id><journal-title>JMIR Medical Informatics</journal-title><abbrev-journal-title>JMIR Med Inform</abbrev-journal-title><issn pub-type="epub">2291-9694</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v14i1e75655</article-id><article-id pub-id-type="doi">10.2196/75655</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Enhancing Model Generalizability in Medical Artificial Intelligence: Systematic Comparison of Categorical Encoding and Sampling Techniques for Imbalanced Data</article-title></title-group><contrib-group><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Chuang</surname><given-names>Chien-wei</given-names></name><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Wu</surname><given-names>Chung-Kuan</given-names></name><degrees>Prof Dr</degrees><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="aff" rid="aff4">4</xref><xref ref-type="aff" rid="aff5">5</xref><xref ref-type="aff" rid="aff6">6</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Wu</surname><given-names>Chao-Hsin</given-names></name><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Shia</surname><given-names>Ben-Chang</given-names></name><degrees>Prof Dr</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Chen</surname><given-names>Mingchih</given-names></name><degrees>Prof Dr</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib></contrib-group><aff id="aff1"><institution>Graduate Institute of Business Administration, Fu Jen Catholic University</institution><addr-line>510 Zhongzheng Rd, Xinzhuang District</addr-line><addr-line>New Taipei City</addr-line><country>Taiwan</country></aff><aff id="aff2"><institution>Artificial Intelligence Development Center, Fu Jen Catholic University</institution><addr-line>New Taipei City</addr-line><country>Taiwan</country></aff><aff id="aff3"><institution>Division of Nephrology, Shin Kong Wu Ho-Su Memorial Hospital</institution><addr-line>Taipei</addr-line><country>Taiwan</country></aff><aff id="aff4"><institution>Dialysis Access Management Center, Shin Kong Wu Ho-Su Memorial Hospital</institution><addr-line>Taipei</addr-line><country>Taiwan</country></aff><aff id="aff5"><institution>School of Medicine, Fu Jen Catholic University</institution><addr-line>New Taipei City</addr-line><country>Taiwan</country></aff><aff id="aff6"><institution>Division of Digital Informatics Management, Department of Digital Medicine, Shin Kong Wu Ho-Su Memorial Hospital</institution><addr-line>Taipei</addr-line><country>Taiwan</country></aff><contrib-group><contrib contrib-type="editor"><name 
name-style="western"><surname>Klann</surname><given-names>Jeffrey</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Velmurugan</surname><given-names>P</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Lin</surname><given-names>Pei-Chun</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Kao</surname><given-names>Yi-Wei</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Mingchih Chen, Prof Dr, Graduate Institute of Business Administration, Fu Jen Catholic University, 510 Zhongzheng Rd, Xinzhuang District, New Taipei City, 242062, Taiwan, 886 2905-3895; <email>081438@mail.fju.edu.tw</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>these authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>13</day><month>4</month><year>2026</year></pub-date><volume>14</volume><elocation-id>e75655</elocation-id><history><date date-type="received"><day>08</day><month>04</month><year>2025</year></date><date date-type="rev-recd"><day>08</day><month>12</month><year>2025</year></date><date date-type="accepted"><day>05</day><month>02</month><year>2026</year></date></history><copyright-statement>&#x00A9; Chien-wei Chuang, Chung-Kuan Wu, Chao-Hsin Wu, Ben-Chang Shia, Mingchih Chen. Originally published in JMIR Medical Informatics (<ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org">https://medinform.jmir.org</ext-link>), 13.4.2026. 
</copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org/">https://medinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://medinform.jmir.org/2026/1/e75655"/><abstract><sec><title>Background</title><p>Despite the increasing use of machine learning (ML) in clinical research, the early stages of data preparation, especially for structured clinical data, often receive limited methodological scrutiny. These datasets typically contain missing values, complex categorical variables, and imbalanced class distributions, all of which complicate downstream model development and interpretation.</p></sec><sec><title>Objective</title><p>This study introduces a structured preprocessing framework designed to address common challenges in medical tabular data and to assess how preprocessing choices affect the stability and portability of predictive models across settings.</p></sec><sec sec-type="methods"><title>Methods</title><p>We constructed a modular workflow comprising 3 components. 
First, preprocessing strategies included imputation for missing data, 3 types of categorical encoding (one-hot, frequency, and target), and resampling approaches for class imbalance (Synthetic Minority Over-sampling Technique [SMOTE] and Random Over-Sampling Examples [ROSE]). Second, 6 classification algorithms were used to evaluate performance patterns, including logistic regression (LGR), decision tree (DT), random forest, XGBoost (XGB), CatBoost (CAT), and light gradient-boosting machine (LightGBM). Third, we assessed cross-dataset portability using 2 datasets with distinct data-generating mechanisms: a registry for patients with end-stage renal disease (ESRD; n=412) and the population-based Behavioral Risk Factor Surveillance System (BRFSS) 2015 survey. For each dataset, we independently cleaned, standardized, encoded, tuned, and evaluated models using the same predefined hyperparameter search space, without cross-dataset feature matching or pooling the area under the ROC curve (AUC) calculations; the complete pipeline was then rerun on BRFSS as an external replication.</p></sec><sec sec-type="results"><title>Results</title><p>One-hot encoding in combination with ROSE yielded the most consistent performance improvements in terms of AUC (0.940) and accuracy (0.932), particularly for classifiers sensitive to class distribution. Notably, ROSE enhanced sensitivity without substantially distorting the original data structure. Feature importance rankings also contributed to model interpretability, and performance trends were largely reproducible in cross-context application.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Our findings suggest that preprocessing decisions often treated as ancillary play a central role in shaping model outcomes, especially in high-variance clinical datasets. 
The proposed framework offers a reproducible and adaptable tool for aligning data preparation with the unique demands of health care prediction tasks and may serve as a foundation for future efforts to standardize preprocessing in clinical ML workflows.</p></sec></abstract><kwd-group><kwd>machine learning</kwd><kwd>data preprocessing</kwd><kwd>clinical prediction models</kwd><kwd>medical informatics</kwd><kwd>feature engineering</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Although health care institutions generate vast volumes of health and clinical tabular data on a daily basis, these valuable resources frequently encounter significant bottlenecks when being translated into effective predictive models [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. In practice, suboptimal model performance is often not attributable to the limitations of the algorithms themselves but rather to the lack of consistency and strategic planning in the data preprocessing pipeline [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>]. Prior studies have demonstrated that various preprocessing techniques such as data imputation, normalization, and feature selection are highly sensitive in their impact on model performance [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref6">6</xref>], and may even introduce bias or lead to overfitting [<xref ref-type="bibr" rid="ref7">7</xref>]. Consequently, a critical yet insufficiently addressed question arises: Can we establish a generalizable and reproducible data processing framework that assists users in selecting appropriate machine learning (ML) predictive models?</p><p>In the domain of medical informatics, ML models have been extensively used for predictive analyses of health and clinical data. 
However, extant literature has predominantly focused on comparing algorithmic accuracy and model performance [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>], while offering limited systematic investigation into critical data processing steps such as data cleaning, feature encoding, and handling of class imbalance [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref11">11</xref>]. This oversight is particularly consequential given the inherent characteristics of medical datasets, which frequently exhibit high rates of missing values, numerous categorical variables, and severely imbalanced outcome distributions&#x2014;factors that significantly influence predictive outcomes depending on the preprocessing choices made [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref13">13</xref>].</p><p>Despite the enormous volume of clinical and health data generated daily by health care institutions, transforming these data into effective predictive models remains challenging due to pervasive issues such as high missingness, class imbalance, and the need for robust variable encoding. Although prior studies have examined individual procedures such as comparing one-hot versus target encoding [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref15">15</xref>] and evaluating synthetic data augmentation techniques like the Synthetic Minority Over-sampling Technique (SMOTE) [<xref ref-type="bibr" rid="ref16">16</xref>], systematic evaluations of the interactive effects among diverse preprocessing workflows are exceedingly scarce, particularly across heterogeneous medical datasets. 
Furthermore, there is a notable gap in comparative research on encoding strategies (eg, one-hot, frequency, and target encoding) and imbalance correction methods (eg, SMOTE and Random Over-Sampling Examples [ROSE]), which hampers the establishment of best practices in clinical ML.</p><p>This study proposes a comprehensive, reproducible framework specifically for medical tabular data. The framework is built upon 3 core components: (1) the Data Processing Strategy Layer, which systematically evaluates essential preprocessing techniques, including missing value imputation, variable encoding, and class imbalance correction; (2) the Model Selection and Optimization Layer, which ensures compatibility with a diverse range of supervised learning algorithms; and (3) Cross-Dataset Validation, which tests the framework&#x2019;s transferability and consistency on 2 highly heterogeneous real-world clinical datasets. This design not only streamlines the preprocessing pipeline but also minimizes overlap with subsequent methodological details.</p><p>The primary contribution of this work lies in developing and empirically validating an end-to-end data processing framework that transcends the limitations of single-model, single-dataset analyses. By shifting the focus from solely model-centric performance metrics to a holistic methodological architecture, our approach provides both data scientists and clinical researchers with a modular and standardized workflow. This framework is expected to bridge the gap between algorithm development and clinical application, offering robust empirical evidence and actionable guidance for advancing predictive modeling in health care.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Data Collection</title><p>Initially, we enrolled 542 adult patients with ESRD undergoing hemodialysis at the hemodialysis unit of a medical center between October 1, 2018, and December 31, 2021. 
Patients who had received hemodialysis for less than 3 months or who were transferred to other clinics during the study period were excluded. After these exclusions, 412 adult patients with ESRD undergoing chronic hemodialysis without transfer remained eligible for analysis. Among them, 242 patients had no occurrence of major adverse cardiovascular events (MACEs), while 170 experienced at least one MACE. The primary objective of this study was to determine the incidence of MACEs in this population. A flowchart of the study participants is presented in <xref ref-type="fig" rid="figure1">Figure 1</xref>. The detailed baseline demographic and clinical characteristics of the cohort are summarized in <xref ref-type="table" rid="table1">Table 1</xref>.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Flowchart of study population selection and major adverse cardiovascular event (MACE) grouping. ESRD: end-stage renal disease; MACE: major adverse cardiovascular event.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e75655_fig01.png"/></fig><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Initial demographic and clinical profiles of the research cohort.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom" rowspan="2">Variables</td><td align="left" valign="bottom" rowspan="2">Overall (n=412)</td><td align="left" valign="bottom" colspan="2">MACE<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td><td align="left" valign="bottom" rowspan="2"><italic>P</italic> value</td></tr><tr><td align="left" valign="bottom">Never occurred<break/>(n=242)</td><td align="left" valign="bottom">Occurred<break/>(n=170)</td></tr></thead><tbody><tr><td align="left" valign="top">Age, mean (SD)</td><td align="left" valign="top">69.19 (12.14)</td><td align="left" valign="top">67.96 (12.59)</td><td align="left" 
valign="top">70.94 (11.29)</td><td align="left" valign="top">.01</td></tr><tr><td align="left" valign="top">Sex (female), n (%)</td><td align="left" valign="top">192 (46.6)</td><td align="left" valign="top">122 (50.4)</td><td align="left" valign="top">70 (41.2)</td><td align="left" valign="top">.08</td></tr><tr><td align="left" valign="top">AVG (Arteriovenous graft), n (%)</td><td align="left" valign="top">57 (13.8)</td><td align="left" valign="top">31 (12.8)</td><td align="left" valign="top">26 (15.3)</td><td align="left" valign="top">.57</td></tr><tr><td align="left" valign="top">AV cal, n (%)<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup></td><td align="left" valign="top">196 (62.6)</td><td align="left" valign="top">106 (56.1)</td><td align="left" valign="top">90 (72.6)</td><td align="left" valign="top">.005</td></tr><tr><td align="left" valign="top">AR<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup>, n (%)</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">.14</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>None</td><td align="left" valign="top">178 (43.2)</td><td align="left" valign="top">115 (47.5)</td><td align="left" valign="top">63 (37.1)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Negligible or mild</td><td align="left" valign="top">125 (30.3)</td><td align="left" valign="top">68 (28.1)</td><td align="left" valign="top">57 (33.5)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Moderate</td><td align="left" valign="top">8 (1.9)</td><td align="left" valign="top">3 (1.2)</td><td align="left" valign="top">5 (2.9)</td><td align="left" 
valign="top"/></tr><tr><td align="left" valign="top">AS, n (%)<sup><xref ref-type="table-fn" rid="table1fn4">d</xref></sup></td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">.03</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>-</td><td align="left" valign="top">101 (24.5)</td><td align="left" valign="top">56 (23.1)</td><td align="left" valign="top">45 (26.5)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>None</td><td align="left" valign="top">277 (67.2)</td><td align="left" valign="top">172 (71.1)</td><td align="left" valign="top">105 (61.8)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Negligible or mild</td><td align="left" valign="top">22 (5.3)</td><td align="left" valign="top">12 (5.0)</td><td align="left" valign="top">10 (5.9)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Moderate</td><td align="left" valign="top">11 (2.7)</td><td align="left" valign="top">2 (0.8)</td><td align="left" valign="top">9 (5.3)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Severe</td><td align="left" valign="top">1 (0.2)</td><td align="left" valign="top">0 (0.0)</td><td align="left" valign="top">1 (0.6)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top">LVH type, n (%)<sup><xref ref-type="table-fn" rid="table1fn5">e</xref></sup></td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" 
valign="top">.02</td></tr><tr><td align="char" char="." valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>1</td><td align="left" valign="top">131 (43.8)</td><td align="left" valign="top">72 (39.8)</td><td align="left" valign="top">59 (50.0)</td><td align="left" valign="top"/></tr><tr><td align="char" char="." valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>2</td><td align="left" valign="top">106 (35.5)</td><td align="left" valign="top">63 (34.8)</td><td align="left" valign="top">43 (36.4)</td><td align="left" valign="top"/></tr><tr><td align="char" char="." valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>3</td><td align="left" valign="top">28 (9.4)</td><td align="left" valign="top">24 (13.3)</td><td align="left" valign="top">4 (3.4)</td><td align="left" valign="top"/></tr><tr><td align="char" char="." valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>4</td><td align="left" valign="top">34 (11.4)</td><td align="left" valign="top">22 (12.2)</td><td align="left" valign="top">12 (10.2)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top" colspan="5">Comorbidities, n (%)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>DM<sup><xref ref-type="table-fn" rid="table1fn6">f</xref></sup></td><td align="left" valign="top">198 (48.1)</td><td align="left" valign="top">99 (40.9)</td><td align="left" valign="top">99 (58.2)</td><td align="left" valign="top">.001</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Dyslipid</td><td align="left" valign="top">220 (53.4)</td><td align="left" valign="top">130 (53.7)</td><td align="left" valign="top">90 (52.9)</td><td align="left" valign="top">.96</td></tr><tr><td 
align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>PAOD<sup><xref ref-type="table-fn" rid="table1fn7">g</xref></sup></td><td align="left" valign="top">111 (26.9)</td><td align="left" valign="top">53 (21.9)</td><td align="left" valign="top">58 (34.1)</td><td align="left" valign="top">.01</td></tr><tr><td align="left" valign="top" colspan="5">Medication, n (%)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Insulin</td><td align="left" valign="top">85 (20.6)</td><td align="left" valign="top">33 (13.6)</td><td align="left" valign="top">52 (30.6)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Statin</td><td align="left" valign="top">137 (33.3)</td><td align="left" valign="top">73 (30.2)</td><td align="left" valign="top">64 (37.6)</td><td align="left" valign="top">.14</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Antiplatelet</td><td align="left" valign="top">199 (48.3)</td><td align="left" valign="top">82 (33.9)</td><td align="left" valign="top">117 (68.8)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Calcitriol</td><td align="left" valign="top">173 (42.0)</td><td align="left" valign="top">104 (43.0)</td><td align="left" valign="top">69 (40.6)</td><td align="left" valign="top">.70</td></tr><tr><td align="left" valign="top">No of hypotension episodes, mean (SD)</td><td align="left" valign="top">5.41 (3.26)</td><td align="left" valign="top">5.46 (3.21)</td><td align="left" valign="top">5.34 (3.33)</td><td align="left" valign="top">.71</td></tr><tr><td align="left" valign="top">CXR_AoAC, n 
(%)<sup><xref ref-type="table-fn" rid="table1fn8">h</xref></sup></td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="char" char="." valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>0</td><td align="left" valign="top">120 (31.4)</td><td align="left" valign="top">90 (40.5)</td><td align="left" valign="top">30 (18.8)</td><td align="left" valign="top"/></tr><tr><td align="char" char="." valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>1</td><td align="left" valign="top">83 (21.7)</td><td align="left" valign="top">49 (22.1)</td><td align="left" valign="top">34 (21.2)</td><td align="left" valign="top"/></tr><tr><td align="char" char="." valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>2</td><td align="left" valign="top">107 (28.0)</td><td align="left" valign="top">49 (22.1)</td><td align="left" valign="top">58 (36.2)</td><td align="left" valign="top"/></tr><tr><td align="char" char="." 
valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>3</td><td align="left" valign="top">72 (18.8)</td><td align="left" valign="top">34 (15.3)</td><td align="left" valign="top">38 (23.8)</td><td align="left" valign="top"/></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>MACE: major adverse cardiovascular event.</p></fn><fn id="table1fn2"><p><sup>b</sup>AV cal: aortic valve calcification.</p></fn><fn id="table1fn3"><p><sup>c</sup>AR: aortic regurgitation.</p></fn><fn id="table1fn4"><p><sup>d</sup>AS: aortic stenosis.</p></fn><fn id="table1fn5"><p><sup>e</sup>LVH: left ventricular hypertrophy.</p></fn><fn id="table1fn6"><p><sup>f</sup>DM: diabetes mellitus.</p></fn><fn id="table1fn7"><p><sup>g</sup>PAOD: peripheral arterial occlusion disease.</p></fn><fn id="table1fn8"><p><sup>h</sup>CXR_AoAC: chest X-ray for aortic arch calcification.</p></fn></table-wrap-foot></table-wrap><p>This study specifically targeted the unique clinical needs and complexities of patients with ESRD by collecting 84 variables. These variables were meticulously selected based on their significant impact on clinical outcomes in patients with ESRD. Demographic data, such as age and gender, were included, and dialysis vintage reflected the duration and history of each patient&#x2019;s dialysis treatment. Additionally, we detailed the anatomical and functional characteristics of the aortic and mitral valves, which are crucial for understanding cardiovascular complications in patients with ESRD. 
Specifically, we included parameters such as types of arteriovenous access (AVA), mitral valve calcification (MV calc), aortic regurgitation (AR), aortic stenosis (AS), mitral regurgitation (MR), and mitral stenosis (MS).</p><p>In terms of cardiovascular health, this study placed particular emphasis on the grading and types of left ventricular hypertrophy (LVH) [<xref ref-type="bibr" rid="ref17">17</xref>] and the ejection fraction (EF) of the heart, which are critical indicators of cardiovascular health in patients with ESRD. Given the high prevalence and impact of comorbidities such as diabetes mellitus, hypertension, dyslipidemia, coronary artery disease, heart failure, chronic obstructive pulmonary disease, liver cirrhosis, malignancy, arrhythmia, and a history of amputation among patients with ESRD, we included these comorbidities in our analysis.</p><p>To better meet the clinical needs of patients with ESRD, we expanded the range of biochemical laboratory data. This included comprehensive assessments of total protein, albumin, liver enzymes (aspartate aminotransferase and alanine aminotransferase), alkaline phosphatase [<xref ref-type="bibr" rid="ref18">18</xref>], total bilirubin, lipid profiles, glucose levels, complete blood count, iron studies, aluminum levels, postdialysis weight, uric acid, and key electrolytes. To address the specific needs of patients with ESRD, we further measured calcium and phosphate metabolism indicators, such as calcium and phosphate levels, urea kinetics (Kt/V), parathyroid hormone levels, and the calcium-phosphate product, to more effectively manage mineral and bone disorders in these patients.</p><p>The medication history was also thoroughly documented, particularly focusing on drugs commonly used in the management of ESRD, such as phosphate binders, calcitriol, and other treatments relevant to the patients&#x2019; condition.</p><p>This study included 412 patients. 
The mean age was 69.19 years, with older patients more likely to experience MACE (70.94 years vs 67.96 years, <italic>P</italic>=.01). Females comprised 46.6% of the cohort, with a lower proportion in the MACE group, although this difference was not statistically significant (<italic>P</italic>=.08). Aortic valve calcification was more prevalent in the MACE group (72.6% vs 56.1%, <italic>P</italic>=.005), and AS as well as certain types of LVH were also associated with the occurrence of MACE. Diabetes mellitus (DM) and peripheral arterial occlusive disease (PAOD) were more common among patients who experienced MACE (DM: 58.2% vs 40.9%; <italic>P</italic>=.001; PAOD: 34.1% vs 21.9%; <italic>P</italic>=.008). In terms of medication use, a higher proportion of patients in the MACE group were on insulin (<italic>P</italic>&#x003C;.001) and antiplatelet drugs (<italic>P</italic>&#x003C;.001). These results provide an overview of the baseline characteristics of the patients and offer important references for improving the accuracy of predictive models.</p></sec><sec id="s2-2"><title>Data Preparation</title><p>The data preprocessing methodology in this study is organized into 2 primary components: variable encoding and data balancing. The overall analytical workflow and preprocessing framework are illustrated in <xref ref-type="fig" rid="figure2">Figure 2</xref>. This integrated approach is designed to enhance the quality and representativeness of clinical datasets, thereby improving the robustness and generalizability of subsequent predictive models.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Research analysis process framework diagram. 
Preprocessing components are fit on training folds only and then applied to validation folds; ROSE or SMOTE are applied to training folds only to prevent leakage.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e75655_fig02.png"/></fig></sec><sec id="s2-3"><title>Data Imputation</title><p>Missing data are a common challenge in clinical datasets and can significantly compromise the validity of statistical inferences if not appropriately addressed. In this study, we used a nonparametric multiple imputation strategy using the <italic>missForest</italic> package in R (R Core Team) [<xref ref-type="bibr" rid="ref19">19</xref>]. This method uses random forest models to iteratively predict missing values based on observed data, effectively capturing complex nonlinear associations and interactions between variables. Unlike simpler imputation techniques such as mean substitution or k-nearest neighbors, missForest has been shown to yield more accurate and less biased estimates in both continuous and categorical variables, particularly in mixed type medical data [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref20">20</xref>]. This approach offers a robust and flexible foundation for downstream ML analysis while preserving the integrity of the original dataset.</p><p>Compared to SMOTE, which generates new samples through linear interpolation between neighboring minority class points, ROSE applies a smoothed bootstrap technique. It estimates a kernel density around each minority instance and samples new points from this local distribution. This approach allows ROSE to preserve the original variance and nonlinear structure of the data, making it particularly effective for clinical datasets where preserving subtle distributional patterns is important.</p><p>Imputation models were fit on the training fold only and then applied to the corresponding validation fold within each split. 
No statistics from the validation fold were used to fit the imputation model. This fold-wise procedure was repeated across all folds to prevent information leakage.</p></sec><sec id="s2-4"><title>Variable Encoding and Expansion</title><p>In clinical datasets, categorical variables are abundant and require conversion into numerical formats to be effectively used in statistical and ML models. Three encoding methods were implemented to address this challenge, each offering a unique balance between preserving information and managing computational complexity. Although all 3 approaches aim to transform qualitative data into quantitative representations, they differ in their operational mechanisms and associated trade-offs.</p><p>One-hot encoding converts each categorical variable into a series of binary indicators, where each category is represented by an individual binary feature. This method maintains the inherent nonordinal nature of the original variable but can lead to a substantial increase in dimensionality, particularly when the variable in question has many unique categories. In contrast, target encoding substitutes each category with a statistical summary such as the mean, weighted mean, or smoothed mean of the target variable computed from the training dataset. This not only reduces dimensionality but also encapsulates the predictive relationship between the categorical feature and the outcome variable, although it necessitates careful handling to avoid target leakage [<xref ref-type="bibr" rid="ref14">14</xref>]. A third approach, frequency encoding, assigns to each category a numerical value based on its relative frequency within the dataset. 
This method is highly efficient in reducing computational burden and memory usage, as it compresses categorical information into a single continuous variable without imposing any artificial order [<xref ref-type="bibr" rid="ref21">21</xref>].</p><p>To combine the complementary strengths of these methods, a unified encoding strategy was adopted. Specifically, the one_hot function from the <italic>mltools</italic> package in R was applied to perform one-hot encoding, which expanded the original set of 83 variables to 113. This expansion increased the granularity of the dataset, facilitating a more detailed representation of the clinical phenomena under this research. The inclusion of target and frequency encoding provided an additional layer of comparison, enabling an evaluation of their relative performance under conditions of significant missingness and class imbalance. Prior work has demonstrated that, particularly in datasets with high proportions of missing data and imbalanced classes, preprocessing methods based on one-hot encoding can significantly enhance both accuracy and robustness in classification tasks [<xref ref-type="bibr" rid="ref15">15</xref>].</p><p>All categorical encoders were fit on the training fold only and then applied to its validation fold. Target encoding used an out-of-fold smoothing scheme: for each fold, category means and smoothing weights were computed from the training fold and then mapped to the validation fold. No target information from the validation fold was used to compute encodings.</p></sec><sec id="s2-5"><title>Data Imbalance</title><p>Clinical datasets often exhibit imbalanced class distributions, where the minority class, despite its clinical significance, is underrepresented. Such imbalance can lead to biased models that disproportionately favor the majority class. 
To counteract this, 2 complementary strategies, Random Over-Sampling (ROS) and SMOTE, were incorporated, along with an evaluation of the ROSE method [<xref ref-type="bibr" rid="ref22">22</xref>].</p><p>The ROSE method uses a bootstrap resampling framework augmented by kernel density estimation to generate synthetic samples for the minority class. This approach avoids the pitfalls associated with simply duplicating minority samples, offering a more nuanced correction of the class distribution. Menardi and Torelli [<xref ref-type="bibr" rid="ref23">23</xref>] provide an extensive discussion of these resampling techniques, emphasizing their utility in balancing datasets for binary classification tasks. On the other hand, SMOTE, as introduced by Chawla et al [<xref ref-type="bibr" rid="ref24">24</xref>], synthesizes new minority class instances by interpolating between existing samples. This technique enriches the minority class by generating additional, diverse examples, thereby improving the model&#x2019;s ability to capture the characteristics of rare events. Empirical studies have shown that SMOTE can lead to marked improvements in performance metrics, such as the area under the ROC curve (AUC), though it may overgeneralize when faced with extreme imbalance.</p><p>Comparative evaluations in the literature further underscore the respective merits of these techniques. For instance, Kamalov et al [<xref ref-type="bibr" rid="ref25">25</xref>] found that ROSE, despite its relative simplicity, remains a stable and computationally efficient solution, especially in multi-label contexts. 
Similarly, investigations by Gnip et al [<xref ref-type="bibr" rid="ref26">26</xref>] and Nguyen et al [<xref ref-type="bibr" rid="ref27">27</xref>] have confirmed that both ROSE and SMOTE are effective in mitigating the adverse effects of class imbalance, with the optimal choice being contingent upon the specific characteristics of the dataset and the available computational resources.</p><sec id="s2-5-1"><title>Integration of Preprocessing Components</title><p>The overall preprocessing workflow was designed to integrate the 2 components, variable encoding and data balancing, into a coherent sequence. Initially, missing values were imputed using the random forest approach, ensuring that the dataset was complete and reliable. This was followed by the transformation of categorical variables via the combined encoding strategy, which not only translated qualitative data into numerical features but also expanded the feature set to enhance data granularity. Finally, class imbalance was addressed through the application of ROSE and SMOTE, thereby ensuring that the resulting dataset was well-suited for the development of predictive models.</p><p>Each preprocessing step was carefully implemented so that subsequent operations built upon an increasingly refined version of the dataset. The robust imputation stage preserved the original data&#x2019;s distributional properties, while the encoding procedures facilitated the construction of a rich, multidimensional feature space. The balancing techniques further adjusted the dataset to prevent bias toward the majority class, enabling the models to more effectively capture the subtleties of clinically significant, albeit rare, events.</p></sec><sec id="s2-5-2"><title>Methodological Rationale and Literature Justification</title><p>The methodological choices articulated above are firmly grounded in this body of literature. 
The use of a random forest&#x2013;based imputation method is well-supported by studies demonstrating its efficacy in preserving data structure and ensuring the validity of statistical inferences [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref19">19</xref>]. Similarly, the comparative evaluation of encoding methods draws on prior research that highlights the trade-offs between dimensionality, computational efficiency, and the risk of target leakage [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref21">21</xref>]. Moreover, the advantages of one-hot encoding in contexts marked by high missingness and class imbalance have been substantiated by empirical investigations [<xref ref-type="bibr" rid="ref15">15</xref>]. The incorporation of data balancing strategies, including ROSE and SMOTE, is equally well-documented, with seminal works establishing their effectiveness in correcting class imbalances [<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref24">24</xref>], and more recent studies further validating these methods [<xref ref-type="bibr" rid="ref25">25</xref>-<xref ref-type="bibr" rid="ref27">27</xref>].</p><p>The data preprocessing methodology presented herein represents a rigorous and systematic approach to overcoming the multifaceted challenges inherent in clinical datasets. By sequentially addressing missing data, encoding categorical variables into a more informative numerical format, and correcting class imbalances, the workflow transforms raw clinical data into a format that is both analytically robust and amenable to predictive modeling. 
This integrated preprocessing pipeline is instrumental in bridging the gap between the complexities of clinical data and the demands of advanced statistical and ML techniques, ultimately contributing to the development of models that are both reliable and clinically pertinent.</p><p>Class rebalancing was applied only to the training portion of each fold. Validation folds preserved the original class distribution. For transparency and reproducibility, we report resampling parameters in Table S4 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, including SMOTE and ROSE sampling ratios and the random seed used.</p></sec></sec><sec id="s2-6"><title>Data Segmentation</title><p>k-fold cross-validation is a commonly used resampling technique for evaluating the performance of ML models. Specifically, the dataset is divided into k subsets (folds), and in each iteration, one subset is used as the validation set while the remaining k-1 subsets are used as the training set. This process is repeated k times, with a different subset used for validation each time.</p><p>This method effectively prevents overfitting and provides a more robust evaluation of the model. It is particularly useful for estimating the generalization error in small datasets [<xref ref-type="bibr" rid="ref28">28</xref>].</p><p>In this study, we used 5-fold cross-validation to obtain an accurate estimate of model performance through multiple splits and evaluations while reducing bias caused by data partitioning.</p></sec><sec id="s2-7"><title>Robust Model Engineering: Tuning, Processing, and Risk Mitigation</title><p>All preprocessing and modeling steps, including cleaning, standardization, encoding, resampling, model fitting, and hyperparameter tuning, were executed separately within each dataset. No features, encoders, parameters, or statistics were transferred across datasets. 
For each model, hyperparameters were tuned by grid search with 5-fold cross-validation on the training set. A single predefined hyperparameter search space and evaluation criterion, detailed in Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, was used for both ESRD and BRFSS. Grid search and model selection were rerun independently in each dataset so that optimal hyperparameters were learned within ESRD and BRFSS separately and were never reused across datasets.</p></sec><sec id="s2-8"><title>Model Building and Validation-Phase 1</title><p>In Phase 1, we compared alternative encoding and class-imbalance handling strategies for predicting MACE. Traditional logistic regression and 6 ML models were evaluated, including decision trees [<xref ref-type="bibr" rid="ref29">29</xref>], random forests [<xref ref-type="bibr" rid="ref30">30</xref>,<xref ref-type="bibr" rid="ref31">31</xref>], XGBoost [<xref ref-type="bibr" rid="ref32">32</xref>], CatBoost [<xref ref-type="bibr" rid="ref33">33</xref>], and LightGBM [<xref ref-type="bibr" rid="ref34">34</xref>]. All models were trained and evaluated using 5-fold cross-validation [<xref ref-type="bibr" rid="ref35">35</xref>] within each dataset, following the generic engineering framework described in the previous subsection, that is, grid search hyperparameter tuning with the shared search space in Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> and dataset-specific model selection.</p><p>The workflow, including preprocessing, resampling, and hyperparameter tuning, was then rerun independently on the BRFSS 2015 dataset to assess cross-dataset portability of the pipeline rather than to externally validate a single ESRD-trained model. All training was executed on a workstation equipped with an Intel Core i9 10th-generation CPU (3.3 GHz) and 64 GB RAM, using the CPU only. 
Average runtimes for each pipeline are reported in <xref ref-type="table" rid="table2">Tables 2</xref> and <xref ref-type="table" rid="table3">3</xref> to help readers gauge computational cost. In this phase, multiple performance metrics were used, including accuracy, sensitivity, specificity, precision, <italic>F</italic><sub>1</sub>-score, and AUC. The mean and SD of these metrics were calculated across folds to assess predictive accuracy, robustness, and consistency. The overall workflow is summarized in algorithm S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Evaluation metrics of 5-fold cross-validation using all encoding methods and imbalanced data processing methods.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom" colspan="3">Encoding, imbalance, and<break/>machine learning method</td><td align="left" valign="bottom" colspan="2">Accuracy</td><td align="left" valign="bottom" colspan="2"><italic>F</italic><sub>1</sub>-score</td><td align="left" valign="bottom" colspan="2">AUC<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="bottom">Runtime (sec)</td></tr><tr><td align="left" valign="top" colspan="3"/><td align="left" valign="top">Mean (SD)</td><td align="left" valign="top">95% CI</td><td align="left" valign="top">Mean (SD)</td><td align="left" valign="top">95% CI</td><td align="left" valign="top">Mean (SD)</td><td align="left" valign="top">95% CI</td><td align="left" valign="top"/></tr></thead><tbody><tr><td align="left" valign="top" colspan="3">One-hot encoding</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top" colspan="3"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>SMOTE<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RF<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="char" char="." valign="top">0.711 (0.048)</td><td align="char" char="." valign="top">0.680&#x2010;0.752</td><td align="char" char="." valign="top">0.660 (0.062)</td><td align="char" char="." valign="top">0.611&#x2010;0.739</td><td align="char" char="." valign="top">0.708 (0.050)</td><td align="char" char="." valign="top">0.658&#x2010;0.757</td><td align="left" valign="top">23.9</td></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>CAT<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup></td><td align="char" char="." valign="top">0.706 (0.062)</td><td align="char" char="." valign="top">0.701&#x2010;0.751</td><td align="char" char="." valign="top">0.689 (0.038)</td><td align="char" char="." valign="top">0.653&#x2010;0.726</td><td align="char" char="." valign="top">0.739 (0.044)</td><td align="char" char="." 
valign="top">0.695&#x2010;0.781</td><td align="left" valign="top">124.0</td></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LightGBM<sup><xref ref-type="table-fn" rid="table2fn5">e</xref></sup></td><td align="char" char="." valign="top">0.702 (0.043)</td><td align="char" char="." valign="top">0.675&#x2010;0.723</td><td align="char" char="." valign="top">0.694 (0.041)</td><td align="char" char="." valign="top">0.666&#x2010;0.705</td><td align="char" char="." valign="top">0.713 (0.032)</td><td align="char" char="." valign="top">0.670&#x2010;0.764</td><td align="left" valign="top">17.4</td></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>ROSE<sup><xref ref-type="table-fn" rid="table2fn6">f</xref></sup></td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RF</td><td align="char" char="." valign="top">0.932 (0.106)<sup><xref ref-type="table-fn" rid="table2fn7">g</xref></sup></td><td align="char" char="." valign="top">0.783&#x2010;1.000</td><td align="char" char="." valign="top">0.917 (0.131)<sup><xref ref-type="table-fn" rid="table2fn7">g</xref></sup></td><td align="char" char="." valign="top">0.752&#x2010;1.000</td><td align="char" char="." valign="top">0.938 (0.117)<sup><xref ref-type="table-fn" rid="table2fn7">g</xref></sup></td><td align="char" char="." 
valign="top">0.793&#x2010;1.000</td><td align="left" valign="top">23.9<bold><sup><xref ref-type="table-fn" rid="table2fn7">g</xref></sup></bold></td></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>CAT</td><td align="char" char="." valign="top">0.918 (0.094)<sup><xref ref-type="table-fn" rid="table2fn7">g</xref></sup></td><td align="char" char="." valign="top">0.795&#x2010;1.000</td><td align="char" char="." valign="top">0.890 (0.131)<sup><xref ref-type="table-fn" rid="table2fn7">g</xref></sup></td><td align="char" char="." valign="top">0.730&#x2010;0.995</td><td align="char" char="." valign="top">0.932 (0.103)<sup><xref ref-type="table-fn" rid="table2fn7">g</xref></sup></td><td align="char" char="." valign="top">0.788&#x2010;1.000</td><td align="left" valign="top">401.0<bold><sup><xref ref-type="table-fn" rid="table2fn7">g</xref></sup></bold></td></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LightGBM</td><td align="char" char="." valign="top">0.932 (0.112)<sup><xref ref-type="table-fn" rid="table2fn7">g</xref></sup></td><td align="char" char="." valign="top">0.759&#x2010;1.000</td><td align="char" char="." valign="top">0.918 (0.137)<sup><xref ref-type="table-fn" rid="table2fn7">g</xref></sup></td><td align="char" char="." valign="top">0.754&#x2010;1.000</td><td align="char" char="." valign="top">0.940 (0.116)<sup><xref ref-type="table-fn" rid="table2fn7"><bold>g</bold></xref></sup></td><td align="char" char="." 
valign="top">0.794&#x2010;1.000</td><td align="left" valign="top">19.7<bold><sup><xref ref-type="table-fn" rid="table2fn7">g</xref></sup></bold></td></tr><tr><td align="left" valign="top" colspan="3">Frequency encoding</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>SMOTE</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LGR</td><td align="char" char="." valign="top">0.604 (0.094)</td><td align="char" char="." valign="top">0.433&#x2010;0.660</td><td align="char" char="." valign="top">0.502 (0.052)</td><td align="char" char="." valign="top">0.188&#x2010;0.518</td><td align="char" char="." valign="top">0.565 (0.058)</td><td align="char" char="." valign="top">0.493&#x2010;0.582</td><td align="left" valign="top">0.110</td></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>CAT</td><td align="char" char="." valign="top">0.619 (0.079)</td><td align="char" char="." valign="top">0.479&#x2010;0.646</td><td align="char" char="." valign="top">0.600 (0.033)</td><td align="char" char="." valign="top">0.375&#x2010;0.627</td><td align="char" char="." valign="top">0.613 (0.042)</td><td align="char" char="." 
valign="top">0.498&#x2010;0.692</td><td align="left" valign="top">106.0</td></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LightGBM</td><td align="char" char="." valign="top">0.619 (0.102)</td><td align="char" char="." valign="top">[0.321&#x2010;0.630]</td><td align="char" char="." valign="top">0.490 (0.086)</td><td align="char" char="." valign="top">[0.304&#x2010;0.580]</td><td align="char" char="." valign="top">0.602 (0.069)</td><td align="char" char="." valign="top">0.515&#x2010;0.663</td><td align="left" valign="top">15.0</td></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>ROSE</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RF</td><td align="char" char="." valign="top">0.896 (0.158)</td><td align="char" char="." valign="top">0.666&#x2010;1.000</td><td align="char" char="." valign="top">0.867 (0.206)</td><td align="char" char="." valign="top">0.553&#x2010;1.000</td><td align="char" char="." valign="top">0.899 (0.194)</td><td align="char" char="." valign="top">0.623&#x2010;1.000</td><td align="left" valign="top">19.7</td></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>CAT</td><td align="char" char="." 
valign="top">0.886 (0.127)</td><td align="char" char="." valign="top">0.678&#x2010;1.000</td><td align="char" char="." valign="top">0.864 (0.150)</td><td align="char" char="." valign="top">0.694&#x2010;1.000</td><td align="char" char="." valign="top">0.913 (0.140)</td><td align="char" char="." valign="top">0.690&#x2010;1.000</td><td align="left" valign="top">329.0</td></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LightGBM</td><td align="char" char="." valign="top">0.870 (0.145)</td><td align="char" char="." valign="top">0.708&#x2010;1.000</td><td align="char" char="." valign="top">0.825 (0.213)</td><td align="char" char="." valign="top">0.576&#x2010;1.000</td><td align="char" char="." valign="top">0.886 (0.175)</td><td align="char" char="." valign="top">0.707&#x2010;1.000</td><td align="left" valign="top">19.9</td></tr><tr><td align="left" valign="top" colspan="3">Target encoding</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>SMOTE</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LGR</td><td align="char" char="." valign="top">0.621 (0.044)</td><td align="char" char="." 
valign="top">0.324&#x2010;0.684</td><td align="char" char="." valign="top">0.507 (0.102)</td><td align="char" char="." valign="top">0.213&#x2010;0.567</td><td align="char" char="." valign="top">0.576 (0.055)</td><td align="char" char="." valign="top">0.497&#x2010;0.587</td><td align="left" valign="top">0.110</td></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>CAT</td><td align="char" char="." valign="top">0.648 (0.038)</td><td align="char" char="." valign="top">0.477&#x2010;0.649</td><td align="char" char="." valign="top">0.600 (0.053)</td><td align="char" char="." valign="top">0.266&#x2010;0.662</td><td align="char" char="." valign="top">0.624 (0.030)</td><td align="char" char="." valign="top">0.492&#x2010;0.687</td><td align="left" valign="top">106.0</td></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LightGBM</td><td align="char" char="." valign="top">0.646 (0.031)</td><td align="char" char="." valign="top">0.464&#x2010;0.638</td><td align="char" char="." valign="top">0.515 (0.182)</td><td align="char" char="." valign="top">0.242&#x2010;0.708</td><td align="char" char="." valign="top">0.605 (0.058)</td><td align="char" char="." 
valign="top">0.512&#x2010;0.671</td><td align="left" valign="top">14.8</td></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>ROSE</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RF</td><td align="char" char="." valign="top">0.882 (0.184)</td><td align="char" char="." valign="top">0.565&#x2010;1.000</td><td align="char" char="." valign="top">0.866 (0.202)</td><td align="char" char="." valign="top">0.553&#x2010;1.000</td><td align="char" char="." valign="top">0.893 (0.204)</td><td align="char" char="." valign="top">0.615&#x2010;1.000</td><td align="left" valign="top">19.4</td></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>CAT</td><td align="char" char="." valign="top">0.876 (0.131)</td><td align="char" char="." valign="top">0.641&#x2010;1.000</td><td align="char" char="." valign="top">0.828 (0.214)</td><td align="char" char="." valign="top">0.651&#x2010;1.000</td><td align="char" char="." valign="top">0.900 (0.159)</td><td align="char" char="." valign="top">0.638&#x2010;1.000</td><td align="left" valign="top">285.0</td></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LightGBM</td><td align="char" char="." 
valign="top">0.891 (0.155)</td><td align="char" char="." valign="top">0.669&#x2010;1.000</td><td align="char" char="." valign="top">0.850 (0.225)</td><td align="char" char="." valign="top">0.602&#x2010;1.000</td><td align="char" char="." valign="top">0.889 (0.178)</td><td align="char" char="." valign="top">0.634&#x2010;1.000</td><td align="left" valign="top">19.5</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>AUC: area under the ROC curve.</p></fn><fn id="table2fn2"><p><sup>b</sup>SMOTE: Synthetic Minority Over-sampling Technique.</p></fn><fn id="table2fn3"><p><sup>c</sup>RF: random forest.</p></fn><fn id="table2fn4"><p><sup>d</sup>CAT: CatBoost.</p></fn><fn id="table2fn5"><p><sup>e</sup>LightGBM: light gradient boosting machine.</p></fn><fn id="table2fn6"><p><sup>f</sup>ROSE: Random Over Sampling Examples.</p></fn><fn id="table2fn7"><p><sup>g</sup>This method has the highest model accuracy and a small standard deviation, indicating that it has a relatively good ability to make judgments.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Evaluation metrics of remodeling with the top 15 most important variables, using one-hot encoding and SMOTE or ROSE.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom" colspan="2">ETL and machine learning methods</td><td align="left" valign="bottom" colspan="2">Accuracy</td><td align="left" valign="bottom" colspan="2"><italic>F</italic><sub>1</sub>-score</td><td align="left" valign="bottom" colspan="2">AUC<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td><td align="left" valign="bottom">Runtime (sec)</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="2"/><td align="left" valign="top">mean (SD)</td><td align="left" valign="top">95% CI</td><td align="left" valign="top">mean (SD)</td><td align="left" valign="top">mean 
(SD)</td><td align="left" valign="top">95% CI</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top" colspan="2">SMOTE<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LGR<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup></td><td align="char" char="." valign="top">0.723 (0.045)</td><td align="char" char="." valign="top">0.668&#x2010;0.779</td><td align="char" char="." valign="top">0.680 (0.050)</td><td align="char" char="." valign="top">0.618&#x2010;0.743</td><td align="char" char="." valign="top">0.728 (0.044)</td><td align="char" char="." valign="top">0.673&#x2010;0.783</td><td align="left" valign="top">0.08</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>DT<sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup></td><td align="char" char="." valign="top">0.636 (0.035)</td><td align="char" char="." valign="top">0.592&#x2010;0.680</td><td align="char" char="." valign="top">0.547 (0.073)</td><td align="char" char="." valign="top">0.457&#x2010;0.637</td><td align="char" char="." valign="top">0.622 (0.044)</td><td align="char" char="." valign="top">0.567&#x2010;0.676</td><td align="left" valign="top">2.97</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RF<sup><xref ref-type="table-fn" rid="table3fn5">e</xref></sup></td><td align="char" char="." valign="top">0.699 (0.027)</td><td align="char" char="." valign="top">0.665&#x2010;0.733</td><td align="char" char="." 
valign="top">0.675 (0.027)</td><td align="char" char="." valign="top">0.642&#x2010;0.709</td><td align="char" char="." valign="top">0.711 (0.020)</td><td align="char" char="." valign="top">0.687&#x2010;0.736</td><td align="left" valign="top">9.79</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>XGB<sup><xref ref-type="table-fn" rid="table3fn6">f</xref></sup></td><td align="char" char="." valign="top">0.658 (0.021)</td><td align="char" char="." valign="top">0.632&#x2010;0.684</td><td align="char" char="." valign="top">0.630 (0.052)</td><td align="char" char="." valign="top">0.566&#x2010;0.694</td><td align="char" char="." valign="top">0.670 (0.024)</td><td align="char" char="." valign="top">0.641&#x2010;0.700</td><td align="left" valign="top">2.59</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>CAT<sup><xref ref-type="table-fn" rid="table3fn7">g</xref></sup></td><td align="char" char="." valign="top">0.699 (0.044)</td><td align="char" char="." valign="top">0.644&#x2010;0.754</td><td align="char" char="." valign="top">0.703 (0.027)</td><td align="char" char="." valign="top">0.669&#x2010;0.737</td><td align="char" char="." valign="top">0.737 (0.021)</td><td align="char" char="." valign="top">0.711&#x2010;0.763</td><td align="left" valign="top">93.8</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LightGBM<sup><xref ref-type="table-fn" rid="table3fn8">h</xref></sup></td><td align="char" char="." valign="top">0.709 (0.030)</td><td align="char" char="." valign="top">0.671&#x2010;0.747</td><td align="char" char="." valign="top">0.682 (0.045)</td><td align="char" char="." valign="top">0.626&#x2010;0.738</td><td align="char" char="." valign="top">0.717 (0.034)</td><td align="char" char="." 
valign="top">0.675&#x2010;0.760</td><td align="left" valign="top">14.4</td></tr><tr><td align="left" valign="top" colspan="2">ROSE<sup><xref ref-type="table-fn" rid="table3fn9">i</xref></sup></td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LGR</td><td align="char" char="." valign="top">0.738 (0.030)</td><td align="char" char="." valign="top">0.700&#x2010;0.776</td><td align="char" char="." valign="top">0.699 (0.050)</td><td align="char" char="." valign="top">0.636&#x2010;0.762</td><td align="char" char="." valign="top">0.755 (0.050)</td><td align="char" char="." valign="top">0.694&#x2010;0.817</td><td align="left" valign="top">0.08</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>DT</td><td align="char" char="." valign="top">0.755 (0.089)</td><td align="char" char="." valign="top">0.644&#x2010;0.866</td><td align="char" char="." valign="top">0.703 (0.126)</td><td align="char" char="." valign="top">0.547&#x2010;0.860</td><td align="char" char="." valign="top">0.750 (0.100)</td><td align="char" char="." valign="top">0.626&#x2010;0.875</td><td align="left" valign="top">2.96</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RF</td><td align="char" char="." valign="top">0.891 (0.131)</td><td align="char" char="." valign="top">0.728&#x2010;1.000</td><td align="char" char="." valign="top">0.879 (0.133)</td><td align="char" char="." valign="top">0.713&#x2010;1.000</td><td align="char" char="." 
valign="top">0.915 (0.133)<sup><xref ref-type="table-fn" rid="table3fn10">j</xref></sup></td><td align="char" char="." valign="top">0.750&#x2010;1.000</td><td align="left" valign="top">9.05</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>XGB</td><td align="char" char="." valign="top">0.913 (0.122)<sup><xref ref-type="table-fn" rid="table3fn10">j</xref></sup></td><td align="char" char="." valign="top">0.762&#x2010;1.000</td><td align="char" char="." valign="top">0.899 (0.138)<sup><xref ref-type="table-fn" rid="table3fn10">j</xref></sup></td><td align="char" char="." valign="top">0.728&#x2010;1.000</td><td align="char" char="." valign="top">0.906 (0.137)</td><td align="char" char="." valign="top">0.736&#x2010;1.000</td><td align="left" valign="top">3.28</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>CAT</td><td align="char" char="." valign="top">0.862 (0.073)</td><td align="char" char="." valign="top">0.771&#x2010;0.952</td><td align="char" char="." valign="top">0.831 (0.092)</td><td align="char" char="." valign="top">0.717&#x2010;0.945</td><td align="char" char="." valign="top">0.896 (0.092)</td><td align="char" char="." valign="top">0.782&#x2010;1.000</td><td align="left" valign="top">270<sup><xref ref-type="table-fn" rid="table3fn10">j</xref></sup></td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LightGBM</td><td align="char" char="." valign="top">0.906 (0.115)<sup><xref ref-type="table-fn" rid="table3fn10">j</xref></sup></td><td align="char" char="." valign="top">0.763&#x2010;1.000</td><td align="char" char="." valign="top">0.892 (0.119)<sup><xref ref-type="table-fn" rid="table3fn10">j</xref></sup></td><td align="char" char="." 
valign="top">0.745&#x2010;1.000</td><td align="char" char="." valign="top">0.925 (0.112)<sup><xref ref-type="table-fn" rid="table3fn10">j</xref></sup></td><td align="char" char="." valign="top">0.786&#x2010;1.000</td><td align="left" valign="top">19.2</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>AUC: area under the ROC curve.</p></fn><fn id="table3fn2"><p><sup>b</sup>SMOTE: Synthetic Minority Over-sampling Technique.</p></fn><fn id="table3fn3"><p><sup>c</sup>LGR: logistic regression.</p></fn><fn id="table3fn4"><p><sup>d</sup>DT: decision tree.</p></fn><fn id="table3fn5"><p><sup>e</sup>RF: random forest.</p></fn><fn id="table3fn6"><p><sup>f</sup>XGB: extreme gradient boosting.</p></fn><fn id="table3fn7"><p><sup>g</sup>CAT: CatBoost</p></fn><fn id="table3fn8"><p><sup>h</sup>LightGBM: light gradient boosting machine</p></fn><fn id="table3fn9"><p><sup>i</sup>ROSE: random over sampling example.</p></fn><fn id="table3fn10"><p><sup>j</sup>This method's model results have the highest AUC and a small standard deviation, indicating a relatively good ability to make judgments.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s2-9"><title>Performance Estimation and Statistical Testing</title><p>Within each dataset, model performance was summarized from 5-fold cross-validation. For each metric (AUC, accuracy, and <italic>F</italic><sub>1</sub>-score), we calculated 95% CIs from the 5-fold&#x2013;specific estimates using a 2-sided Student t interval. This procedure corresponds to applying the <italic>t.test</italic> function in R to the vector of fold-level values. 
Because these metrics are bounded between 0 and 1, upper limits were truncated at 1.0 when necessary, and models with zero variance across folds yield a point interval at the mean.</p><p>For pairwise AUC comparisons within the same dataset, we applied the nonparametric DeLong test to out-of-fold predictions pooled across the 5 validation folds, using the <italic>roc.test</italic> function from the <italic>pROC</italic> package. We report 2-sided <italic>P</italic> values; all tests are conducted within the dataset, and we do not compute pooled AUCs or conduct between-dataset hypothesis tests.</p></sec><sec id="s2-10"><title>Model Refinement and Variable Selection-Phase 2</title><p>In the second phase, the AUC [<xref ref-type="bibr" rid="ref36">36</xref>] was used to evaluate the models from the first phase. For the optimal data processing method and the corresponding best-performing ML model, a detailed scoring and ranking of variables was conducted based on their importance and contribution to the model&#x2019;s predictive performance.</p><p>The variables were converted into percentile rankings, with the most important variable assigned a score of 100, while the least important variable was assigned a score of 0. Using the 5-fold cross-validation approach, the scores from 5 iterations were summed and ranked, with the highest possible score being 500. Based on this scoring system, the top 15 highest-ranked variables were selected for remodeling, allowing for a more focused and in-depth analysis of their impact on the outcomes.</p></sec><sec id="s2-11"><title>Ethical Considerations</title><p>This study was reviewed and approved by the Institutional Review Board of Shin-Kong Wu Ho-Su Memorial Hospital (IRB No. 20231101R; approval date: December 14, 2023). The requirement for informed consent was waived by the ethics committee due to the retrospective nature of the study and the use of deidentified data. 
All procedures were conducted in accordance with the ethical standards of the responsible institutional and national committees on human experimentation and with the principles of the Declaration of Helsinki. The privacy and confidentiality of all participants were strictly protected throughout the study, and no personally identifiable information was disclosed. No compensation was provided to participants.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Phase 1</title><p>In this phase, we systematically evaluated the predictive performance of multiple ML models, including logistic regression, decision trees, random forests, XGBoost, CatBoost, and LightGBM, using 5-fold cross-validation. The primary objective was to determine the most effective combination of encoding methods (one-hot encoding, frequency encoding, and target encoding) and data imbalance handling techniques (ROSE and SMOTE) for predicting MACE.</p><p>Models were evaluated by accuracy, <italic>F</italic><sub>1</sub>-score, and AUC with 5-fold cross-validation, and <xref ref-type="table" rid="table2">Table 2</xref> reports the fold mean, SD, and t-distribution&#x2013;based 95% CIs for each combination. Among all pipelines, the OneHotE_ROSE&#x2013;LightGBM model achieved the best overall performance, with mean accuracy of 0.932 (SD 0.112; 95% CI 0.759&#x2010;1.000), <italic>F</italic><sub>1</sub>-score of 0.918 (SD 0.137; 95% CI 0.754&#x2010;1.000), and AUC of 0.940 (SD 0.116; 95% CI 0.794&#x2010;1.000). Frequency and target encoding under ROSE also performed strongly (AUC 0.913 and 0.900, respectively), but at a slightly lower level than one-hot encoding in the same setting.</p><p>To formally assess differences between preprocessing strategies, we performed pairwise DeLong tests on out-of-fold AUCs, with results summarized in Table S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. 
Most ROSE pipelines showed significantly higher AUC than their SMOTE counterparts (all <italic>P</italic>&#x2264;.002), and OneHotE_ROSE in particular was markedly superior to OneHotE_SMOTE. Within the ROSE group, OneHotE_ROSE, FreqE_ROSE, and TargetE_ROSE did not differ significantly from each other (<italic>P</italic>&#x2265;.67), indicating a top-performing cluster. Together with the averaged ROC curves in <xref ref-type="fig" rid="figure3">Figures 3</xref><xref ref-type="fig" rid="figure4"/>-<xref ref-type="fig" rid="figure5">5</xref>, these results support selecting one-hot encoding with ROSE as the primary preprocessing strategy for Phase 2.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Comparison of the average area under the ROC curve (AUC) of different data models. AUC: area under the ROC curve; ROSE: Random Over Sampling Example; SMOTE: Synthetic Minority Over-sampling Technique.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e75655_fig03.png"/></fig><p>A direct comparison of encoding methods in <xref ref-type="fig" rid="figure4">Figure 4</xref> further confirmed that one-hot encoding was the most effective approach, achieving the highest average AUC (0.78), outperforming both frequency encoding and target encoding (both at 0.67). Meanwhile, <xref ref-type="fig" rid="figure5">Figure 5</xref> illustrates that ROSE significantly outperformed SMOTE, achieving an average AUC of 0.83, compared to 0.59 for SMOTE. This result reinforces the conclusion that ROSE generates more representative synthetic samples, preserves the original data distribution more effectively, and enhances model generalization.</p><p>Our findings strongly suggest that one-hot encoding combined with ROSE is the most effective preprocessing strategy for this predictive task. 
This combination preserves categorical feature integrity while mitigating class imbalance, leading to the most robust and accurate predictive models. In contrast, SMOTE not only struggled to enhance model performance but, in some cases, even contributed to its degradation. These insights guided the selection of the optimal model for further feature importance analysis in Phase 2.</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Comparison of the average area under the ROC curve (AUC) of different encoding methods. AUC: area under the ROC curve.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e75655_fig04.png"/></fig><fig position="float" id="figure5"><label>Figure 5.</label><caption><p>Comparison of the average area under the ROC curve (AUC) of different imbalance methods. AUC: area under the ROC curve; ROSE: Random Over Sampling Example; SMOTE: Synthetic Minority Over-sampling Technique.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e75655_fig05.png"/></fig></sec><sec id="s3-2"><title>Phase 2</title><p>Following the identification of the optimal preprocessing strategy in Phase 1, we conducted a detailed analysis of feature importance and remodeled the dataset using the most influential variables. <xref ref-type="table" rid="table4">Table 4</xref> presents the top 15 most important variables ranked based on their cumulative scores from 5-fold cross-validation, with antiplatelet, chest X-ray for aortic arch calcification (CXR.AoAC).0, and insulin emerging as the most significant predictors. 
These variables were selected to refine the models, ensuring a more efficient and interpretable predictive framework.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>The top 15 variables ranked by importance after 5-fold cross-validation.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Ranking</td><td align="left" valign="bottom">Feature</td><td align="left" valign="bottom">Score<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top">1</td><td align="left" valign="top">Antiplatelet</td><td align="char" char="." valign="top">496</td></tr><tr><td align="left" valign="top">2</td><td align="left" valign="top">CXR.AoAC<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup>.0</td><td align="char" char="." valign="top">466</td></tr><tr><td align="left" valign="top">3</td><td align="left" valign="top">Insulin</td><td align="char" char="." valign="top">453</td></tr><tr><td align="left" valign="top">4</td><td align="left" valign="top">No.of.hypotension.episodes</td><td align="char" char="." valign="top">435</td></tr><tr><td align="left" valign="top">5</td><td align="left" valign="top">CXR.AoAC.2</td><td align="char" char="." valign="top">434</td></tr><tr><td align="left" valign="top">6</td><td align="left" valign="top">Dyslipid</td><td align="char" char="." valign="top">410</td></tr><tr><td align="left" valign="top">7</td><td align="left" valign="top">DM<sup><xref ref-type="table-fn" rid="table4fn3">c</xref></sup></td><td align="char" char="." valign="top">392</td></tr><tr><td align="left" valign="top">8</td><td align="left" valign="top">AVA<sup><xref ref-type="table-fn" rid="table4fn4">d</xref></sup></td><td align="char" char="." valign="top">378</td></tr><tr><td align="left" valign="top">9</td><td align="left" valign="top">AR<sup><xref ref-type="table-fn" rid="table4fn5">e</xref></sup>.0</td><td align="char" char="." 
valign="top">367</td></tr><tr><td align="left" valign="top">10</td><td align="left" valign="top">AV.cal<sup><xref ref-type="table-fn" rid="table4fn6">f</xref></sup></td><td align="char" char="." valign="top">348</td></tr><tr><td align="left" valign="top">11</td><td align="left" valign="top">PAOD<sup><xref ref-type="table-fn" rid="table4fn7">g</xref></sup></td><td align="char" char="." valign="top">337</td></tr><tr><td align="left" valign="top">12</td><td align="left" valign="top">LVH<sup><xref ref-type="table-fn" rid="table4fn8">h</xref></sup>.type.1</td><td align="char" char="." valign="top">302</td></tr><tr><td align="left" valign="top">13</td><td align="left" valign="top">Calcitriol</td><td align="char" char="." valign="top">299</td></tr><tr><td align="left" valign="top">14</td><td align="left" valign="top">AS<sup><xref ref-type="table-fn" rid="table4fn9">i</xref></sup>.0</td><td align="char" char="." valign="top">295</td></tr><tr><td align="left" valign="top">15</td><td align="left" valign="top">Statin</td><td align="char" char="." valign="top">292</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>The score calculation method is as follows: in the 5-fold cross-validation, the fields are sorted according to their importance, with the highest being 100 and the lowest being 0. 
The scores from the 5 folds are then summed to obtain the final score.</p></fn><fn id="table4fn2"><p><sup>b</sup>CXR_AoAC: chest X-ray for aortic arch calcification.</p></fn><fn id="table4fn3"><p><sup>c</sup>DM: diabetes mellitus.</p></fn><fn id="table4fn4"><p><sup>d</sup>AVA: aortic valve area.</p></fn><fn id="table4fn5"><p><sup>e</sup>AR: aortic regurgitation.</p></fn><fn id="table4fn6"><p><sup>f</sup>AV cal: aortic valve calcification.</p></fn><fn id="table4fn7"><p><sup>g</sup>PAOD: peripheral arterial occlusive disease.</p></fn><fn id="table4fn8"><p><sup>h</sup>LVH: left ventricular hypertrophy.</p></fn><fn id="table4fn9"><p><sup>i</sup>AS: aortic stenosis.</p></fn></table-wrap-foot></table-wrap><p>The high-performing ML models from Phase 2 identified a set of correlated features predictive of MACE in patients with ESRD, which can be grouped into 3 main clusters, including structural predictors of MACE, systemic burdens with traditional risk factors, and other indicators of disease severity. The corresponding feature importance profile for the Phase 2 LightGBM model is visualized in Figure S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. The first group includes aortic arch calcification statuses (CXR.AoAC.0 and CXR.AoAC.2), and concentric LVH (LVH.type.1). These features reflect the model&#x2019;s ability to capture structural markers representative of the broader spectrum of chronic kidney disease&#x2013;mineral and bone disorder pathology. Moderate AoAC and AV calcification signal cumulative metabolic and inflammatory insults from longstanding phosphate imbalance, high uremic toxin burden, and chronic inflammation [<xref ref-type="bibr" rid="ref37">37</xref>-<xref ref-type="bibr" rid="ref39">39</xref>], while the absence of calcification indicates a protective cardiovascular profile to MACE risk. 
Additionally, concentric LVH has long been recognized as an independent predictor of MACE and all-cause mortality in patients with ESRD [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref40">40</xref>], which results from chronic pressure overload, often due to arterial stiffness secondary to vascular calcification. Collectively, these structural changes contribute to hemodynamic consequences such as heart failure and fatal arrhythmias, thereby resulting in higher MACE risk [<xref ref-type="bibr" rid="ref41">41</xref>].</p><p>The second group of predictors comprises DM, dyslipidemia, and PAOD. DM and dyslipidemia are well-established contributors to atherosclerosis and are exacerbated in ESRD [<xref ref-type="bibr" rid="ref42">42</xref>,<xref ref-type="bibr" rid="ref43">43</xref>], forming a vicious cycle with both ESRD-specific and traditional risk factors. For example, hyperglycemia promotes the accumulation of advanced glycation end-products due to impaired clearance, leading to endothelial dysfunction, oxidative stress, and inflammation, thereby fueling the atherosclerosis process [<xref ref-type="bibr" rid="ref44">44</xref>]. PAOD, a direct manifestation of exacerbated systemic atherosclerosis, also shares similar underlying pathophysiology. Despite not being an isolated predictor, PAOD&#x2019;s strong association with higher MACE risk [<xref ref-type="bibr" rid="ref45">45</xref>] makes it a valuable proxy for assessing a patient&#x2019;s total atherosclerotic burden.</p><p>The third group includes antiplatelet, intradialytic hypotension (IDH), and valvular disease status. With a score of 496, antiplatelet use serves as a surrogate marker for clinician-identified ischemic risk and thus MACE vulnerability [<xref ref-type="bibr" rid="ref46">46</xref>]. 
Furthermore, the number of IDH episodes ranked fourth, underscoring the hemodynamic stress of dialysis, with cumulative episodes contributing to myocardial stunning and ischemic damage, thus in turn linked to MACE [<xref ref-type="bibr" rid="ref47">47</xref>]. Tracking IDH frequency could offer a more comprehensive view of cardiovascular stress rather than focusing on isolated episodes as well as the episode severity. In addition, the absence of valvular diseases such as AS or aortic regurgitation reflects a mitigated risk of LVH and atherosclerosis progression [<xref ref-type="bibr" rid="ref48">48</xref>]. The Phase 2 model effectively learned patterns of clinical practice&#x2014;such as medication use and hemodynamic instability&#x2014;offering real-world applicability and insights that extend beyond static risk markers to isolated hypotensive events.</p><p>The models were then retrained with one-hot encoding combined with either ROSE or SMOTE, and their performance was evaluated by accuracy, <italic>F</italic><sub>1</sub>-score, and AUC with 5-fold cross-validation (<xref ref-type="table" rid="table3">Table 3</xref>). Among the ROSE-based models, LightGBM and XGBoost achieved the highest discrimination, with LightGBM reaching accuracy 0.906 (SD 0.115; 95% CI 0.763&#x2010;1.000), <italic>F</italic><sub>1</sub>-score 0.892 (SD 0.119; 95% CI 0.745&#x2010;1.000), and AUC 0.925 (SD 0.112; 95% CI 0.786&#x2010;1.000), and XGBoost reaching accuracy 0.913 (SD 0.122; 95% CI 0.762&#x2010;1.000), <italic>F</italic><sub>1</sub>-score 0.899 (SD 0.138; 95% CI 0.728&#x2010;1.000), and AUC 0.906 (SD 0.137; 95% CI 0.736&#x2010;1.000). Random forest under ROSE also performed well (AUC 0.915; 95% CI 0.750&#x2010;1.000), confirming that ensemble methods benefited most from the refined feature set.</p><p>To place these gains in context, we also examined the traditional logistic regression (LGR) baseline in the same table. 
LGR trained in only 0.08 s for both ROSE and SMOTE variants, confirming its computational efficiency. However, its discrimination lagged behind the ML models, with AUC values of 0.755 (ROSE) and 0.728 (SMOTE). This contrast highlights a clear trade-off: while LGR offers near-instantaneous runtime, the proposed ML pipeline delivers markedly stronger predictive accuracy, justifying its added complexity for clinical applications.</p><p>To validate the statistical significance of these differences, we conducted DeLong tests for all model pairs, with detailed <italic>P</italic> values reported in Table S3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. ROSE-based ensembles (LightGBM, XGBoost, and random forest) achieved significantly higher AUCs than all SMOTE-based models (all <italic>P</italic>&#x2264;.022), while differences among the ROSE ensembles themselves were small and often not statistically significant. In contrast, the logistic regression baseline under ROSE achieved AUC 0.755 (SD 0.050; 95% CI 0.694&#x2010;0.817), clearly lagging behind the gradient boosting models despite its very short training time. These findings confirm that combining ROSE with gradient-boosted ensembles yields the most stable and discriminative performance in the Phase 2 setting.</p><p>These findings highlight the robustness of ROSE in preserving critical data characteristics, while the selection of the top 15 features enhanced model efficiency without compromising predictive accuracy. 
The combination of ROSE with XGBoost and LightGBM yielded the most reliable performance, reinforcing the effectiveness of targeted feature selection and imbalance handling in MACE prediction.</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Results</title><p>To assess cross-dataset portability and reproducibility, we applied the integrated preprocessing and modeling framework to 2 datasets with distinct data-generating mechanisms: an in-house ESRD registry (n=412) and the population-based BRFSS 2015 survey. Cross-dataset comparisons of model performance, encoding strategies, and imbalance-handling methods are summarized in <xref ref-type="fig" rid="figure6">Figures 6</xref><xref ref-type="fig" rid="figure7"/>-<xref ref-type="fig" rid="figure8">8</xref>. For each dataset, the full workflow was executed independently, including missForest imputation, one-hot, frequency, target encoding, ROSE or SMOTE resampling, model tuning, and evaluation, with no cross-dataset feature matching and no pooled AUC computations. The framework yielded improved discrimination and clinically interpretable feature-importance rankings in both settings.</p><fig position="float" id="figure6"><label>Figure 6.</label><caption><p>Comparison of the average area under the ROC curve (AUC) of different data models. AUC: area under the ROC curve; ROSE: Random Over Sampling Example; SMOTE: Synthetic Minority Over-sampling Technique.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e75655_fig06.png"/></fig><fig position="float" id="figure7"><label>Figure 7.</label><caption><p>Comparison of the average area under the ROC curve (AUC) of different encoding methods. 
AUC: area under the ROC curve; ROSE: Random Over Sampling Example; SMOTE: Synthetic Minority Over-sampling Technique.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e75655_fig07.png"/></fig><fig position="float" id="figure8"><label>Figure 8.</label><caption><p>Comparison of the average area under the ROC curve (AUC) of different imbalance methods. AUC: area under the ROC curve; ROSE: Random Over Sampling Example; SMOTE: Synthetic Minority Over-sampling Technique.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e75655_fig08.png"/></fig></sec><sec id="s4-2"><title>Limitations</title><p>Our study has several constraints that warrant consideration. The ESRD cohort includes 412 patients, which limits the capture of rare patterns; multicenter expansion is ongoing to increase diversity. Despite 5-fold cross-validation, nested grid search, and an independent BRFSS run to assess cross-dataset portability, overfitting cannot be fully excluded, so temporal and multi-site validations are planned. Although the workflow performed well for cardiovascular and metabolic endpoints, its applicability to other disease domains remains uncertain and will be explored. By design, the BRFSS analysis did not involve feature harmonization, pooled AUCs, or between-dataset hypothesis tests. 
Finally, while ROSE outperformed SMOTE in our experiments, optimal resampling and encoding choices are likely dataset dependent, and hybrid strategies and embedding-based representations merit further investigation.</p></sec><sec id="s4-3"><title>Comparison With Prior Work</title><p>In contrast to previous studies that tend to focus on isolated aspects of data preprocessing, our research offers a comprehensive, end-to-end framework that addresses the multifaceted challenges inherent in clinical data, namely high missingness, heterogeneity, and severe class imbalance. While earlier works typically investigated single techniques in a limited scope, our study integrates multiple advanced methods and validates the entire pipeline using cross-dataset comparisons. This holistic approach provides a stronger foundation for both improving predictive performance and facilitating clinical translation, marking a clear advancement over traditional, piecemeal methodologies.</p></sec><sec id="s4-4"><title>Conclusions</title><p>This study introduces a robust, generalizable, and interpretable data preprocessing framework for predicting MACEs among patients with ESRD. Although the improvements in performance metrics are incremental, they directly address key challenges inherent in clinical data heterogeneity and quality. The proposed pipeline enhances both predictive reliability and model transparency, offering practical value for clinical decision support and operational planning in nephrology care.</p><p>While system-level integration was not the primary focus of this work, the pipeline was deliberately designed for seamless adoption in real-world settings. By using routinely available clinical variables and lightweight computational procedures, it can be incorporated into existing analytics environments with minimal architectural adjustment. 
This design supports efficient deployment across diverse institutions and datasets, as evidenced by the consistent performance observed in both the ESRD registry and the BRFSS dataset. Such scalability reinforces its translational potential for broader clinical and public health applications.</p><p>Future work should prioritize cross-context application across more heterogeneous populations and evaluate the integration of deep learning architectures to further enhance predictive accuracy and extend applicability to other high-risk patient groups and chronic disease domains.</p></sec></sec></body><back><notes><sec><title>Funding</title><p>This research was supported by the National Science and Technology Council, Taiwan, under grant NSTC112-2221-E-030-009-MY3; Shin-Kong Wu Ho-Su Memorial Hospital under grant 2024SKHADR038; and Fu Jen Catholic University under grant A0114250. The authors are deeply grateful for the generous support provided by these institutions.</p></sec></notes><fn-group><fn fn-type="con"><p>Conceptualization, MC and C-KW; methodology, C-WC and MC; software, C-WC; validation, MC, B-CS and C-HW; formal analysis, C-WC; investigation, C-WC and C-HW; resources, C-KW, B-CS and C-WC; data curation, C-WC, C-KW; writing&#x2014;original draft preparation, C-WC and C-KW; writing&#x2014;review and editing, MC; visualization, C-WC; project administration, MC; funding acquisition, B-CS and C-HW. All authors have read and agreed to the published version of the manuscript.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AS</term><def><p>aortic stenosis</p></def></def-item><def-item><term id="abb2">AUC</term><def><p>area under the ROC curve</p></def></def-item><def-item><term id="abb3">BRFSS</term><def><p>Behavioral Risk Factor Surveillance System</p></def></def-item><def-item><term id="abb4">CAT</term><def><p>CatBoost</p></def></def-item><def-item><term 
id="abb5">CXR_AoAC</term><def><p>chest X-ray for aortic arch calcification</p></def></def-item><def-item><term id="abb6">DM</term><def><p>diabetes mellitus</p></def></def-item><def-item><term id="abb7">DT</term><def><p>decision tree</p></def></def-item><def-item><term id="abb8">ESRD</term><def><p>end-stage renal disease</p></def></def-item><def-item><term id="abb9">IDH</term><def><p>intradialytic hypotension</p></def></def-item><def-item><term id="abb10">LGR</term><def><p>logistic regression</p></def></def-item><def-item><term id="abb11">LightGBM</term><def><p>light gradient-boosting machine</p></def></def-item><def-item><term id="abb12">LVH</term><def><p>left ventricular hypertrophy</p></def></def-item><def-item><term id="abb13">MACE</term><def><p>Major Adverse Cardiovascular Events</p></def></def-item><def-item><term id="abb14">ML</term><def><p>machine learning</p></def></def-item><def-item><term id="abb15">OOF</term><def><p>out-of-fold</p></def></def-item><def-item><term id="abb16">PAOD</term><def><p>peripheral arterial occlusive disease</p></def></def-item><def-item><term id="abb17">ROS</term><def><p>Random Over-Sampling</p></def></def-item><def-item><term id="abb18">ROSE</term><def><p>Random Over-Sampling Examples</p></def></def-item><def-item><term id="abb19">SMOTE</term><def><p>Synthetic Minority Over-sampling Technique</p></def></def-item><def-item><term id="abb20">XGB</term><def><p>extreme gradient boosting (XGBoost)</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Olisah</surname><given-names>CC</given-names> </name><name name-style="western"><surname>Smith</surname><given-names>L</given-names> </name><name name-style="western"><surname>Smith</surname><given-names>M</given-names> </name></person-group><article-title>Diabetes mellitus prediction and diagnosis from a data preprocessing 
and machine learning perspective</article-title><source>Comput Methods Programs Biomed</source><year>2022</year><month>06</month><volume>220</volume><fpage>106773</fpage><pub-id pub-id-type="doi">10.1016/j.cmpb.2022.106773</pub-id><pub-id pub-id-type="medline">35429810</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Guo</surname><given-names>LL</given-names> </name><name name-style="western"><surname>Pfohl</surname><given-names>SR</given-names> </name><name name-style="western"><surname>Fries</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Systematic review of approaches to preserve machine learning performance in the presence of temporal dataset shift in clinical medicine</article-title><source>Appl Clin Inform</source><year>2021</year><month>08</month><volume>12</volume><issue>4</issue><fpage>808</fpage><lpage>815</lpage><pub-id pub-id-type="doi">10.1055/s-0041-1735184</pub-id><pub-id pub-id-type="medline">34470057</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="report"><person-group person-group-type="author"><name name-style="western"><surname>Anderson</surname><given-names>K</given-names> </name></person-group><article-title>The role of data preprocessing in machine learning accuracy for heart disease prediction hybrid models for heart disease prediction: combining neural networks with traditional</article-title><year>2024</year><access-date>2026-03-30</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://www.researchgate.net/publication/386177388_The_Role_of_Data_Preprocessing_in_Machine_Learning_Accuracy_for_Heart_Disease_Prediction_Hybrid_Models_for_Heart_Disease_Prediction_Combining_Neural_Networks_with_Traditional">https://www.researchgate.net/publication/386177388_The_Role_of_Data_Preprocessing_in_Machine_Learning_Accuracy_for_Heart_Disease_Prediction_Hybrid_Models_for_Heart_Disease_Prediction_Combining_Neural_Networks_with_Traditional</ext-link></comment></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rahman</surname><given-names>A</given-names> </name></person-group><article-title>Statistics-based data preprocessing methods and machine learning algorithms for big data analysis</article-title><source>Int J Artif Intell</source><year>2019</year><access-date>2026-03-30</access-date><volume>17</volume><fpage>44</fpage><lpage>65</lpage><comment><ext-link ext-link-type="uri" xlink:href="http://www.ceser.in/ceserp/index.php/ijai/article/view/6253">http://www.ceser.in/ceserp/index.php/ijai/article/view/6253</ext-link></comment></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Huang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Galal</surname><given-names>G</given-names> </name><name name-style="western"><surname>Etemadi</surname><given-names>M</given-names> </name><name name-style="western"><surname>Vaidyanathan</surname><given-names>M</given-names> </name></person-group><article-title>Evaluation and mitigation of racial bias in clinical machine learning models: scoping review</article-title><source>JMIR Med Inform</source><year>2022</year><month>05</month><day>31</day><volume>10</volume><issue>5</issue><fpage>e36388</fpage><pub-id pub-id-type="doi">10.2196/36388</pub-id><pub-id 
pub-id-type="medline">35639450</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hassler</surname><given-names>AP</given-names> </name><name name-style="western"><surname>Menasalvas</surname><given-names>E</given-names> </name><name name-style="western"><surname>Garc&#x00ED;a-Garc&#x00ED;a</surname><given-names>FJ</given-names> </name><name name-style="western"><surname>Rodr&#x00ED;guez-Ma&#x00F1;as</surname><given-names>L</given-names> </name><name name-style="western"><surname>Holzinger</surname><given-names>A</given-names> </name></person-group><article-title>Importance of medical data preprocessing in predictive modeling and risk factor discovery for the frailty syndrome</article-title><source>BMC Med Inform Decis Mak</source><year>2019</year><month>02</month><day>18</day><volume>19</volume><issue>1</issue><fpage>33</fpage><pub-id pub-id-type="doi">10.1186/s12911-019-0747-6</pub-id><pub-id pub-id-type="medline">30777059</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chang</surname><given-names>W</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Xiao</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>A machine-learning-based prediction method for hypertension outcomes based on medical data</article-title><source>Diagnostics (Basel)</source><year>2019</year><month>11</month><day>7</day><volume>9</volume><issue>4</issue><fpage>178</fpage><pub-id pub-id-type="doi">10.3390/diagnostics9040178</pub-id><pub-id pub-id-type="medline">31703364</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Jeong</surname><given-names>B</given-names> </name><name name-style="western"><surname>Cho</surname><given-names>H</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Comparison between statistical models and machine learning methods on classification for highly imbalanced multiclass kidney data</article-title><source>Diagnostics (Basel)</source><year>2020</year><month>06</month><day>18</day><volume>10</volume><issue>6</issue><fpage>415</fpage><pub-id pub-id-type="doi">10.3390/diagnostics10060415</pub-id><pub-id pub-id-type="medline">32570782</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sidey-Gibbons</surname><given-names>JAM</given-names> </name><name name-style="western"><surname>Sidey-Gibbons</surname><given-names>CJ</given-names> </name></person-group><article-title>Machine learning in medicine: a practical introduction</article-title><source>BMC Med Res Methodol</source><year>2019</year><month>03</month><day>19</day><volume>19</volume><issue>1</issue><fpage>64</fpage><pub-id pub-id-type="doi">10.1186/s12874-019-0681-4</pub-id><pub-id pub-id-type="medline">30890124</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Garc&#x00ED;a-Vicente</surname><given-names>C</given-names> </name><name name-style="western"><surname>Chushig-Muzo</surname><given-names>D</given-names> </name><name name-style="western"><surname>Mora-Jim&#x00E9;nez</surname><given-names>I</given-names> </name><etal/></person-group><article-title>Evaluation of synthetic categorical data generation techniques for predicting cardiovascular diseases and post-hoc interpretability of the risk factors</article-title><source>Appl Sci 
(Basel)</source><year>2023</year><volume>13</volume><issue>7</issue><fpage>4119</fpage><pub-id pub-id-type="doi">10.3390/app13074119</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Khushi</surname><given-names>M</given-names> </name><name name-style="western"><surname>Shaukat</surname><given-names>K</given-names> </name><name name-style="western"><surname>Alam</surname><given-names>TM</given-names> </name><etal/></person-group><article-title>A comparative performance analysis of data resampling methods on imbalance medical data</article-title><source>IEEE Access</source><year>2021</year><volume>9</volume><fpage>109960</fpage><lpage>109975</lpage><pub-id pub-id-type="doi">10.1109/ACCESS.2021.3102399</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Xie</surname><given-names>F</given-names> </name><name name-style="western"><surname>Yuan</surname><given-names>H</given-names> </name><name name-style="western"><surname>Ning</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Deep learning for temporal data representation in electronic health records: a systematic review of challenges and methodologies</article-title><source>J Biomed Inform</source><year>2022</year><month>02</month><volume>126</volume><fpage>103980</fpage><pub-id pub-id-type="doi">10.1016/j.jbi.2021.103980</pub-id><pub-id pub-id-type="medline">34974189</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Naghipour</surname><given-names>A</given-names> </name><name name-style="western"><surname>Abbaszadeh Bavil Soflaei</surname><given-names>MR</given-names> </name><name 
name-style="western"><surname>Ghader-zefrehei</surname><given-names>M</given-names> </name></person-group><article-title>Machine learning classifiers and data synthesis techniques to tackle with highly imbalanced COVID-19 data</article-title><source>Comput Knowl Eng</source><year>2024</year><volume>7</volume><fpage>51</fpage><lpage>64</lpage><pub-id pub-id-type="doi">10.22067/cke.2024.88940.1121</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pargent</surname><given-names>F</given-names> </name><name name-style="western"><surname>Pfisterer</surname><given-names>F</given-names> </name><name name-style="western"><surname>Thomas</surname><given-names>J</given-names> </name><name name-style="western"><surname>Bischl</surname><given-names>B</given-names> </name></person-group><article-title>Regularized target encoding outperforms traditional methods in supervised machine learning with high cardinality features</article-title><source>Comput Stat</source><year>2022</year><month>11</month><volume>37</volume><issue>5</issue><fpage>2671</fpage><lpage>2692</lpage><pub-id pub-id-type="doi">10.1007/s00180-022-01207-6</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yu</surname><given-names>L</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>R</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>R</given-names> </name><name name-style="western"><surname>Lai</surname><given-names>KK</given-names> </name></person-group><article-title>Missing data preprocessing in credit classification: one-hot encoding or imputation?</article-title><source>Emerg Mark Finance 
Trade</source><year>2022</year><month>01</month><day>26</day><volume>58</volume><issue>2</issue><fpage>472</fpage><lpage>482</lpage><pub-id pub-id-type="doi">10.1080/1540496X.2020.1825935</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kivrak</surname><given-names>M</given-names> </name><name name-style="western"><surname>Avci</surname><given-names>U</given-names> </name><name name-style="western"><surname>Uzun</surname><given-names>H</given-names> </name><name name-style="western"><surname>Ardic</surname><given-names>C</given-names> </name></person-group><article-title>The impact of the SMOTE method on machine learning and ensemble learning performance results in addressing class imbalance in data used for predicting total testosterone deficiency in type 2 diabetes patients</article-title><source>Diagnostics (Basel)</source><year>2024</year><month>11</month><day>22</day><volume>14</volume><issue>23</issue><fpage>2634</fpage><pub-id pub-id-type="doi">10.3390/diagnostics14232634</pub-id><pub-id pub-id-type="medline">39682541</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chao</surname><given-names>CT</given-names> </name><name name-style="western"><surname>Liao</surname><given-names>MT</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>CK</given-names> </name></person-group><article-title>Left ventricular hypertrophy geometry and vascular calcification co-modify the risk of cardiovascular mortality in patients with end-stage kidney disease: a retrospective cohort study</article-title><source>J Atheroscler Thromb</source><year>2023</year><month>09</month><day>1</day><volume>30</volume><issue>9</issue><fpage>1242</fpage><lpage>1254</lpage><pub-id 
pub-id-type="doi">10.5551/jat.63870</pub-id><pub-id pub-id-type="medline">36567124</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chao</surname><given-names>CT</given-names> </name><name name-style="western"><surname>Liao</surname><given-names>MT</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>CK</given-names> </name></person-group><article-title>Combinations of valvular calcification and serum alkaline phosphatase predict cardiovascular risk among end-stage kidney disease patients</article-title><source>IJC Heart Vasculature</source><year>2024</year><month>10</month><volume>54</volume><fpage>101505</fpage><pub-id pub-id-type="doi">10.1016/j.ijcha.2024.101505</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Stekhoven</surname><given-names>DJ</given-names> </name><name name-style="western"><surname>B&#x00FC;hlmann</surname><given-names>P</given-names> </name></person-group><article-title>MissForest--non-parametric missing value imputation for mixed-type data</article-title><source>Bioinformatics</source><year>2012</year><month>01</month><day>1</day><volume>28</volume><issue>1</issue><fpage>112</fpage><lpage>118</lpage><pub-id pub-id-type="doi">10.1093/bioinformatics/btr597</pub-id><pub-id pub-id-type="medline">22039212</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Waljee</surname><given-names>AK</given-names> </name><name name-style="western"><surname>Mukherjee</surname><given-names>A</given-names> </name><name name-style="western"><surname>Singal</surname><given-names>AG</given-names> </name><etal/></person-group><article-title>Comparison of 
imputation methods for missing laboratory data in medicine</article-title><source>BMJ Open</source><year>2013</year><month>08</month><day>1</day><volume>3</volume><issue>8</issue><fpage>e002847</fpage><pub-id pub-id-type="doi">10.1136/bmjopen-2013-002847</pub-id><pub-id pub-id-type="medline">23906948</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Uyar</surname><given-names>A</given-names> </name><name name-style="western"><surname>Bener</surname><given-names>A</given-names> </name><name name-style="western"><surname>Ciray</surname><given-names>HN</given-names> </name><name name-style="western"><surname>Bahceci</surname><given-names>M</given-names> </name></person-group><article-title>A frequency based encoding technique for transformation of categorical variables in mixed IVF dataset</article-title><source>Annu Int Conf IEEE Eng Med Biol Soc</source><year>2009</year><volume>2009</volume><fpage>6214</fpage><lpage>6217</lpage><pub-id pub-id-type="doi">10.1109/IEMBS.2009.5334548</pub-id><pub-id pub-id-type="medline">19964898</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lunardon</surname><given-names>N</given-names> </name><name name-style="western"><surname>Menardi</surname><given-names>G</given-names> </name><name name-style="western"><surname>Torelli</surname><given-names>N</given-names> </name></person-group><article-title>ROSE: a package for binary imbalanced learning</article-title><source>R J</source><year>2014</year><volume>6</volume><issue>1</issue><fpage>79</fpage><pub-id pub-id-type="doi">10.32614/RJ-2014-008</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Menardi</surname><given-names>G</given-names> </name><name name-style="western"><surname>Torelli</surname><given-names>N</given-names> </name></person-group><article-title>Training and assessing classification rules with imbalanced data</article-title><source>Data Min Knowl Disc</source><year>2014</year><month>01</month><volume>28</volume><issue>1</issue><fpage>92</fpage><lpage>122</lpage><pub-id pub-id-type="doi">10.1007/s10618-012-0295-5</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chawla</surname><given-names>NV</given-names> </name><name name-style="western"><surname>Bowyer</surname><given-names>KW</given-names> </name><name name-style="western"><surname>Hall</surname><given-names>LO</given-names> </name><name name-style="western"><surname>Kegelmeyer</surname><given-names>WP</given-names> </name></person-group><article-title>SMOTE: synthetic minority over-sampling technique</article-title><source>JAIR</source><year>2002</year><volume>16</volume><fpage>321</fpage><lpage>357</lpage><pub-id pub-id-type="doi">10.1613/jair.953</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Kamalov</surname><given-names>F</given-names> </name><name name-style="western"><surname>Leung</surname><given-names>HH</given-names> </name><name name-style="western"><surname>Cherukuri</surname><given-names>AK</given-names> </name></person-group><article-title>Keep it simple: random oversampling for imbalanced data</article-title><year>2023</year><conf-name>2023 Advances in Science and Engineering Technology International Conferences (ASET)</conf-name><conf-date>Feb 20-23, 2023</conf-date><conf-loc>Dubai, United Arab Emirates</conf-loc><fpage>1</fpage><lpage>4</lpage><pub-id 
pub-id-type="doi">10.1109/ASET56582.2023.10180891</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gnip</surname><given-names>P</given-names> </name><name name-style="western"><surname>Vokorokos</surname><given-names>L</given-names> </name><name name-style="western"><surname>Drot&#x00E1;r</surname><given-names>P</given-names> </name></person-group><article-title>Selective oversampling approach for strongly imbalanced data</article-title><source>PeerJ Comput Sci</source><year>2021</year><volume>7</volume><fpage>e604</fpage><pub-id pub-id-type="doi">10.7717/peerj-cs.604</pub-id><pub-id pub-id-type="medline">34239981</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Nguyen</surname><given-names>S</given-names> </name><name name-style="western"><surname>Schumacher</surname><given-names>P</given-names> </name><name name-style="western"><surname>Olinsky</surname><given-names>A</given-names> </name><name name-style="western"><surname>Quinn</surname><given-names>J</given-names> </name></person-group><article-title>Effects of resampling techniques on imbalanced data classification: a new under-resampling method</article-title><source>Advances in Business and Management Forecasting</source><year>2021</year><publisher-name>Emerald Publishing Limited</publisher-name><fpage>51</fpage><lpage>70</lpage><pub-id pub-id-type="doi">10.1108/S1477-407020210000014005</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Berrar</surname><given-names>D</given-names> </name></person-group><article-title>Cross-validation</article-title><source>Reference Module in Life 
Sciences</source><year>2019</year><publisher-name>Elsevier</publisher-name><fpage>542</fpage><lpage>545</lpage><pub-id pub-id-type="doi">10.1016/B978-0-12-809633-8.20349-X</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Therneau</surname><given-names>T</given-names> </name><name name-style="western"><surname>Atkinson</surname><given-names>B</given-names> </name><name name-style="western"><surname>Ripley</surname><given-names>B</given-names> </name><name name-style="western"><surname>Ripley</surname><given-names>MB</given-names> </name></person-group><article-title>Package &#x2018;rpart&#x2019;</article-title><source>rpart: Recursive Partitioning and Regression Trees</source><year>2015</year><access-date>2026-03-18</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://cran.r-project.org/web/packages/rpart/rpart.pdf">https://cran.r-project.org/web/packages/rpart/rpart.pdf</ext-link></comment></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Williams</surname><given-names>GJ</given-names> </name></person-group><article-title>Rattle: a data mining GUI for R</article-title><source>R J</source><year>2009</year><volume>1</volume><issue>2</issue><fpage>45</fpage><pub-id pub-id-type="doi">10.32614/RJ-2009-016</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Breiman</surname><given-names>L</given-names> </name></person-group><article-title>Random Forests</article-title><source>Mach Learn</source><year>2001</year><month>10</month><volume>45</volume><issue>1</issue><fpage>5</fpage><lpage>32</lpage><pub-id pub-id-type="doi">10.1023/A:1010933404324</pub-id></nlm-citation></ref><ref 
id="ref32"><label>32</label><nlm-citation citation-type="report"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>T</given-names> </name><name name-style="western"><surname>He</surname><given-names>T</given-names> </name><name name-style="western"><surname>Benesty</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Xgboost: extreme gradient boosting. r package version 0.4-2</article-title><year>2015</year><access-date>2026-03-30</access-date><volume>1</volume><publisher-name>Tianqi Chen</publisher-name><fpage>1</fpage><lpage>4</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://cran.r-project.org/web/packages/xgboost/index.html">https://cran.r-project.org/web/packages/xgboost/index.html</ext-link></comment></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Dorogush</surname><given-names>AV</given-names> </name><name name-style="western"><surname>Ershov</surname><given-names>V</given-names> </name><name name-style="western"><surname>Gulin</surname><given-names>A</given-names> </name></person-group><article-title>CatBoost: gradient boosting with categorical features support</article-title><source>arXiv</source><access-date>2026-03-30</access-date><comment>Preprint posted online on  Oct 24, 2018</comment><comment><ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/1810.11363">https://arxiv.org/abs/1810.11363</ext-link></comment><pub-id pub-id-type="doi">10.48550/arXiv.1810.11363</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ke</surname><given-names>G</given-names> </name><name name-style="western"><surname>Meng</surname><given-names>Q</given-names> </name><name 
name-style="western"><surname>Finley</surname><given-names>T</given-names> </name><etal/></person-group><article-title>LightGBM: a highly efficient gradient boosting decision tree</article-title><source>Adv Neural Inf Process Syst</source><year>2017</year><access-date>2026-03-30</access-date><fpage>30</fpage><comment><ext-link ext-link-type="uri" xlink:href="https://papers.nips.cc/paper_files/paper/2017/hash/6449f44a102fde848669bdd9eb6b76fa-Abstract.html">https://papers.nips.cc/paper_files/paper/2017/hash/6449f44a102fde848669bdd9eb6b76fa-Abstract.html</ext-link></comment><pub-id pub-id-type="doi">10.5555/3294996.3295074</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kuhn</surname><given-names>M</given-names> </name></person-group><article-title>Building predictive models in R using the caret package</article-title><source>J Stat Soft</source><access-date>2026-03-30</access-date><volume>28</volume><issue>5</issue><comment><ext-link ext-link-type="uri" xlink:href="https://www.jstatsoft.org/article/view/v028i05">https://www.jstatsoft.org/article/view/v028i05</ext-link></comment></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Robin</surname><given-names>X</given-names> </name><name name-style="western"><surname>Turck</surname><given-names>N</given-names> </name><name name-style="western"><surname>Hainard</surname><given-names>A</given-names> </name><etal/></person-group><article-title>pROC: an open-source package for R and S+ to analyze and compare ROC curves</article-title><source>BMC Bioinformatics</source><year>2011</year><month>03</month><day>17</day><volume>12</volume><fpage>1</fpage><lpage>8</lpage><pub-id pub-id-type="doi">10.1186/1471-2105-12-77</pub-id><pub-id 
pub-id-type="medline">21414208</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shen</surname><given-names>Y</given-names> </name></person-group><article-title>Pathogenesis and mechanism of uremic vascular calcification</article-title><source>Cureus</source><year>2024</year><month>07</month><volume>16</volume><issue>7</issue><fpage>e64771</fpage><pub-id pub-id-type="doi">10.7759/cureus.64771</pub-id><pub-id pub-id-type="medline">39026575</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mace</surname><given-names>ML</given-names> </name><name name-style="western"><surname>Gravesen</surname><given-names>E</given-names> </name><name name-style="western"><surname>Nordholm</surname><given-names>A</given-names> </name><etal/></person-group><article-title>The calcified vasculature in chronic kidney disease secretes factors that inhibit bone mineralization</article-title><source>JBMR Plus</source><year>2022</year><month>04</month><volume>6</volume><issue>4</issue><fpage>e10610</fpage><pub-id pub-id-type="doi">10.1002/jbm4.10610</pub-id><pub-id pub-id-type="medline">35434452</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chao</surname><given-names>CT</given-names> </name><name name-style="western"><surname>Lin</surname><given-names>SH</given-names> </name></person-group><article-title>Uremic vascular calcification: the pathogenic roles and gastrointestinal decontamination of uremic toxins</article-title><source>Toxins (Basel)</source><year>2020</year><month>12</month><day>21</day><volume>12</volume><issue>12</issue><fpage>812</fpage><pub-id 
pub-id-type="doi">10.3390/toxins12120812</pub-id><pub-id pub-id-type="medline">33371477</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Huang</surname><given-names>YC</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>CK</given-names> </name></person-group><article-title>Effect of the geometry and severity of left ventricular hypertrophy on cardiovascular mortality in dialysis patients</article-title><source>Kidney Res Clin Pract</source><year>2024</year><month>09</month><day>11</day><pub-id pub-id-type="doi">10.23876/j.krcp.23.290</pub-id><pub-id pub-id-type="medline">39384352</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tomura</surname><given-names>M</given-names> </name><name name-style="western"><surname>Hamasaki</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Komaru</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Prognostic significance of concentric left ventricular hypertrophy at peritoneal dialysis initiation</article-title><source>BMC Nephrol</source><year>2021</year><month>04</month><day>16</day><volume>22</volume><issue>1</issue><fpage>135</fpage><pub-id pub-id-type="doi">10.1186/s12882-021-02321-1</pub-id><pub-id pub-id-type="medline">33863299</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Poznyak</surname><given-names>AV</given-names> </name><name name-style="western"><surname>Sadykhov</surname><given-names>NK</given-names> </name><name name-style="western"><surname>Kartuesov</surname><given-names>AG</given-names> </name><name 
name-style="western"><surname>Borisov</surname><given-names>EE</given-names> </name><name name-style="western"><surname>Sukhorukov</surname><given-names>VN</given-names> </name><name name-style="western"><surname>Orekhov</surname><given-names>AN</given-names> </name></person-group><article-title>Atherosclerosis specific features in chronic kidney disease (CKD)</article-title><source>Biomedicines</source><year>2022</year><month>08</month><day>27</day><volume>10</volume><issue>9</issue><fpage>2094</fpage><pub-id pub-id-type="doi">10.3390/biomedicines10092094</pub-id><pub-id pub-id-type="medline">36140195</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Lacount</surname><given-names>S</given-names> </name><name name-style="western"><surname>Tannock</surname><given-names>LR</given-names> </name></person-group><article-title>Dyslipidemia in chronic kidney disease</article-title><source>Endotext</source><year>2025</year><pub-id pub-id-type="medline">26247091</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Curaj</surname><given-names>A</given-names> </name><name name-style="western"><surname>Vanholder</surname><given-names>R</given-names> </name><name name-style="western"><surname>Loscalzo</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Cardiovascular consequences of uremic metabolites: an overview of the involved signaling pathways</article-title><source>Circ Res</source><year>2024</year><month>03</month><volume>134</volume><issue>5</issue><fpage>592</fpage><lpage>613</lpage><pub-id pub-id-type="doi">10.1161/CIRCRESAHA.123.324001</pub-id><pub-id pub-id-type="medline">38422175</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Smilowitz</surname><given-names>NR</given-names> </name><name name-style="western"><surname>Bhandari</surname><given-names>N</given-names> </name><name name-style="western"><surname>Berger</surname><given-names>JS</given-names> </name></person-group><article-title>Chronic kidney disease and outcomes of lower extremity revascularization for peripheral artery disease</article-title><source>Atherosclerosis</source><year>2020</year><month>03</month><volume>297</volume><fpage>149</fpage><lpage>156</lpage><pub-id pub-id-type="doi">10.1016/j.atherosclerosis.2019.12.016</pub-id><pub-id pub-id-type="medline">31948675</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Su</surname><given-names>X</given-names> </name><name name-style="western"><surname>Yan</surname><given-names>B</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Lv</surname><given-names>J</given-names> </name><name name-style="western"><surname>Cheng</surname><given-names>H</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>Y</given-names> </name></person-group><article-title>Effect of antiplatelet therapy on cardiovascular and kidney outcomes in patients with chronic kidney disease: a systematic review and meta-analysis</article-title><source>BMC Nephrol</source><year>2019</year><month>08</month><day>7</day><volume>20</volume><issue>1</issue><fpage>309</fpage><pub-id pub-id-type="doi">10.1186/s12882-019-1499-3</pub-id><pub-id pub-id-type="medline">31390997</pub-id></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sars</surname><given-names>B</given-names> </name><name 
name-style="western"><surname>van der Sande</surname><given-names>FM</given-names> </name><name name-style="western"><surname>Kooman</surname><given-names>JP</given-names> </name></person-group><article-title>Intradialytic hypotension: mechanisms and outcome</article-title><source>Blood Purif</source><year>2020</year><volume>49</volume><issue>1-2</issue><fpage>158</fpage><lpage>167</lpage><pub-id pub-id-type="doi">10.1159/000503776</pub-id><pub-id pub-id-type="medline">31851975</pub-id></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Stein</surname><given-names>EJ</given-names> </name><name name-style="western"><surname>Fearon</surname><given-names>WF</given-names> </name><name name-style="western"><surname>Elmariah</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Left ventricular hypertrophy and biomarkers of cardiac damage and stress in aortic stenosis</article-title><source>J Am Heart Assoc</source><year>2022</year><month>04</month><day>5</day><volume>11</volume><issue>7</issue><fpage>e023466</fpage><pub-id pub-id-type="doi">10.1161/JAHA.121.023466</pub-id><pub-id pub-id-type="medline">35301869</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Supplementary tables and figures including hyperparameter grid search settings, DeLong test <italic>P</italic> value matrices, dataset-specific SMOTE/ROSE settings and random seeds, LightGBM feature importance, and the leakage-safe cross-validation workflow.</p><media xlink:href="medinform_v14i1e75655_app1.docx" xlink:title="DOCX File, 39 KB"/></supplementary-material></app-group></back></article>