<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id><journal-id journal-id-type="publisher-id">medinform</journal-id><journal-id journal-id-type="index">7</journal-id><journal-title>JMIR Medical Informatics</journal-title><abbrev-journal-title>JMIR Med Inform</abbrev-journal-title><issn pub-type="epub">2291-9694</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v14i1e78931</article-id><article-id pub-id-type="doi">10.2196/78931</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Advancing Gastrointestinal Cancer Risk Prediction With Patient-Centered Machine Learning: Machine Learning Modeling Study</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Baublyte</surname><given-names>Daina</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Lee</surname><given-names>Jeonghee</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Gunathilake</surname><given-names>Madhawa</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author" corresp="yes"><name 
name-style="western"><surname>Kim</surname><given-names>Jeongseon</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Public Health &#x0026; AI, National Cancer Center Graduate School of Cancer Science and Policy, National Cancer Center</institution><addr-line>Goyang-si</addr-line><country>Republic of Korea</country></aff><aff id="aff2"><institution>Department of Cancer Biomedical Science, National Cancer Center Graduate School of Cancer Science and Policy, National Cancer Center</institution><addr-line>323 Ilsan-ro, Ilsandong-gu, Gyeonggi-do</addr-line><addr-line>Goyang-si</addr-line><country>Republic of Korea</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Coristine</surname><given-names>Andrew</given-names></name></contrib><contrib contrib-type="editor"><name name-style="western"><surname>Benis</surname><given-names>Arriel</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Senst</surname><given-names>Benjamin</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Mpofu</surname><given-names>Rephaim</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Perepu</surname><given-names>Sireesha</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Jeongseon Kim, PhD, Department of Cancer Biomedical Science, National Cancer Center Graduate School of Cancer Science and Policy, National Cancer Center, 323 Ilsan-ro, Ilsandong-gu, Gyeonggi-do, Goyang-si, 10408, Republic of Korea, 82 31-920-2570, 82 31-920-2579; <email>jskim@ncc.re.kr</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date 
pub-type="epub"><day>4</day><month>6</month><year>2026</year></pub-date><volume>14</volume><elocation-id>e78931</elocation-id><history><date date-type="received"><day>12</day><month>06</month><year>2025</year></date><date date-type="rev-recd"><day>09</day><month>03</month><year>2026</year></date><date date-type="accepted"><day>13</day><month>03</month><year>2026</year></date></history><copyright-statement>&#x00A9; Daina Baublyte, Jeonghee Lee, Madhawa Gunathilake, Jeongseon Kim. Originally published in JMIR Medical Informatics (<ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org">https://medinform.jmir.org</ext-link>), 4.6.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org/">https://medinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://medinform.jmir.org/2026/1/e78931"/><abstract><sec><title>Background</title><p>Gastrointestinal (GI) cancers are a significant health concern in South Korea. Recently, machine learning (ML) models have emerged as powerful tools to support early screening efforts and identify people at risk before disease onset. 
However, the low incidence of GI malignancies in prospective cohorts leads to severe class imbalance, often causing ML models to favor the majority &#x201C;healthy&#x201D; class at the expense of clinical sensitivity.</p></sec><sec><title>Objective</title><p>This study aimed to evaluate class imbalance mitigation strategies and develop ML-based GI cancer risk prediction models using noninvasive and minimally invasive predictors linked to modifiable behavioral and metabolic risk factors.</p></sec><sec sec-type="methods"><title>Methods</title><p>We analyzed a prospective cohort (n=7652) with 156 incident GI cancer cases (2%) identified over a 14-year follow-up period. The data were randomly split into training (5356/7652, 70%) and testing (2296/7652, 30%) sets. To address class imbalance while preserving observed population structure, we developed a patient-centered undersampling technique (PCUSTe) based on the logic of frequency-matched case-control studies. PCUSTe was compared with commonly used resampling approaches, including synthetic minority oversampling technique (SMOTE), adaptive synthetic sampling (ADASYN), and SMOTE with edited nearest neighbors (ENN). Six classifiers were implemented, including both batch and incremental training variants. To account for the prior shift introduced by resampling, probability correction was applied. Model performance was evaluated on the independent test set using a classification threshold equal to the observed event proportion (cumulative incidence) in the training data and then across thresholds reflecting incidence values between 1% and 5%. Primary performance metrics included sensitivity, specificity, Matthews correlation coefficient, and area under the receiver operating characteristic curve (AUC).</p></sec><sec sec-type="results"><title>Results</title><p>Models trained using PCUSTe demonstrated improved sensitivity compared with standard resampling techniques, particularly for more complex classifiers. 
The incrementally trained stochastic gradient descent model achieved the highest overall performance trained on PCUSTe data with a sensitivity of 0.77 (95% CI 0.64&#x2010;0.89), specificity of 0.65 (95% CI 0.63&#x2010;0.67), AUC of 0.77 (95% CI 0.70&#x2010;0.84), and Matthews correlation coefficient of 0.12 (95% CI 0.08&#x2010;0.16). In contrast, logistic regression achieved balanced performance without resampling (sensitivity 0.70, 95% CI 0.57&#x2010;0.83; specificity 0.71, 95% CI 0.69&#x2010;0.72; AUC 0.75, 95% CI 0.68&#x2010;0.82). Our results showed that PCUSTe primarily enhanced sensitivity in more complex models at the expense of specificity.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Integrating epidemiological principles, including covariate frequency matching and threshold selection based on the observed cumulative incidence in the training data, improved minority class detection in GI cancer risk prediction. However, model performance varied by algorithm, and in some cases, decision threshold adjustment alone achieved comparable or superior results to data resampling. These findings highlight the importance of carefully selecting imbalance mitigation strategies based on modeling objectives. 
The resulting models achieved sensitivity levels that may be suitable for early risk identification in cohort settings and could contribute to personalized risk stratification and targeted prevention or screening strategies.</p></sec></abstract><kwd-group><kwd>gastrointestinal cancer</kwd><kwd>class imbalance</kwd><kwd>machine learning</kwd><kwd>cohort study</kwd><kwd>data resampling</kwd><kwd>cancer risk prediction</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Gastrointestinal (GI) cancers pose a significant global health burden, with nearly 5 million new cases and over 3 million deaths reported in 2022 [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref4">4</xref>]. The impact is particularly severe in Asia, where GI cancers account for nearly 30% of all cancer cases, largely due to lifestyle habits, dietary factors, <italic>Helicobacter pylori</italic> infections, and genetic predispositions [<xref ref-type="bibr" rid="ref4">4</xref>-<xref ref-type="bibr" rid="ref8">8</xref>]. These trends underscore the urgent need for region-specific prevention and early detection strategies.</p><p>Predictive models play a critical role in identifying high-risk individuals and enabling targeted preventive measures [<xref ref-type="bibr" rid="ref9">9</xref>-<xref ref-type="bibr" rid="ref11">11</xref>]. However, accurate risk prediction requires pre-diagnosis patient data, which are best obtained from prospective cohort studies&#x2014;longitudinal studies that track healthy individuals over extended periods. Compared with other observational designs, prospective cohorts are less susceptible to selection bias because they minimize reliance on recall and are robust to differential survival. Although they do not guarantee perfect generalizability, they typically provide a closer approximation of the true population. 
These studies offer high-quality, temporally structured data, but their high cost and resource-intensive nature limit their availability. Moreover, many potential risk factors involved in disease development exhibit complex, nonlinear relationships, posing additional challenges for traditional analytic approaches.</p><p>In this context, advances in machine learning (ML) offer a promising solution. With the increasing availability of large-scale health datasets, ML has emerged as a powerful tool for disease risk prediction, offering advantages over conventional statistical methods in modeling complex, multidimensional data [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref13">13</xref>]. ML-driven models can uncover nonlinear interactions among risk factors, improving predictive performance in early cancer detection. Although the application of ML methods has demonstrated promising results in GI cancer risk prediction using biomarkers, anthropometric measures, and socioeconomic data, its application remains limited [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref15">15</xref>].</p><p>One of the major challenges in applying ML to GI cancer risk prediction is the severe class imbalance inherent in cohort studies, where cancer cases constitute only a small fraction of the population. This imbalance skews model performance, leading to high accuracy for the majority class (noncancer) but poor sensitivity for cancer cases [<xref ref-type="bibr" rid="ref16">16</xref>-<xref ref-type="bibr" rid="ref18">18</xref>]. Additionally, integrating diverse risk factors&#x2014;such as biomarkers, dietary intake, and lifestyle data&#x2014;compounds this challenge, as these variables often exhibit high variability and sparse representation. 
Given the heterogeneous nature of GI cancers, incorporating a wider range of risk factors may support the development of more effective screening and prevention strategies [<xref ref-type="bibr" rid="ref19">19</xref>-<xref ref-type="bibr" rid="ref24">24</xref>].</p><p>While previous studies have been conducted for GI cancer risk prediction, many have relied on relatively balanced datasets from cross-sectional studies, narrow predictor sets, or traditional statistical models, which may not fully capture the complex, nonlinear interactions among diverse risk factors [<xref ref-type="bibr" rid="ref12">12</xref>-<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref25">25</xref>]. To address these limitations, this study used a diverse set of predictors and focused on risk prediction in highly imbalanced cohort data.</p><p>To address class imbalance in ML-based GI cancer risk prediction, we evaluated multiple imbalance mitigation strategies, including a patient-centered undersampling technique (PCUSTe) grounded in epidemiological principles of frequency-based case-control matching. PCUSTe was designed to preserve observed population structure in small, highly imbalanced cohorts and was compared with established resampling approaches, including synthetic minority oversampling technique (SMOTE), adaptive synthetic sampling (ADASYN), and hybrid resampling methods [<xref ref-type="bibr" rid="ref26">26</xref>]. In addition to data-level strategies, we incorporated probability correction for models trained on resampled data and decision threshold adjustment based on observed cumulative incidence. 
Model behavior and performance were systematically compared across imbalance mitigation strategies using multiple ML algorithms and hyperparameter settings to examine their influence on GI cancer risk prediction in highly imbalanced cohort data.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Study Design and Participants</title><p>This study used data from 12,552 South Korean adults enrolled in the Korea National Cancer Center (KNCC) Screenee Cohort, a longitudinal study initiated in 2002 by the National Cancer Center in South Korea [<xref ref-type="bibr" rid="ref27">27</xref>]. To ensure data quality and minimize potential bias, several exclusion criteria were applied. Participants were excluded if they had a prior cancer diagnosis (n=1051), were diagnosed with GI cancer within 6 months of follow-up (n=28), or reported implausible total energy intake (&#x003C;500 or &#x003E;4000 kcal/day), which may indicate dietary misreporting or data entry errors (n=187). Participants with missing or incomplete records were also excluded, and a complete-case analysis was conducted (n=3251). In addition, controls with other severe conditions&#x2014;including non-GI cancers or extreme biomarker values suggestive of acute metabolic or hepatic abnormalities&#x2014;were excluded. These abnormalities were defined as triglycerides, aspartate aminotransferase, or gamma-glutamyl transferase levels exceeding 1000, or fasting glucose levels below 55 or above 200 mg/dL (n=383) [<xref ref-type="bibr" rid="ref28">28</xref>-<xref ref-type="bibr" rid="ref30">30</xref>]. Participant selection and exclusion steps are illustrated in <xref ref-type="fig" rid="figure1">Figure 1</xref>.</p><p>Participants were followed for up to 14 years (2007&#x2010;2021). 
GI cancers were defined according to the <italic>ICD-10</italic> (<italic>International Statistical Classification of Diseases, Tenth Revision</italic>) as follows: esophageal cancer (C15), gastric cancer (C16), small intestine cancer (C17), colorectal cancer (C18-C20), anal cancer (C21), liver cancer (C22), gallbladder cancer (C23-C24), and pancreatic cancer (C25). Individuals who developed GI cancer during the follow-up period were classified as cases, while those who remained cancer-free served as controls. Based on this labeling, a supervised ML framework was constructed to address a binary classification task&#x2014;identifying individuals at risk for developing GI cancer.</p><p>The final dataset consisted of 7652 participants (61% of the original cohort), including 156 (2.0%) incident GI cancer cases and 7496 (98.0%) controls. Some participants developed malignancies at multiple GI sites, resulting in a total of 162 cancer diagnoses. The most frequently diagnosed cancers were gastric (62/162, 38.3%), colorectal (43/162, 26.5%), and hepatic (30/162, 18.5%).</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Flowchart of the study participant selection. GI: gastrointestinal; KNCC: Korea National Cancer Center.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e78931_fig01.png"/></fig></sec><sec id="s2-2"><title>Data Collection and Preprocessing</title><p>All participants in the KNCC Screenee Cohort completed self-administered questionnaires capturing socio-demographic characteristics and lifestyle behaviors. Additionally, data were collected on anthropometric measurements, clinical biomarkers, and dietary intake. 
Dietary intake was assessed at baseline using a validated 106-item semiquantitative food frequency questionnaire [<xref ref-type="bibr" rid="ref31">31</xref>,<xref ref-type="bibr" rid="ref32">32</xref>].</p><p>To adjust for total energy intake&#x2019;s influence on nutrient intake, we applied the residual method in Python (version 3.12) [<xref ref-type="bibr" rid="ref33">33</xref>]. Categorical variables, such as lifestyle habits, gender, and socioeconomic characteristics, were binarized, and low-density lipoproteins were calculated using Friedewald&#x2019;s equation [<xref ref-type="bibr" rid="ref34">34</xref>].</p><p>The dataset was randomly split into a training set (5356/7652, 70%) and a test set (2296/7652, 30%). The training set included 109 cases and 5247 controls, while the test set consisted of 47 cases and 2249 controls. Continuous variables were standardized using an adapted Z-score normalization based on the control population. For each predictor, the mean and standard deviation were calculated using only control individuals in the training set, and these parameters were applied to transform both cases and controls in the training data. The same parameters were then applied to the corresponding test data.</p></sec><sec id="s2-3"><title>Ethical Considerations</title><p>This study was conducted as a secondary analysis of existing cohort data from participants who had previously provided broad informed consent for future research use at enrollment in the KNCC Screenee Cohort. The study was noninterventional and did not affect participant care. Personal identifiers were used solely for pseudonymized linkage with national cancer registry data by an authorized data linkage institution and were removed immediately thereafter. Only deidentified data were accessed by authorized research personnel within a secure analysis environment. 
Given the use of deidentified data and the absence of participant contact or intervention, the risk to participants was considered minimal, and no additional informed consent, compensation, or participant support was required. This study was reviewed and approved by the Institutional Review Board of the National Cancer Center, Korea (approval number NCC2024-0106).</p></sec><sec id="s2-4"><title>Statistical Analysis</title><p>Descriptive statistics were generated using Python (version 3.12) following the energy adjustment of nutrient intakes. To ensure the representativeness of the study population and identify potential selection or partitioning bias, baseline characteristics were compared across several cohorts: the final analyzed cohort, individuals excluded due to missing data, and the training and test data splits. Continuous variables were summarized as means and standard deviations and evaluated using the Welch <italic>t</italic> test to account for potential unequal variances. Categorical variables were expressed as frequencies and percentages, with differences assessed using Pearson chi-square tests [<xref ref-type="bibr" rid="ref35">35</xref>]. All statistical tests were 2-sided, and a <italic>P</italic> value &#x003C;.05 was considered to indicate statistical significance.</p></sec><sec id="s2-5"><title>Sample Size Considerations</title><p>In the absence of a universally accepted criterion for determining sample size adequacy in ML prediction models, we adopted the event-per-variable (EPV) framework from regression modeling as a conservative reference. Although an EPV of &#x2265;10 has traditionally been recommended for logistic regression (LR), recent evidence suggests that lower EPV values may be acceptable when penalization and shrinkage techniques are applied [<xref ref-type="bibr" rid="ref36">36</xref>]. 
With 109 GI cancer cases and 20 selected predictors, the EPV for tuned models in our study was 5.45:</p><disp-formula id="E1"><mml:math id="eqn1"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mtext>EPV</mml:mtext><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mtext>Number&#x00A0;of&#x00A0;Events</mml:mtext></mml:mrow><mml:mrow><mml:mtext>Number&#x00A0;of&#x00A0;Predictors</mml:mtext></mml:mrow></mml:mfrac><mml:mo>=</mml:mo><mml:mfrac><mml:mn>109</mml:mn><mml:mn>20</mml:mn></mml:mfrac><mml:mo>=</mml:mo><mml:mn>5.45</mml:mn></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>In addition to tuned configurations, we evaluated baseline models using all 34 available predictors without hyperparameter tuning or feature selection. This approach was adopted to examine whether systematic tuning materially altered predictive performance, given that default parameter settings may perform comparably to, or occasionally outperform, extensively tuned models. In these baseline configurations, the nominal EPV decreased below the selected threshold.</p><p>All evaluated models incorporated model-specific regularization or complexity control mechanisms. To further assess robustness, we examined model stability across a range of EPV values from 9.9 (11 predictors) to 3.2 (34 predictors). 
These analyses were conducted to evaluate stability across model complexities rather than to infer optimal predictive performance.</p></sec><sec id="s2-6"><title>Data Resampling Strategies</title><sec id="s2-6-1"><title>Algorithm Selection</title><p>Resampling methods are commonly used to address class imbalance and can be broadly categorized as oversampling or undersampling. Oversampling enlarges the minority class by generating synthetic observations, which may improve sensitivity but can introduce synthetic noise and overfitting. Undersampling, in contrast, reduces the majority class size to preserve data authenticity but risks discarding informative samples and distorting population structure. To achieve a better balance between data integrity and population representativeness, we developed PCUSTe, a patient-centered undersampling method.</p></sec><sec id="s2-6-2"><title>Patient-Centered Undersampling Method</title><p>Similar to other undersampling methods, PCUSTe addresses class imbalance without generating synthetic data. However, unlike conventional methods that randomly remove majority-class samples or rely on geometric distances in feature space, PCUSTe uses case distribution&#x2013;guided sampling based on categorical matching covariates. Controls are selected in proportion to the empirical distribution of cases across these strata, ensuring that the resulting training data preserve the covariate composition of the study population. This approach integrates the design logic of frequency-matched case-control studies into the ML preprocessing context, thereby minimizing distributional bias introduced by random or distance-based undersampling.</p><p>The method is parameterized by the case-to-control ratio (1:1 in this study, though extendable) and the covariates guiding sampling. Random seeds can be specified to ensure reproducibility. Finally, the combined dataset is shuffled to prevent ordering bias. 
In this study, we tested 2 parameterizations: PCUSTe-1 (PCUSTe with sociodemographic matching [education, employment, income, and marital status]) and PCUSTe-2 (PCUSTe with lifestyle matching [smoking and drinking]). The pseudocode for the PCUSTe method is provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec><sec id="s2-6-3"><title>Synthetic Oversampling Methods</title><p>To complement the undersampling strategy, we also implemented synthetic oversampling using 3 established methods. SMOTE generates new samples of the minority class by interpolating between existing cases. ADASYN extends this approach by focusing more on generating samples near harder-to-classify instances. Additionally, we used a hybrid technique that combines SMOTE with edited nearest neighbors (ENN), which removes misclassified samples following oversampling to sharpen class boundaries and improve data quality.</p><p>We further optimized SMOTE, SMOTE+ENN, and ADASYN by varying the k-nearest neighbors (k-NN) parameter as a function of the training set size. Specifically, k was defined as a proportion of the number of cases in the training data, and 6 values were evaluated: 2% (2/109), 5% (5/109), 10% (10/109), 30% (32/109), 50% (54/109), and 80% (87/109) of the case count. For reference, setting k to 5% of n resulted in 5 neighbors, which is the default setting for all 3 methods.</p></sec></sec><sec id="s2-7"><title>ML Modeling</title><sec id="s2-7-1"><title>Algorithm Selection</title><p>We implemented a range of ML algorithms selected for their relevance to cancer risk prediction. LR served as the baseline model due to its simplicity, high interpretability, and computational efficiency, and was implemented with regularization to improve generalizability [<xref ref-type="bibr" rid="ref37">37</xref>]. 
To explore the difference between batch and incremental learning approaches, we also applied stochastic gradient descent (SGD), which represents an incremental version of LR suitable for streaming or large-scale data applications [<xref ref-type="bibr" rid="ref38">38</xref>]. In addition to linear models, we used random forest (RF), a decision-tree-based ensemble algorithm known for its robustness and ability to model nonlinear relationships [<xref ref-type="bibr" rid="ref39">39</xref>].</p><p>To assess the benefits of more sophisticated models, we implemented extreme gradient boosting (XGBoost), which is widely recognized for its performance in structured data and its ability to handle class imbalance effectively [<xref ref-type="bibr" rid="ref40">40</xref>]. Both batch and incremental training modes were explored for XGBoost. Support vector machines (SVM) were included as well, given their efficacy in handling high-dimensional datasets and their resilience to overfitting in small, imbalanced datasets [<xref ref-type="bibr" rid="ref41">41</xref>].</p><p>To maximize the utility of PCUSTe, we implemented an incremental learning variant using the SGD classifier. This approach involved 1000 training iterations, where each iteration generated a unique undersampled training set maintaining the specified covariate matching constraints. The iterative strategy was designed to mitigate the primary drawback of undersampling&#x2014;the loss of information&#x2014;by ensuring the model was exposed to more of the control population over time. For reproducibility, the random seed for each PCUSTe generation was updated by incrementing a base seed by the current iteration index. While the case samples remained constant across all iterations, the control subsets varied, allowing for partial overlap and a more robust representation of the original control distribution than a single-pass undersampling would allow. 
This high number of iterations was selected to ensure that the vast majority of the control population was used in the optimization process.</p></sec><sec id="s2-7-2"><title>Predictor Selection</title><p>To ensure methodological consistency across resampling strategies, and because SMOTE-based approaches generate synthetic observations via interpolation in continuous feature space, predictor selection was restricted to continuous variables. Recursive feature elimination was applied using each target model&#x2019;s own baseline estimator as the base learner, rather than a shared estimator across models.</p><p>Predictor selection was conducted exclusively within the training data to prevent information leakage and was integrated into the cross-validation pipeline together with hyperparameter optimization. To evaluate predictor relevance and stability across configurations, feature importance was further examined using Shapley Additive Explanations (SHAP) analysis [<xref ref-type="bibr" rid="ref42">42</xref>].</p></sec><sec id="s2-7-3"><title>Hyperparameter Optimization</title><p>Hyperparameter optimization was performed using RandomizedSearchCV (RS) exclusively on training data. Two performance metrics were used as optimization objectives: area under the receiver operating characteristic curve (AUC) and Matthews correlation coefficient (MCC). This resulted in 3 parameter configurations per model: None (default model settings), RS-AUC, and RS-MCC. The dual-objective optimization strategy allowed us to evaluate both probabilistic discrimination (AUC) and balanced classification performance (MCC), which is particularly relevant in imbalanced settings.</p><p>For SGD models, hyperparameter tuning was constrained to preserve incremental training properties: L2 regularization was fixed a priori, tolerance was disabled, and the number of iterations was set to one. 
Consequently, RS was applied only to selected learning-rate and regularization strength parameters (eta0 and alpha).</p><p>Tuned model development was implemented within a unified k-fold cross-validation pipeline. In each iteration, k&#x2212;1 folds were used for training and the remaining held-out fold for validation. Resampling was applied exclusively to the training folds, followed by recursive feature elimination for feature selection and RS for hyperparameter optimization. Performance was evaluated on the held-out validation fold. The independent test set was not involved in any stage of model development.</p></sec><sec id="s2-7-4"><title>Performance Evaluation</title><p>To comprehensively assess model performance, evaluation metrics specifically suited for imbalanced data were prioritized [<xref ref-type="bibr" rid="ref43">43</xref>,<xref ref-type="bibr" rid="ref44">44</xref>]. Sensitivity, AUC, MCC, and specificity served as primary metrics to capture class-wise discrimination and model separability. Secondary metrics, including positive predictive value, negative predictive value, Brier score, accuracy, and overall weighted <italic>F</italic><sub>1</sub>-scores, were also estimated to provide a complete view of precision and calibration. Performance metrics were computed with 95% CIs estimated using 1000 bootstrap iterations.</p><p>Because resampling alters the class prior and can distort predicted probabilities, models trained on resampled data underwent prior probability correction to restore calibration relative to the original class distribution [<xref ref-type="bibr" rid="ref45">45</xref>]. Decision-based performance metrics (sensitivity, specificity, and MCC) were then computed using a classification threshold of 0.02, corresponding to the observed disease incidence in the training cohort. This threshold reflects a deployment-aligned operating point, where individuals with predicted risk exceeding the baseline population incidence would be considered high risk. 
Thresholds were defined to reflect epidemiologically plausible deployment scenarios, where the decision cutoff may be aligned with a priori expected disease incidence rather than determined by maximizing performance metrics within the training data. Sensitivity analyses were conducted across thresholds ranging from 0.01 to 0.05 to assess robustness under plausible real-world incidence scenarios.</p><p>After model development, all configurations (crude and tuned) were evaluated once on the reserved independent test set without further modification. The test set served as a common hold-out benchmark for comparing predefined algorithm&#x2013;resampling&#x2013;tuning configurations. For descriptive comparison, one configuration per ML algorithm was additionally evaluated against a null incidence-based model defined as a classifier that predicts the minority class for all observations. Under this specification, the baseline represents the upper bound for recall and the lower bound for precision given the observed outcome incidence.</p><p>To differentiate between generalization, learning, and memorization, model performance was compared across the original imbalanced test set, the original imbalanced training set, and the resampled training datasets. This framework enabled direct assessment of the models&#x2019; ability to distinguish classes while facilitating detection of potential overfitting to synthetic noise or oversampling artifacts.</p></sec></sec><sec id="s2-8"><title>Data Distribution Analysis</title><p>To assess how resampling techniques influenced data distribution, principal component analysis (PCA) was used for 2D projection and visual inspection of structural shifts. 
PCA reduces high-dimensional data into orthogonal components that capture the greatest proportion of variance, enabling visual comparison of the structural characteristics of the original and resampled datasets [<xref ref-type="bibr" rid="ref46">46</xref>].</p><p>To quantify overall separation between outcome groups, the mean Euclidean distance between cases and controls was calculated for each dataset. Pairwise Euclidean distances were computed between all case and control observations in the standardized feature space, and the average of these distances was used as a summary measure of inter-group separation.</p><p>In addition, predictor-level associations with the outcome were examined by calculating correlations between each individual predictor and the binary outcome variable. Correlation coefficients were computed independently for each dataset.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Study Participant Characteristics</title><p>The study identified significant demographic, clinical, and dietary differences between cases and controls. GI cancer cases were more likely to be male, older, smokers, and to have a lower monthly income (below 2 million Korean won) compared with controls, while no significant differences were observed for alcohol consumption, marital status, employment, or education. Clinically, cases showed significantly higher BMI, systolic blood pressure (SBP), diastolic blood pressure (DBP), fasting blood glucose, aspartate aminotransferase, and gamma-glutamyl transferase, along with lower high-density lipoprotein cholesterol. Dietary intake differed modestly, with cases reporting significantly lower intakes of sugar, dietary fiber, niacin, potassium, and vitamin C, whereas total energy intake and most other nutrients did not differ significantly between groups. 
Detailed statistics of the whole study population are presented in <xref ref-type="table" rid="table1">Table 1</xref>.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Baseline characteristics of the analyzed cohort.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Characteristics</td><td align="left" valign="bottom">Case (n=156)</td><td align="left" valign="bottom">Control (n=7496)</td><td align="left" valign="bottom"><italic>P</italic> value</td></tr></thead><tbody><tr><td align="left" valign="top">Sex, n (%)</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Male</td><td align="left" valign="top">89 (57.1)</td><td align="left" valign="top">2690 (35.9)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Female</td><td align="left" valign="top">67 (42.9)</td><td align="left" valign="top">4806 (64.1)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top">Alcohol consumer, n (%)</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">.68</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>No</td><td align="left" valign="top">55 (35.3)</td><td align="left" valign="top">2790 (37.2)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Yes</td><td align="left" valign="top">101 (64.7)</td><td align="left" valign="top">4706 (62.8)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top">Smoker, n (%)</td><td align="left" valign="top"/><td 
align="left" valign="top"/><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>No</td><td align="left" valign="top">79 (50.6)</td><td align="left" valign="top">5003 (66.7)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Yes</td><td align="left" valign="top">77 (49.4)</td><td align="left" valign="top">2493 (33.3)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top">Married or cohabitating, n (%)</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">&#x003E;.99</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>No</td><td align="left" valign="top">19 (12.2)</td><td align="left" valign="top">901 (12.0)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Yes</td><td align="left" valign="top">137 (87.8)</td><td align="left" valign="top">6595 (88.0)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top">Unemployed, n (%)</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">.77</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>No</td><td align="left" valign="top">92 (59.0)</td><td align="left" valign="top">4307 (57.5)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Yes</td><td align="left" valign="top">64 (41.0)</td><td align="left" valign="top">3189 (42.5)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top">Higher education<sup><xref 
ref-type="table-fn" rid="table1fn1">a</xref></sup>, n (%)</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">.18</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>No</td><td align="left" valign="top">92 (59.0)</td><td align="left" valign="top">3994 (53.3)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Yes</td><td align="left" valign="top">64 (41.0)</td><td align="left" valign="top">3502 (46.7)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top">Lower income<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup>, n (%)</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">.001</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>No</td><td align="left" valign="top">108 (69.2)</td><td align="left" valign="top">6009 (80.2)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Yes</td><td align="left" valign="top">48 (30.8)</td><td align="left" valign="top">1487 (19.8)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top">Age (years), mean (SD)</td><td align="left" valign="top">57.51 (7.83)</td><td align="left" valign="top">52.33 (8.20)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">AST<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> (U/L), mean (SD)</td><td align="left" valign="top">28.58 (15.06)</td><td align="left" valign="top">22.97 (11.11)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">BMI (kg/m<sup>2</sup>), mean (SD)</td><td align="left" valign="top">24.97 
(2.98)</td><td align="left" valign="top">23.59 (2.97)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">Calcium (mg/d), mean (SD)</td><td align="left" valign="top">474.73 (199.97)</td><td align="left" valign="top">486.88 (211.73)</td><td align="left" valign="top">.45</td></tr><tr><td align="left" valign="top">Carbohydrate (g/d), mean (SD)</td><td align="left" valign="top">315.48 (33.92)</td><td align="left" valign="top">314.08 (33.71)</td><td align="left" valign="top">.61</td></tr><tr><td align="left" valign="top">&#x03B2;-Carotene (&#x338D;/d), mean (SD)</td><td align="left" valign="top">1976.90 (1024.43)</td><td align="left" valign="top">2127.04 (1221.98)</td><td align="left" valign="top">.07</td></tr><tr><td align="left" valign="top">Cholesterol (mg/d), mean (SD)</td><td align="left" valign="top">121.09 (74.57)</td><td align="left" valign="top">131.70 (77.08)</td><td align="left" valign="top">.09</td></tr><tr><td align="left" valign="top">DBP<sup><xref ref-type="table-fn" rid="table1fn4">d</xref></sup> (mmHg), mean (SD)</td><td align="left" valign="top">81.81 (10.82)</td><td align="left" valign="top">76.03 (10.48)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">Fasting blood glucose (mg/dL), mean (SD)</td><td align="left" valign="top">102.44 (33.66)</td><td align="left" valign="top">92.99 (14.18)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">Fat (g/d), mean (SD)</td><td align="left" valign="top">29.18 (11.78)</td><td align="left" valign="top">30.46 (11.53)</td><td align="left" valign="top">.18</td></tr><tr><td align="left" valign="top">Fiber (g/d), mean (SD)</td><td align="left" valign="top">15.13 (6.61)</td><td align="left" valign="top">16.69 (7.47)</td><td align="left" valign="top">.004</td></tr><tr><td align="left" valign="top">GGT<sup><xref ref-type="table-fn" rid="table1fn5">e</xref></sup> (IU/L), mean (SD)</td><td align="left" 
valign="top">44.47 (68.35)</td><td align="left" valign="top">28.79 (33.74)</td><td align="left" valign="top">.005</td></tr><tr><td align="left" valign="top">HDL<sup><xref ref-type="table-fn" rid="table1fn6">f</xref></sup> (mg/dL), mean (SD)</td><td align="left" valign="top">55.51 (13.01)</td><td align="left" valign="top">59.76 (14.59)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">Iron (mg/d), mean (SD)</td><td align="left" valign="top">10.34 (2.60)</td><td align="left" valign="top">10.34 (2.88)</td><td align="left" valign="top">.97</td></tr><tr><td align="left" valign="top">LDL<sup><xref ref-type="table-fn" rid="table1fn7">g</xref></sup> (mg/dL), mean (SD)</td><td align="left" valign="top">114.15 (36.13)</td><td align="left" valign="top">116.95 (32.64)</td><td align="left" valign="top">.34</td></tr><tr><td align="left" valign="top">Magnesium (mg/d), mean (SD)</td><td align="left" valign="top">168.12 (65.97)</td><td align="left" valign="top">177.43 (67.89)</td><td align="left" valign="top">.08</td></tr><tr><td align="left" valign="top">MUFA<sup><xref ref-type="table-fn" rid="table1fn8">h</xref></sup> (g/d), mean (SD)</td><td align="left" valign="top">7.48 (3.74)</td><td align="left" valign="top">8.04 (3.93)</td><td align="left" valign="top">.06</td></tr><tr><td align="left" valign="top">Niacin (mg/d), mean (SD)</td><td align="left" valign="top">9.80 (2.42)</td><td align="left" valign="top">10.31 (2.59)</td><td align="left" valign="top">.01</td></tr><tr><td align="left" valign="top">Phosphorus (mg/d), mean (SD)</td><td align="left" valign="top">874.99 (206.04)</td><td align="left" valign="top">899.72 (221.95)</td><td align="left" valign="top">.14</td></tr><tr><td align="left" valign="top">Potassium (mg/d), mean (SD)</td><td align="left" valign="top">2319.55 (727.02)</td><td align="left" valign="top">2460.60 (800.13)</td><td align="left" valign="top">.03</td></tr><tr><td align="left" valign="top">Protein (g/d), mean 
(SD)</td><td align="left" valign="top">61.88 (10.73)</td><td align="left" valign="top">62.49 (11.41)</td><td align="left" valign="top">.48</td></tr><tr><td align="left" valign="top">PUFA<sup><xref ref-type="table-fn" rid="table1fn9">i</xref></sup> (g/d), mean (SD)</td><td align="left" valign="top">4.60 (2.04)</td><td align="left" valign="top">4.84 (2.06)</td><td align="left" valign="top">.14</td></tr><tr><td align="left" valign="top">SBP<sup><xref ref-type="table-fn" rid="table1fn10">j</xref></sup> (mmHg), mean (SD)</td><td align="left" valign="top">131.74 (15.07)</td><td align="left" valign="top">124.72 (14.46)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">SFA<sup><xref ref-type="table-fn" rid="table1fn11">k</xref></sup> (g/d), mean (SD)</td><td align="left" valign="top">8.45 (4.04)</td><td align="left" valign="top">9.04 (4.27)</td><td align="left" valign="top">.08</td></tr><tr><td align="left" valign="top">Sodium (mg/d), mean (SD)</td><td align="left" valign="top">1990.97 (814.69)</td><td align="left" valign="top">1981.94 (778.54)</td><td align="left" valign="top">.89</td></tr><tr><td align="left" valign="top">Sugar intake (g/d), mean (SD)</td><td align="left" valign="top">44.68 (25.30)</td><td align="left" valign="top">51.35 (28.39)</td><td align="left" valign="top">.001</td></tr><tr><td align="left" valign="top">Thiamin (mg/d), mean (SD)</td><td align="left" valign="top">0.90 (0.23)</td><td align="left" valign="top">0.92 (0.23)</td><td align="left" valign="top">.33</td></tr><tr><td align="left" valign="top">Energy (kcal/d), mean (SD)</td><td align="left" valign="top">1778.40 (597.48)</td><td align="left" valign="top">1732.69 (577.23)</td><td align="left" valign="top">.35</td></tr><tr><td align="left" valign="top">Triglyceride (mg/dL), mean (SD)</td><td align="left" valign="top">120.69 (65.40)</td><td align="left" valign="top">117.50 (75.34)</td><td align="left" valign="top">.55</td></tr><tr><td align="left" 
valign="top">Vitamin A (&#x338D; RE/d), mean (SD)</td><td align="left" valign="top">419.72 (201.31)</td><td align="left" valign="top">449.03 (224.47)</td><td align="left" valign="top">.07</td></tr><tr><td align="left" valign="top">Vitamin C (mg/d), mean (SD)</td><td align="left" valign="top">60.98 (34.46)</td><td align="left" valign="top">68.47 (39.26)</td><td align="left" valign="top">.008</td></tr><tr><td align="left" valign="top">Vitamin D (&#x338D;/d), mean (SD)</td><td align="left" valign="top">4.37 (4.19)</td><td align="left" valign="top">4.79 (4.07)</td><td align="left" valign="top">.21</td></tr><tr><td align="left" valign="top">Vitamin E (mg/d), mean (SD)</td><td align="left" valign="top">6.34 (3.32)</td><td align="left" valign="top">6.65 (3.11)</td><td align="left" valign="top">.22</td></tr><tr><td align="left" valign="top">Zinc (mg/d), mean (SD)</td><td align="left" valign="top">4.93 (1.77)</td><td align="left" valign="top">5.16 (1.88)</td><td align="left" valign="top">.14</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>Higher education: college or above.</p></fn><fn id="table1fn2"><p><sup>b</sup>Lower income: below 2 million Korean won per month.</p></fn><fn id="table1fn3"><p><sup>c</sup>AST: aspartate aminotransferase.</p></fn><fn id="table1fn4"><p><sup>d</sup>DBP: diastolic blood pressure.</p></fn><fn id="table1fn5"><p><sup>e</sup>GGT: gamma-glutamyl transferase.</p></fn><fn id="table1fn6"><p><sup>f</sup>HDL: high-density lipoprotein.</p></fn><fn id="table1fn7"><p><sup>g</sup>LDL: low-density lipoprotein.</p></fn><fn id="table1fn8"><p><sup>h</sup>MUFA: monounsaturated fatty acids.</p></fn><fn id="table1fn9"><p><sup>i</sup>PUFA: polyunsaturated fatty acids.</p></fn><fn id="table1fn10"><p><sup>j</sup>SBP: systolic blood pressure.</p></fn><fn id="table1fn11"><p><sup>k</sup>SFA: saturated fatty acids.</p></fn></table-wrap-foot></table-wrap><p>A comparative analysis of baseline characteristics revealed no critical concerns 
regarding the representativeness of the data splits or the impact of participant exclusion (Tables S1-S4 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>). Characteristics of the training and test datasets were highly consistent, although minor discrepancies in statistical significance were observed for some nutritional variables, including total fat, zinc, thiamin, vitamin D, and fatty acids. However, these discrepancies were primarily driven by the differences in statistical power between the 2 partitions, as the absolute mean values and standard deviations remained stable across both sets.</p><p>Similarly, participants excluded due to missing data exhibited baseline characteristics generally consistent with those of the analyzed cohort. While the magnitude of statistical significance varied between these groups, the underlying clinical trends remained stable across all comparisons. For instance, although the association between drinking status and the outcome only reached statistical significance within the excluded cohort, cases were more likely to report alcohol consumption in both groups. Likewise, mean BMI was consistently higher among cases in both cohorts, though this difference only attained statistical significance in the larger analyzed cohort. Collectively, these findings suggest that no critical systematic differences exist between the analyzed, excluded, and partitioned groups, indicating that neither inclusion bias nor data-splitting artifacts materially affected the study results.</p></sec><sec id="s3-2"><title>Data Resampling Strategies</title><sec id="s3-2-1"><title>Patient-Centered Undersampling Method</title><p>PCUSTe enabled stronger performance for nonlinear models. However, resampling approaches were less effective for linear models.
Notably, adjusting the threshold for model probabilities was sufficient to stabilize LR performance and achieve a 0.7 score for both sensitivity and specificity on crude, nonresampled data. Similarly, RF performed well when trained on crude training data, although results were improved after model tuning and PCUSTe resampling.</p></sec><sec id="s3-2-2"><title>Synthetic Oversampling Methods</title><p>Experiments evaluating k-NN parameter tuning for synthetic oversampling methods indicated that the default k-NN setting may not be optimal and that tuning this parameter can substantially influence model performance. In our study, higher k-NN values&#x2014;corresponding to approximately 10%&#x2010;80% of the minority class sample size&#x2014;often yielded better performance than the default setting. These findings suggest that, when applying synthetic oversampling to small and extremely imbalanced datasets, tuning the k-NN parameter should be explicitly considered. We present an example of model performance based on the MCC metric for each ML algorithm trained on oversampled data with different k-NN values in <xref ref-type="fig" rid="figure2">Figure 2</xref> (all models with default parameter settings).</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>MCC scores of models evaluated on datasets oversampled using 3 different techniques: (A) SMOTE (left); (B) SMOTE+ENN (middle); (C) ADASYN (right). Each subplot shows performance across varying values of the k parameter (number of nearest neighbors used for resampling): 2% (2/109), 5% (5/109), 10% (10/109), 30% (32/109), 50% (54/109), and 80% (87/109). Lines represent individual models, including logistic regression (LR), random forest (RF), stochastic gradient descent (SGD), support vector machine (SVM), and 2 extreme gradient boosting (XGBoost) variants.
ADASYN: adaptive synthetic sampling; ENN: edited nearest neighbors; k-NN: k-nearest neighbors; LR: logistic regression; MCC: Matthews correlation coefficient; RF: random forest; SGD: stochastic gradient descent; SMOTE: synthetic minority oversampling technique; SVM: support vector machine; XGBoost: extreme gradient boosting.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e78931_fig02.png"/></fig></sec></sec><sec id="s3-3"><title>ML Modeling</title><p>Tuned model hyperparameters, feature sets, and optimization metric scores for the selected configuration are presented in Tables S5 and S6 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>.</p><sec id="s3-3-1"><title>Predictor Selection</title><p>SHAP analyses revealed substantial algorithm-specific differences in dominant predictors, with linear and incrementally trained models emphasizing dietary exposures, whereas tree-based models prioritized clinical and physiological markers (Figure S1 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>). Across all models, higher age consistently emerged as an important predictor of GI cancer risk; however, in the LR and SGD models, its relative influence was lower than that of dietary factors, including macronutrients, saturated fatty acids, and selected micronutrients. Elevated blood pressure was another consistently influential predictor across modeling approaches.</p><p>EPV sensitivity analysis results are presented in Figure S2 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>. As the number of predictors decreased (resulting in higher EPV ratios), MCC scores generally tended to decline across most models, although the magnitude of change varied by algorithm. In this study, the number of predictors was predefined (34 for crude models and 20 for tuned models).
These findings suggest that reducing the predictor set may lead to information loss and that model performance could potentially be improved through more advanced predictor selection approaches.</p></sec><sec id="s3-3-2"><title>Hyperparameter Optimization</title><p>RS yielded distinct optimal hyperparameter configurations depending on both the resampling strategy and the optimization metric. Based on inner cross-validation results within the training data, the highest AUC scores for LR, RF, and XGBoost models were achieved with PCUSTe-1 resampling, whereas SVM achieved the highest cross-validation AUC with SMOTE resampling. In contrast, MCC values were more consistent across resampling strategies, with most models achieving similar performance regardless of the sampling method used.</p><p>SVM models demonstrated metric-dependent kernel selection. Linear kernels were predominantly selected when optimizing for AUC, whereas radial basis function kernels were favored when optimizing for MCC in models trained on oversampled data. Under PCUSTe variants and hybrid resampling strategies, linear kernels were selected across both optimization metrics.</p></sec><sec id="s3-3-3"><title>Performance Evaluation</title><sec id="s3-3-3-1"><title>Overall Performance</title><p>Despite pronounced class imbalance, some models trained on crude (nonresampled) data achieved moderate discriminatory performance when decision thresholds were set to the empirical disease incidence in training data. Under this setting, linear classifiers exhibited relatively stable behavior. Notably, the LR model trained on the original (nonresampled) data achieved the best performance among all algorithms in the crude evaluation. Importantly, without threshold adjustment, this model predicted all test set samples as controls. 
This finding suggests that when a strong linear signal is present, resampling may offer limited benefit, particularly when balanced sensitivity-specificity trade-offs are prioritized over aggressive minority class detection.</p><p>Undersampling methods consistently improved sensitivity across all model types, aligning better with screening-oriented objectives and the purpose of the classification task of our study. In contrast, oversampling approaches generally increased specificity at the expense of sensitivity, indicating a tendency to favor the majority class. Nonlinear models benefited most from undersampling strategies, whereas the same models trained on crude or oversampled data exhibited performance patterns consistent with majority-class overfitting. Detailed results for crude model evaluations are provided in Table S7 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>.</p><p>Tuned model performance results are reported in Tables S8 and S9 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>. Performance differences across resampling strategies were marginal for linear models but more pronounced for nonlinear models. Across most comparisons, PCUSTe-based methods outperformed oversampling approaches.</p><p><xref ref-type="table" rid="table2">Table 2</xref> contrasts one model configuration for each ML algorithm with the null (incidence-only) model, which classifies all samples as belonging to the minority class. All results are reported for the unseen nonresampled test set, the nonresampled training set, and the resampled training set. 
The selected configurations were not modified after the initial evaluation on the test set, ensuring that the results reflect predefined configurations and enabling comparison of model performance across ML algorithms and potential overfitting patterns.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Performance of machine learning algorithms on the original test set, the original training set, and the resampled training set. Values represent point estimates with 95% CIs calculated by bootstrapping.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom" colspan="4">Point estimate (95% CI)</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Sensitivity</td><td align="left" valign="top">Specificity</td><td align="left" valign="top">AUC<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="top">MCC<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top">Null<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Test set</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.00</td><td align="left" valign="top">0.50</td><td align="left" valign="top">0.00</td></tr><tr><td align="left" valign="top">LR<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup></td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Test set</td><td align="left" valign="top">0.70
(0.57&#x2010;0.83)</td><td align="left" valign="top">0.71 (0.69&#x2010;0.72)</td><td align="left" valign="top">0.75 (0.68&#x2010;0.82)</td><td align="left" valign="top">0.13 (0.08&#x2010;0.17)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Train set (original)</td><td align="left" valign="top">0.71 (0.62&#x2010;0.79)</td><td align="left" valign="top">0.70 (0.68&#x2010;0.71)</td><td align="left" valign="top">0.78 (0.73&#x2010;0.82)</td><td align="left" valign="top">0.12 (0.09&#x2010;0.15)</td></tr><tr><td align="left" valign="top">SGD<sup><xref ref-type="table-fn" rid="table2fn5">e</xref></sup></td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Test set</td><td align="left" valign="top">0.77 (0.64&#x2010;0.89)</td><td align="left" valign="top">0.65 (0.63&#x2010;0.67)</td><td align="left" valign="top">0.77 (0.70&#x2010;0.84)</td><td align="left" valign="top">0.12 (0.08&#x2010;0.16)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Train set (original)</td><td align="left" valign="top">0.77 (0.69&#x2010;0.85)</td><td align="left" valign="top">0.63 (0.61&#x2010;0.64)</td><td align="left" valign="top">0.78 (0.73&#x2010;0.81)</td><td align="left" valign="top">0.12 (0.09&#x2010;0.14)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Train set (resampled)</td><td align="left" valign="top">0.77 (0.69&#x2010;0.85)</td><td align="left" valign="top">0.61 (0.53&#x2010;0.70)</td><td align="left" valign="top">0.78 (0.72&#x2010;0.84)</td><td align="left" valign="top">0.39 (0.27&#x2010;0.50)</td></tr><tr><td align="left" valign="top">RF<sup><xref 
ref-type="table-fn" rid="table2fn6">f</xref></sup></td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Test set</td><td align="left" valign="top">0.77 (0.65&#x2010;0.89)</td><td align="left" valign="top">0.62 (0.60&#x2010;0.64)</td><td align="left" valign="top">0.73 (0.65&#x2010;0.81)</td><td align="left" valign="top">0.11 (0.07&#x2010;0.15)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Train set (original)</td><td align="left" valign="top">1.00 (1.00&#x2010;1.00)</td><td align="left" valign="top">0.62 (0.61&#x2010;0.63)</td><td align="left" valign="top">1.00 (1.00&#x2010;1.00)</td><td align="left" valign="top">0.18 (0.16&#x2010;0.20)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Train set (resampled)</td><td align="left" valign="top">1.00 (1.00&#x2010;1.00)</td><td align="left" valign="top">1.00 (1.00&#x2010;1.00)</td><td align="left" valign="top">1.00 (1.00&#x2010;1.00)</td><td align="left" valign="top">1.00 (1.00&#x2010;1.00)</td></tr><tr><td align="left" valign="top">XGB<sup><xref ref-type="table-fn" rid="table2fn7">g</xref></sup></td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Test set</td><td align="left" valign="top">0.77 (0.65&#x2010;0.89)</td><td align="left" valign="top">0.60 (0.58&#x2010;0.62)</td><td align="left" valign="top">0.73 (0.66&#x2010;0.79)</td><td align="left" valign="top">0.11 (0.07&#x2010;0.14)</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Train set (original)</td><td align="left" valign="top">0.95 (0.91&#x2010;0.99)</td><td align="left" valign="top">0.59 (0.58&#x2010;0.60)</td><td align="left" valign="top">0.83 (0.81&#x2010;0.86)</td><td align="left" valign="top">0.16 (0.14&#x2010;0.17)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Train set (resampled)</td><td align="left" valign="top">0.99 (0.98&#x2010;0.99)</td><td align="left" valign="top">0.82 (0.80&#x2010;0.83)</td><td align="left" valign="top">0.98 (0.97&#x2010;0.98)</td><td align="left" valign="top">0.84 (0.83&#x2010;0.86)</td></tr><tr><td align="left" valign="top">Inc. XGB<sup><xref ref-type="table-fn" rid="table2fn8">h</xref></sup></td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Test set</td><td align="left" valign="top">0.70 (0.57&#x2010;0.83)</td><td align="left" valign="top">0.62 (0.60&#x2010;0.64)</td><td align="left" valign="top">0.68 (0.61&#x2010;0.76)</td><td align="left" valign="top">0.09 (0.05&#x2010;0.13)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Train set (original)</td><td align="left" valign="top">1.00 (1.00&#x2010;1.00)</td><td align="left" valign="top">0.62 (0.60&#x2010;0.63)</td><td align="left" valign="top">0.96 (0.95&#x2010;0.96)</td><td align="left" valign="top">0.18 (0.16&#x2010;0.19)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Train set (resampled)</td><td align="left" valign="top">1.00 (1.00&#x2010;1.00)</td><td align="left" valign="top">1.00 (1.00&#x2010;1.00)</td><td align="left" valign="top">1.00 
(1.00&#x2010;1.00)</td><td align="left" valign="top">1.00 (1.00&#x2010;1.00)</td></tr><tr><td align="left" valign="top">SVM<sup><xref ref-type="table-fn" rid="table2fn9">i</xref></sup></td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Test set</td><td align="left" valign="top">0.79 (0.67&#x2010;0.90)</td><td align="left" valign="top">0.60 (0.58&#x2010;0.62)</td><td align="left" valign="top">0.74 (0.67&#x2010;0.80)</td><td align="left" valign="top">0.11 (0.07&#x2010;0.15)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Train set (original)</td><td align="left" valign="top">0.80 (0.72&#x2010;0.88)</td><td align="left" valign="top">0.59 (0.58&#x2010;0.60)</td><td align="left" valign="top">0.80 (0.75&#x2010;0.84)</td><td align="left" valign="top">0.11 (0.09&#x2010;0.14)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Train set (resampled)</td><td align="left" valign="top">0.80 (0.72&#x2010;0.87)</td><td align="left" valign="top">0.80 (0.72&#x2010;0.87)</td><td align="left" valign="top">0.87 (0.83&#x2010;0.91)</td><td align="left" valign="top">0.60 (0.49&#x2010;0.69)</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>AUC: area under the receiver operating characteristic curve.</p></fn><fn id="table2fn2"><p><sup>b</sup>MCC: Matthews correlation coefficient.</p></fn><fn id="table2fn3"><p><sup>c</sup>Null: model that always predicts positive class (incidence-only).</p></fn><fn id="table2fn4"><p><sup>d</sup>LR: logistic regression trained on nonresampled training split, crude model.</p></fn><fn id="table2fn5"><p><sup>e</sup>SGD: stochastic gradient descent trained incrementally on varying data 
resampled by PCUSTe-1 method, crude model.</p></fn><fn id="table2fn6"><p><sup>f</sup>RF: random forest trained on data resampled by PCUSTe-2 method, crude model.</p></fn><fn id="table2fn7"><p><sup>g</sup>XGB: extreme gradient boosting (XGBoost) trained on data resampled by SMOTE+ ENN (k=54) method, tuned via RS-AUC.</p></fn><fn id="table2fn8"><p><sup>h</sup>Inc. XGB: incremental extreme gradient boosting (XGBoost) trained on data resampled by PCUSTe-2 method, crude model.</p></fn><fn id="table2fn9"><p><sup>i</sup>SVM: support vector machine trained on data resampled by PCUSTe-2 method, crude model.</p></fn></table-wrap-foot></table-wrap><p>Although MCC values were numerically low (0.09&#x2010;0.18) for the test and original training datasets, this reflects the extreme class imbalance in these sets, where even a small number of false-positive predictions disproportionately reduces MCC. Because the resampled training set was balanced across classes, MCC values appear substantially higher in that dataset. To better contextualize predictive performance, MCC values can be compared with those of the null model (MCC=0). Based on this comparison, the developed models achieved improvements of up to 0.13 in MCC, which may represent meaningful discrimination given the very low disease incidence in the test set. Overall, all ML algorithms demonstrated reasonable performance under at least one configuration. Receiver operating characteristic curves with corresponding AUC values for selected model configurations are shown in <xref ref-type="fig" rid="figure3">Figure 3</xref>, and secondary performance metrics are provided in Table S10 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>. 
Compared with the null model, the developed models improved positive predictive value from 0.02 to up to 0.05 and achieved a high negative predictive value (up to 0.99), while maintaining a similar Brier score (0.02).</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>ROC curves for the selected configuration of each ML algorithm. AUC values in the legend represent point estimates on the test set with 95% CIs calculated by bootstrapping. AUC: area under the receiver operating characteristic curve; LR: logistic regression; RF: random forest; ROC: receiver operating characteristic; SGD: stochastic gradient descent; SVM: support vector machine; XGBoost: extreme gradient boosting.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e78931_fig03.png"/></fig></sec><sec id="s3-3-3-2"><title>LR Model</title><p>During model evaluation, the LR model trained on resampled datasets demonstrated a modest increase in sensitivity compared with the model trained on the original nonresampled data, and notably degraded specificity. Thus, for ML algorithm-based comparison, we have retained the crude LR model with all predictors and default hyperparameters. This model demonstrated balanced performance, achieving a test sensitivity of 0.70 (95% CI 0.57&#x2010;0.83), specificity of 0.71 (95% CI 0.69&#x2010;0.72), and an AUC of 0.75 (95% CI 0.68&#x2010;0.82). Importantly, the model showed no evidence of overfitting, with performance metrics remaining consistent across data splits.</p></sec><sec id="s3-3-3-3"><title>SGD Model</title><p>SGD, particularly when trained using the incrementally implemented PCUSTe-1 strategy with no tuning, achieved the strongest overall performance among the evaluated models. 
The highest observed performance was sensitivity of 0.77 (95% CI 0.64&#x2010;0.89), specificity of 0.65 (95% CI 0.63&#x2010;0.67), AUC of 0.77 (95% CI 0.70&#x2010;0.84), and MCC of 0.12 (95% CI 0.08&#x2010;0.16). Performance on training sets was highly consistent, although slightly lower compared with the test set. This model misclassified 11 out of 47 cancer cases as controls. Among the false negatives, diagnoses included colorectal (6/11), gastric (3/11), liver (1/11), and esophageal cancer (1/11), indicating that misclassification was not confined to a single GI cancer subtype.</p></sec><sec id="s3-3-3-4"><title>RF Model</title><p>RF showed over-optimization to the resampled training distribution, with a perfect training AUC that decreased significantly to 0.73 (95% CI 0.65&#x2010;0.81) on the test set. Nevertheless, the model maintained a test sensitivity of 0.77 (95% CI 0.65&#x2010;0.89) and specificity of 0.62 (95% CI 0.60&#x2010;0.64) when trained on the PCUSTe-2 dataset, suggesting moderate generalization despite evidence of overfitting. Notably, RF exhibited majority class overfitting when trained on all oversampling-based datasets. In contrast, training without resampling or with undersampling via PCUSTe resulted in a more balanced classification.</p></sec><sec id="s3-3-3-5"><title>XGBoost Model</title><p>Boosting models exhibited high sensitivity to hyperparameter tuning, especially when trained on oversampled data. The batch-trained XGBoost model demonstrated strong predictive capacity but exhibited substantial performance variability between training and test environments. Although it achieved a test sensitivity of 0.77 (95% CI 0.65&#x2010;0.89), specificity of 0.60 (95% CI 0.58&#x2010;0.62), AUC of 0.73 (95% CI 0.66&#x2010;0.79), and MCC of 0.11 (95% CI 0.07&#x2010;0.14), near-perfect performance on resampled training data (AUC 0.98) indicated pronounced overfitting. 
This effect was further amplified in the incremental XGBoost implementation, which showed slightly reduced test performance (sensitivity 0.70, specificity 0.62; AUC 0.68; MCC 0.09) alongside increased overfitting.</p></sec><sec id="s3-3-3-6"><title>SVM Model</title><p>SVM achieved the highest overall sensitivity of 0.79 (95% CI 0.67&#x2010;0.90). However, compared with the SGD model, this was accompanied by lower specificity (0.60; 95% CI 0.58&#x2010;0.62) and AUC (0.74; 95% CI 0.67&#x2010;0.80). Compared with boosting- and tree-based approaches, SVM exhibited less overfitting, maintaining stable discriminatory performance under the 2% incidence setting. Notably, we observed relatively stable SVM performance across all resampling methods.</p><p>Figure S3 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref> illustrates the performance of models across decision thresholds between 0.01 and 0.05. As expected, increasing the decision threshold resulted in a decrease in sensitivity and a corresponding increase in specificity. LR, SGD, and boosting models exhibited stable and monotonic sensitivity-specificity trade-offs, with SGD demonstrating an optimal balance between 0.02 and 0.03.</p></sec></sec></sec><sec id="s3-4"><title>Data Distribution Analysis</title><p>The PCA projections provide a top-level visual comparison of how each resampling method altered the class proportions within the primary components of variance (<xref ref-type="fig" rid="figure4">Figure 4</xref>). In the original dataset, the minority class is concentrated within a specific region already occupied by a high density of controls. The oversampling techniques increased the visibility of the minority class by increasing point density within those existing regions. In contrast, the undersampling methods did not alter the minority class but reduced the majority class instead. 
Notably, SMOTE+ ENN aggressively reduced the control population&#x2014;resulting in a dataset that was no longer balanced (with cases becoming the majority). Across all methods, the 2D projections suggest that resampling primarily shifts the relative density of the classes rather than creating clear linear separation in the first 2 principal components.</p><p>The distance heatmap evaluates the mean absolute predictor-wise difference between cases and controls (Figure S4 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>). Both PCUSTe-1 and PCUSTe-2 markedly reduced distances for age and several additional biological variables, despite these factors not being explicitly used for proportional matching. Distances between fatty acids and some micronutrients were increased instead. This suggests that matching based on socioeconomic variables (PCUSTe-1) or lifestyle factors (PCUSTe-2) indirectly aligns unselected physiological characteristics between cases and controls. In contrast, SMOTE+ ENN increased overall distances for age, blood pressure, and multiple other predictors.</p><p>The comparative analysis across varying k-NN values demonstrated that SMOTE and ADASYN exhibit highly similar behavior, generally maintaining the lowest mean distances to the original dataset and the narrowest gaps between classes (<xref ref-type="fig" rid="figure5">Figure 5</xref>). In contrast, SMOTE+ ENN showed a distinct upward trend in both case-control and control-control distances as the k-NN parameter increases. This suggests that the ENN cleaning process becomes more aggressive at higher neighbor scales, significantly increasing data sparsity and class separability by removing ambiguous majority instances. The PCUSTe-1 undersampling method remained constant as it is k-independent. 
Notably, PCUSTe-1 maintained significantly larger mean distances for cases compared with oversampling methods&#x2014;this is expected, as oversampling generates synthetic samples in close proximity to original minority observations&#x2014;whereas undersampling preserves the sparse, natural distribution of the original cases.</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Principal component analysis visualization of dataset distributions across various resampling strategies. To ensure a consistent baseline, the principal component analysis transformation was fitted on the original training data and applied across all subsets. Top row: original imbalanced distribution and popular oversampling methods. Bottom row: hybrid resampling via SMOTE+ edited nearest neighbors, and the proposed PCUSTe methods. Red markers represent gastrointestinal cancer cases, and green markers represent controls. Case and control counts for each method are indicated in the respective subplot legends. ADASYN: adaptive synthetic sampling; PCUSTe-1: PCUSTe with sociodemographic matching (education, employment, income, and marital status); PCUSTe-2: PCUSTe with lifestyle matching (smoking and drinking); SMOTE: synthetic minority over-sampling technique.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e78931_fig04.png"/></fig><fig position="float" id="figure5"><label>Figure 5.</label><caption><p>Comparison of mean distances among different k-NN values in oversampled datasets using SMOTE, ADASYN, and SMOTE+ ENN. PCUSTe-1 is included for comparison. (A) Mean distances between resampled and original datasets across k values (top-left); (B) Mean distances between cases and controls within each resampled dataset (top-right); (C) Mean distances between control samples in each resampled dataset (bottom-left); (D) Mean distances between case samples in resampled datasets (bottom-right). 
ADASYN: adaptive synthetic sampling; ENN: edited nearest neighbors; k-NN: k-nearest neighbors; PCUSTe-1: PCUSTe with sociodemographic matching (education, employment, income, and marital status); SMOTE: synthetic minority over-sampling technique.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e78931_fig05.png"/></fig></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>Our analysis highlights that extreme class imbalance materially affects ML performance in cohort-based GI cancer prediction. In our experiments, extensive oversampling&#x2014;even after k-NN parameter tuning&#x2014;resulted in overfitting in complex nonlinear classifiers (RF, XGBoost, and SVM), whereas linear classifiers (LR and SGD) were comparatively robust. We developed an undersampling method, PCUSTe, which achieved stable test set performance across multiple classifiers without relying on synthetic data. To mitigate the reduction in training data inherent to undersampling, we combined PCUSTe with an incrementally trained SGD model. Among all evaluated configurations, this approach yielded the greatest improvement in predictive performance. The model with the highest performance achieved an AUC of 0.77 (95% CI 0.70&#x2010;0.84) and sensitivity of 0.77 (95% CI 0.64&#x2010;0.89), while maintaining moderate specificity (0.65; 95% CI 0.63&#x2010;0.67). Importantly, our findings indicate that the application of resampling techniques to address extreme class imbalance should be tailored to specific study objectives. In this study, we prioritized sensitivity over specificity, consistent with the objective of identifying individuals at elevated GI cancer risk who may benefit from modification of dietary and early metabolic risk factors. 
Conversely, in settings where a high false-positive rate may impose a substantial burden on participants, threshold adjustment alone may be preferable to resampling-based approaches.</p></sec><sec id="s4-2"><title>Comparison to Previous Work</title><p>Cohort studies provide longitudinal insights into individual health trajectories prior to disease onset, offering valuable data for identifying early risk factors and informing preventive strategies. However, for diseases that are uncommon in the general population, class imbalance remains a persistent challenge. A US-based study using blood count data to predict GI cancers achieved moderate discrimination (area under the receiver operating characteristic curve [AUROC]=0.75) but a low <italic>F</italic><sub>1</sub>-score (0.03) when addressing imbalance solely via class weights [<xref ref-type="bibr" rid="ref47">47</xref>]. Similarly, although models such as XGBoost and SVM can outperform linear classifiers when combined with class weighting, they are not inherently robust to extreme imbalance [<xref ref-type="bibr" rid="ref48">48</xref>,<xref ref-type="bibr" rid="ref49">49</xref>].</p><p>In our study, nonlinear models trained without resampling yielded low to moderate sensitivity, often improving sensitivity at the expense of specificity or vice versa. For example, the XGBoost model trained on nonresampled data achieved only 0.06 sensitivity and 0.98 specificity, whereas the SVM trained on the same data showed 0.64 sensitivity and 0.54 specificity. Performance for both algorithms improved substantially when models were trained on data resampled using the PCUSTe-2 method. For instance, the XGBoost model trained on this dataset achieved a sensitivity of 0.66 with a specificity of 0.61 prior to tuning and sensitivity of 0.73 with specificity of 0.63 after tuning. 
These findings suggest that, in settings of extreme imbalance, algorithm choice alone may be insufficient to achieve adequate minority-class detection.</p><p>Resampling has therefore emerged as one of the most commonly used strategies to mitigate class imbalance. For instance, oversampling is frequently applied in biomedical ML model training, and previous studies have reported improved cancer prediction using methods such as SMOTE and its variants [<xref ref-type="bibr" rid="ref50">50</xref>,<xref ref-type="bibr" rid="ref51">51</xref>]. Although oversampling can help address moderate imbalance, the aggressive oversampling required to correct severe imbalance remains challenging, particularly when complex ML algorithms are used. For example, a Korean study using national health insurance data to predict gastric cancer in adults (65,657 cases and 10,450,292 controls) used the Random Oversampling Examples (ROSE) method to address imbalance [<xref ref-type="bibr" rid="ref52">52</xref>]. The authors reported the best performance with an LR model (AUROC 0.71) compared with decision tree and XGBoost classifiers. This aligns with our findings, where LR models trained on oversampled data consistently outperformed more complex classifiers trained on the same data.</p><p>When comparing performance across resampling strategies, complex models showed clear signs of overfitting, particularly on oversampled data. Overfitting in such datasets is well documented and remains an ongoing challenge despite numerous mitigation attempts [<xref ref-type="bibr" rid="ref53">53</xref>]. During oversampling with the default k-NN setting (k=5), synthetic minority instances are generated in very close proximity to the original samples. As shown in our data distribution analyses, the resulting datasets did not create a clear linear separation in the first 2 principal components but instead shifted the relative density of the classes. 
Larger k values produced sparser training datasets and reduced overfitting. Although oversampling methods are widely used in biomedical ML, many studies rely on default parameter settings. Our findings extend prior work by demonstrating that k-NN tuning substantially influences model performance, with higher k values potentially yielding more stable and accurate predictions.</p><p>In contrast, models trained on undersampled data may be less prone to overfitting but are limited to a reduced training size. By combining undersampling with an incremental SGD model, we attempted to address this limitation and expose models to a larger proportion of the original training dataset. Our results show that this approach improved sensitivity while maintaining moderate specificity. Compared with other evaluated models trained on undersampled data, this combined strategy produced better performance. However, due to the customized implementation, model tuning was difficult, and our tuning strategy did not improve test performance relative to the crude model, suggesting that more sophisticated tuning approaches may be required to further improve baseline performance.</p><p>When comparing performance across ML algorithms, our results showed a clear distinction between linear and more complex classifiers in terms of overfitting behavior. Prior studies have shown that RFs and boosted trees are particularly susceptible to overfitting in small or noisy datasets. For instance, a recent simulation study demonstrated that deep trees can form local probability &#x201C;peaks&#x201D; around training samples, inflating apparent discrimination while reducing generalizability [<xref ref-type="bibr" rid="ref54">54</xref>]. Interestingly, that study also found that RF models, despite overfitting the training data, could still achieve strong test performance. 
Consistent with these findings, although high training accuracy suggested overfitting, RF models trained on undersampled data maintained strong test-set performance in our study. When trained on oversampled data, RF models likely formed localized decision regions around dense clusters of synthetic minority samples, which may have led to much of the remaining feature space being classified as the majority class. Performance improved when a larger k parameter and ENN cleaning were applied, likely because increased inter-sample distances produced a more dispersed minority distribution and reduced this form of overfitting. In contrast, the SGD model exhibited benign underfitting, a form of regularization that limits training accuracy but enhances generalization [<xref ref-type="bibr" rid="ref55">55</xref>].</p><p>Several studies have raised concerns about data resampling approaches due to their potential to distort population structure and adversely affect model calibration [<xref ref-type="bibr" rid="ref56">56</xref>]. Decision threshold adjustment, therefore, emerges as an alternative strategy for addressing class imbalance. However, selecting an appropriate decision threshold remains challenging, and a variety of automated threshold-search methods have been proposed [<xref ref-type="bibr" rid="ref57">57</xref>]. Because this study focuses on strategies to mitigate class imbalance in GI cancer binary classification, we adopted the observed disease incidence as the decision threshold. 
Although the exact incidence in a target population cannot be known with certainty, estimates from epidemiologic studies are widely available and may be more appropriate for risk stratification in biomedical settings than thresholds derived from purely algorithmic optimization.</p><p>Although decision threshold adjustment alone achieved strong performance for selected models, PCUSTe-based training set resampling improved minority-class detection across most configurations, particularly among more complex learning algorithms. Both evaluated configurations&#x2014;PCUSTe-1 (matching on marital status, education, employment, and income) and PCUSTe-2 (matching on smoking and alcohol consumption)&#x2014;supported favorable predictive performance, consistent with established associations between socioeconomic and lifestyle factors and cancer risk [<xref ref-type="bibr" rid="ref58">58</xref>,<xref ref-type="bibr" rid="ref59">59</xref>]. Importantly, PCUSTe&#x2019;s flexible parameterization&#x2014;defined by its matching criteria and case-control ratio&#x2014;allows adaptation to different datasets and research contexts. We observed that linear models tended to perform better under PCUSTe-1 resampling, whereas more complex classifiers showed improved performance with PCUSTe-2. When considered alongside SHAP analyses, which indicated stronger contributions of dietary factors in linear models and greater influence of physiological signals in nonlinear models, this pattern may suggest that the 2 matching strategies emphasize distinct aspects of GI cancer risk. Collectively, these findings reinforce that no single ML algorithm or imbalance mitigation strategy is universally optimal. 
Instead, adaptable modeling frameworks capable of efficient retraining across populations may be better suited to support precision prevention in nutrition and cancer research [<xref ref-type="bibr" rid="ref60">60</xref>].</p><p>In ML, large sample sizes are often favored, and some studies address data sparsity by pooling multiple data sources. For example, a recent Danish study combined 3 national registries and full-text medical records to develop a multi-cancer prediction model using data from 6.7 million individuals. Although internal validation showed strong performance for GI cancers (AUCs: pancreatic 0.86, liver 0.90, colorectal 0.85, gastric 0.85, esophageal 0.89), external validation using the UK Biobank yielded substantially lower AUCs (0.65&#x2010;0.74) [<xref ref-type="bibr" rid="ref61">61</xref>]. Supported by multiple large-scale grants, this study illustrates the substantial resources required for population-level modeling. In contrast, our study used a smaller, grouped GI cancer dataset, and our models have not yet undergone external validation. While direct comparison is limited due to differences in sample size and feature composition, our models nonetheless demonstrated promising predictive performance. These findings suggest that careful resampling and study design may partially mitigate limitations inherent to small, highly imbalanced datasets, although they cannot substitute for large-scale population data.</p></sec><sec id="s4-3"><title>Strengths and Limitations</title><p>This study has several notable strengths. First, we leveraged longitudinal cohort data, enabling a robust assessment of risk factors preceding disease onset, and implemented a rigorous evaluation framework with strict separation between training and test datasets. 
Second, we introduced PCUSTe, a patient-centered undersampling framework that flexibly parameterizes matching criteria, offering a customizable, epidemiologically grounded approach to addressing class imbalance in small biomedical datasets. Third, we conducted a comprehensive comparison of resampling strategies across multiple ML algorithms and hyperparameter configurations.</p><p>Several limitations should also be acknowledged. First, dietary intake data were self-reported, introducing potential recall bias, and the lack of repeated dietary assessments limited our ability to capture temporal changes in exposure. To partially mitigate this, analyses were restricted to baseline assessments collected prior to diagnosis using standardized instruments. Second, although internal validation was performed, external validation is required to assess generalizability. While this limitation was partially addressed through conservative model evaluation, validation in independent cohorts remains an important next step. Third, the relatively small number of GI cancer cases may have constrained model performance and limited extrapolation to broader populations. We attempted to mitigate this by using multiple resampling strategies and applying class-prior-based decision threshold adjustment; however, larger sample sizes are needed to confirm the stability of the observed results.</p><p>In addition, the limited number of positive cases precluded robust modeling of site-specific GI cancer subtypes, necessitating a binarized pooled outcome. Although the final models did not appear to systematically misclassify risk for any specific GI cancer site, the biological and clinical heterogeneity of these malignancies should be acknowledged. Future analyses incorporating multi-site classification will be needed once sufficient sample sizes become available. 
Finally, while PCUSTe provides a promising solution for smaller cohorts, it further reduces the control population, and models trained on limited samples are inherently less competitive than those developed using large-scale datasets. To mitigate this, we paired PCUSTe with an incrementally trained SGD model, allowing iterative learning from multiple control subsets constrained by case covariate distributions. Nevertheless, caution is warranted when extrapolating model performance to broader clinical settings.</p></sec><sec id="s4-4"><title>Future Work</title><p>While PCUSTe and incidence-based threshold adjustment offer a practical and adaptable framework for improving risk stratification in cohort-based cancer prediction studies, external validation in larger and more diverse cohorts is required to establish the generalizability and scalability of these methods.</p></sec><sec id="s4-5"><title>Conclusions</title><p>This study demonstrates that a class imbalance mitigation strategy is a critical determinant of ML model performance in GI cancer risk prediction. By incorporating epidemiologically grounded matching principles, our patient-centered undersampling framework, PCUSTe, consistently outperformed conventional oversampling and hybrid approaches across multiple model architectures. However, incidence-based decision threshold adjustment alone, paired with a baseline LR model, showed better balance between sensitivity and specificity metrics. These findings underscore the value of aligning ML workflows with real-world population characteristics to enhance minority class detection in small, highly imbalanced datasets typical of rare or low-incidence disease research. 
Furthermore, the choice of the best class imbalance mitigation strategy may not be uniform across different study objectives.</p></sec></sec></body><back><notes><sec><title>Funding</title><p>This study was supported by the &#x201C;International Cooperation &#x0026; Education Program (NCCRI&#x00B7;NCCI 52210-52211, 2025)&#x201D; and the &#x201C;Intramural grant (2510880)&#x201D; of the National Cancer Center, Korea.</p></sec><sec><title>Data Availability</title><p>The datasets generated or analyzed during this study are available from the corresponding author on reasonable request.</p></sec></notes><fn-group><fn fn-type="con"><p>DB contributed to study planning, conceptualization, manuscript drafting, formal analysis, experimental execution, and visualization. JL contributed to manuscript revision, formal analysis, and data supervision. MG contributed to manuscript revision, formal analysis, and supervision of experiments. JK contributed to study planning, conceptualization, manuscript revision, overall study supervision, and correspondence. 
All authors read and approved the final manuscript.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">ADASYN</term><def><p>adaptive synthetic sampling</p></def></def-item><def-item><term id="abb2">AUC</term><def><p>area under the receiver operating characteristic curve</p></def></def-item><def-item><term id="abb3">AUROC</term><def><p>area under the receiver operating characteristic curve</p></def></def-item><def-item><term id="abb4">DBP</term><def><p>diastolic blood pressure</p></def></def-item><def-item><term id="abb5">ENN</term><def><p>edited nearest neighbors</p></def></def-item><def-item><term id="abb6">EPV</term><def><p>events per variable</p></def></def-item><def-item><term id="abb7">GI</term><def><p>gastrointestinal</p></def></def-item><def-item><term id="abb8"><italic>ICD-10</italic></term><def><p><italic>International Statistical Classification of Diseases, Tenth Revision</italic></p></def></def-item><def-item><term id="abb9">k-NN</term><def><p>k-nearest neighbor</p></def></def-item><def-item><term id="abb10">KNCC</term><def><p>Korean National Cancer Center</p></def></def-item><def-item><term id="abb11">LR</term><def><p>logistic regression</p></def></def-item><def-item><term id="abb12">MCC</term><def><p>Matthews correlation coefficient</p></def></def-item><def-item><term id="abb13">ML</term><def><p>machine learning</p></def></def-item><def-item><term id="abb14">PCA</term><def><p>principal component analysis</p></def></def-item><def-item><term id="abb15">PCUSTe</term><def><p>patient-centered undersampling technique</p></def></def-item><def-item><term id="abb16">PCUSTe-1</term><def><p>PCUSTe with sociodemographic matching (education, employment, income, and marital status)</p></def></def-item><def-item><term id="abb17">PCUSTe-2</term><def><p>PCUSTe with lifestyle matching (smoking and drinking)</p></def></def-item><def-item><term id="abb18">RF</term><def><p>random 
forest</p></def></def-item><def-item><term id="abb19">ROSE</term><def><p>Random Oversampling Examples</p></def></def-item><def-item><term id="abb20">RS</term><def><p>RandomizedSearchCV</p></def></def-item><def-item><term id="abb21">SBP</term><def><p>systolic blood pressure</p></def></def-item><def-item><term id="abb22">SGD</term><def><p>stochastic gradient descent</p></def></def-item><def-item><term id="abb23">SHAP</term><def><p>Shapley additive explanations</p></def></def-item><def-item><term id="abb24">SMOTE</term><def><p>synthetic minority over-sampling technique</p></def></def-item><def-item><term id="abb25">SVM</term><def><p>support vector machine</p></def></def-item><def-item><term id="abb26">XGBoost</term><def><p>extreme gradient boosting</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Arnold</surname><given-names>M</given-names> </name><name name-style="western"><surname>Abnet</surname><given-names>CC</given-names> </name><name name-style="western"><surname>Neale</surname><given-names>RE</given-names> </name><etal/></person-group><article-title>Global burden of 5 major types of gastrointestinal cancer</article-title><source>Gastroenterology</source><year>2020</year><month>07</month><volume>159</volume><issue>1</issue><fpage>335</fpage><lpage>349</lpage><pub-id pub-id-type="doi">10.1053/j.gastro.2020.02.068</pub-id><pub-id pub-id-type="medline">32247694</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Duan</surname><given-names>B</given-names> </name><name name-style="western"><surname>Zhao</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Bai</surname><given-names>J</given-names> </name><etal/></person-group><person-group 
person-group-type="editor"><name name-style="western"><surname>Morgado-Diaz</surname><given-names>JA</given-names> </name></person-group><article-title>Colorectal cancer: an overview</article-title><source>Gastrointestinal Cancers</source><year>2022</year><publisher-name>Exon Publications</publisher-name><pub-id pub-id-type="medline">36077731</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Thuler</surname><given-names>LCS</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Morgado-Diaz</surname><given-names>JA</given-names> </name></person-group><article-title>The epidemiology of stomach cancer</article-title><source>Gastrointestinal Cancers</source><year>2022</year><publisher-name>Exon Publications</publisher-name><pub-id pub-id-type="medline">36343146</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bray</surname><given-names>F</given-names> </name><name name-style="western"><surname>Laversanne</surname><given-names>M</given-names> </name><name name-style="western"><surname>Sung</surname><given-names>H</given-names> </name><etal/></person-group><article-title>Global cancer statistics 2022: GLOBOCAN estimates of incidence and mortality worldwide for 36 cancers in 185 countries</article-title><source>CA Cancer J Clin</source><year>2024</year><volume>74</volume><issue>3</issue><fpage>229</fpage><lpage>263</lpage><pub-id pub-id-type="doi">10.3322/caac.21834</pub-id><pub-id pub-id-type="medline">38572751</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shin</surname><given-names>WS</given-names> </name><name 
name-style="western"><surname>Xie</surname><given-names>F</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>B</given-names> </name><etal/></person-group><article-title>Updated epidemiology of gastric cancer in Asia: decreased incidence but still a big challenge</article-title><source>Cancers (Basel)</source><year>2023</year><month>05</month><day>6</day><volume>15</volume><issue>9</issue><fpage>2639</fpage><pub-id pub-id-type="doi">10.3390/cancers15092639</pub-id><pub-id pub-id-type="medline">37174105</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kratz</surname><given-names>JD</given-names> </name><name name-style="western"><surname>Klein</surname><given-names>AB</given-names> </name><name name-style="western"><surname>Gray</surname><given-names>CB</given-names> </name><etal/></person-group><article-title>The epidemiology of biliary tract cancer and associated prevalence of MDM2 amplification: a targeted literature review</article-title><source>Target Oncol</source><year>2024</year><month>11</month><volume>19</volume><issue>6</issue><fpage>833</fpage><lpage>844</lpage><pub-id pub-id-type="doi">10.1007/s11523-024-01086-5</pub-id><pub-id pub-id-type="medline">39302603</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jiang</surname><given-names>D</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Burden of gastrointestinal tumors in Asian countries, 1990-2021: an analysis for the global burden of disease study 2021</article-title><source>Clin 
Epidemiol</source><year>2024</year><volume>16</volume><fpage>587</fpage><lpage>601</lpage><pub-id pub-id-type="doi">10.2147/CLEP.S472553</pub-id><pub-id pub-id-type="medline">39252850</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Morgan</surname><given-names>E</given-names> </name><name name-style="western"><surname>Arnold</surname><given-names>M</given-names> </name><name name-style="western"><surname>Camargo</surname><given-names>MC</given-names> </name><etal/></person-group><article-title>The current and future incidence and mortality of gastric cancer in 185 countries, 2020-40: a population-based modelling study</article-title><source>EClinicalMedicine</source><year>2022</year><month>05</month><volume>47</volume><fpage>101404</fpage><pub-id pub-id-type="doi">10.1016/j.eclinm.2022.101404</pub-id><pub-id pub-id-type="medline">35497064</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Moons</surname><given-names>KGM</given-names> </name><name name-style="western"><surname>Royston</surname><given-names>P</given-names> </name><name name-style="western"><surname>Vergouwe</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Grobbee</surname><given-names>DE</given-names> </name><name name-style="western"><surname>Altman</surname><given-names>DG</given-names> </name></person-group><article-title>Prognosis and prognostic research: what, why, and how?</article-title><source>BMJ</source><year>2009</year><month>02</month><day>23</day><volume>338</volume><issue>feb23 1</issue><fpage>b375</fpage><lpage>b375</lpage><pub-id pub-id-type="doi">10.1136/bmj.b375</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Altman</surname><given-names>DG</given-names> </name></person-group><article-title>Prognostic models: a methodological framework and review of models for breast cancer</article-title><source>Cancer Invest</source><year>2009</year><month>03</month><volume>27</volume><issue>3</issue><fpage>235</fpage><lpage>243</lpage><pub-id pub-id-type="doi">10.1080/07357900802572110</pub-id><pub-id pub-id-type="medline">19291527</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Moons</surname><given-names>KGM</given-names> </name><name name-style="western"><surname>Wolff</surname><given-names>RF</given-names> </name><name name-style="western"><surname>Riley</surname><given-names>RD</given-names> </name><etal/></person-group><article-title>PROBAST: a tool to assess risk of bias and applicability of prediction model studies: explanation and elaboration</article-title><source>Ann Intern Med</source><year>2019</year><month>01</month><day>1</day><volume>170</volume><issue>1</issue><fpage>W1</fpage><lpage>W33</lpage><pub-id pub-id-type="doi">10.7326/M18-1377</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>J</given-names> </name></person-group><article-title>Construction of a random survival forest model based on a machine learning algorithm to predict early recurrence after hepatectomy for adult hepatocellular carcinoma</article-title><source>BMC 
Cancer</source><year>2024</year><volume>24</volume><issue>1</issue><fpage>1575</fpage><pub-id pub-id-type="doi">10.1186/s12885-024-13366-4</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tran</surname><given-names>TT</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>J</given-names> </name><name name-style="western"><surname>Gunathilake</surname><given-names>M</given-names> </name><etal/></person-group><article-title>A comparison of machine learning models and Cox proportional hazards models regarding their ability to predict the risk of gastrointestinal cancer based on metabolic syndrome and its components</article-title><source>Front Oncol</source><year>2023</year><volume>13</volume><fpage>1049787</fpage><pub-id pub-id-type="doi">10.3389/fonc.2023.1049787</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kim</surname><given-names>H</given-names> </name><name name-style="western"><surname>Park</surname><given-names>T</given-names> </name><name name-style="western"><surname>Jang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>S</given-names> </name></person-group><article-title>Comparison of survival prediction models for pancreatic cancer: cox model versus machine learning models</article-title><source>Genomics Inform</source><year>2022</year><month>06</month><volume>20</volume><issue>2</issue><fpage>e23</fpage><pub-id pub-id-type="doi">10.5808/gi.22036</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kang</surname><given-names>SU</given-names> </name><name 
name-style="western"><surname>Nam</surname><given-names>SJ</given-names> </name><name name-style="western"><surname>Kwon</surname><given-names>OB</given-names> </name><etal/></person-group><article-title>Predictive mortality and gastric cancer risk using clinical and socio-economic data: a nationwide multicenter cohort study</article-title><source>Cancers (Basel)</source><year>2024</year><month>12</month><day>25</day><volume>17</volume><issue>1</issue><fpage>30</fpage><pub-id pub-id-type="doi">10.3390/cancers17010030</pub-id><pub-id pub-id-type="medline">39796661</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Miglietta</surname><given-names>F</given-names> </name><name name-style="western"><surname>Collesei</surname><given-names>A</given-names> </name><name name-style="western"><surname>Vernieri</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Development of two machine learning models to predict conversion from primary HER2-0 breast cancer to HER2-low metastases: a proof-of-concept study</article-title><source>ESMO Open</source><year>2025</year><month>01</month><volume>10</volume><issue>1</issue><fpage>104087</fpage><pub-id pub-id-type="doi">10.1016/j.esmoop.2024.104087</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Menardi</surname><given-names>G</given-names> </name><name name-style="western"><surname>Torelli</surname><given-names>N</given-names> </name></person-group><article-title>Training and assessing classification rules with imbalanced data</article-title><source>Data Min Knowl Disc</source><year>2014</year><month>01</month><volume>28</volume><issue>1</issue><fpage>92</fpage><lpage>122</lpage><pub-id 
pub-id-type="doi">10.1007/s10618-012-0295-5</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rahman</surname><given-names>A</given-names> </name><name name-style="western"><surname>Nahid</surname><given-names>N</given-names> </name><name name-style="western"><surname>Schuller</surname><given-names>B</given-names> </name><name name-style="western"><surname>Ahad</surname><given-names>MAR</given-names> </name></person-group><article-title>A stacked CNN and random forest ensemble architecture for complex nursing activity recognition and nurse identification</article-title><source>Sci Rep</source><year>2024</year><volume>14</volume><issue>1</issue><fpage>31667</fpage><pub-id pub-id-type="doi">10.1038/s41598-024-81228-x</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Triantafillidis</surname><given-names>JK</given-names> </name><name name-style="western"><surname>Georgiou</surname><given-names>K</given-names> </name><name name-style="western"><surname>Konstadoulakis</surname><given-names>MM</given-names> </name><name name-style="western"><surname>Papalois</surname><given-names>AE</given-names> </name></person-group><article-title>Early-onset gastrointestinal cancer: An epidemiological reality with great significance and implications</article-title><source>World J Gastrointest Oncol</source><year>2024</year><month>03</month><day>15</day><volume>16</volume><issue>3</issue><fpage>583</fpage><lpage>597</lpage><pub-id pub-id-type="doi">10.4251/wjgo.v16.i3.583</pub-id><pub-id pub-id-type="medline">38577465</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Poorolajal</surname><given-names>J</given-names> </name><name name-style="western"><surname>Moradi</surname><given-names>L</given-names> </name><name name-style="western"><surname>Mohammadi</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Cheraghi</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Gohari-Ensaf</surname><given-names>F</given-names> </name></person-group><article-title>Risk factors for stomach cancer: a systematic review and meta-analysis</article-title><source>Epidemiol Health</source><year>2020</year><volume>42</volume><fpage>e2020004</fpage><pub-id pub-id-type="doi">10.4178/epih.e2020004</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ilic</surname><given-names>I</given-names> </name><name name-style="western"><surname>Zivanovic Macuzic</surname><given-names>I</given-names> </name><name name-style="western"><surname>Ravic-Nikolic</surname><given-names>A</given-names> </name><name name-style="western"><surname>Ilic</surname><given-names>M</given-names> </name><name name-style="western"><surname>Milicic</surname><given-names>V</given-names> </name></person-group><article-title>Global burden of esophageal cancer and its risk factors: a systematic analysis of the global burden of disease study 2019</article-title><source>Life</source><year>2024</year><volume>15</volume><issue>1</issue><fpage>24</fpage><pub-id pub-id-type="doi">10.3390/life15010024</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Roshandel</surname><given-names>G</given-names> </name><name name-style="western"><surname>Ghasemi-Kebria</surname><given-names>F</given-names> </name><name 
name-style="western"><surname>Malekzadeh</surname><given-names>R</given-names> </name></person-group><article-title>Colorectal cancer: epidemiology, risk factors, and prevention</article-title><source>Cancers (Basel)</source><year>2024</year><month>04</month><day>17</day><volume>16</volume><issue>8</issue><fpage>1530</fpage><pub-id pub-id-type="doi">10.3390/cancers16081530</pub-id><pub-id pub-id-type="medline">38672612</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hu</surname><given-names>JX</given-names> </name><name name-style="western"><surname>Zhao</surname><given-names>CF</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>WB</given-names> </name><etal/></person-group><article-title>Pancreatic cancer: a review of epidemiology, trend, and risk factors</article-title><source>World J Gastroenterol</source><year>2021</year><month>07</month><day>21</day><volume>27</volume><issue>27</issue><fpage>4298</fpage><lpage>4321</lpage><pub-id pub-id-type="doi">10.3748/wjg.v27.i27.4298</pub-id><pub-id pub-id-type="medline">34366606</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Ou</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>D</given-names> </name><etal/></person-group><article-title>The trends in death of primary liver cancer caused by specific etiologies worldwide: results from the global burden of disease study 2019 and implications for liver cancer management</article-title><source>BMC Cancer</source><year>2023</year><volume>23</volume><issue>1</issue><fpage>598</fpage><pub-id 
pub-id-type="doi">10.1186/s12885-023-11038-3</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Moglia</surname><given-names>V</given-names> </name><name name-style="western"><surname>Johnson</surname><given-names>O</given-names> </name><name name-style="western"><surname>Cook</surname><given-names>G</given-names> </name><name name-style="western"><surname>de Kamps</surname><given-names>M</given-names> </name><name name-style="western"><surname>Smith</surname><given-names>L</given-names> </name></person-group><article-title>Artificial intelligence methods applied to longitudinal data from electronic health records for prediction of cancer: a scoping review</article-title><source>BMC Med Res Methodol</source><year>2025</year><month>01</month><day>28</day><volume>25</volume><issue>1</issue><fpage>24</fpage><pub-id pub-id-type="doi">10.1186/s12874-025-02473-w</pub-id><pub-id pub-id-type="medline">39875808</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chawla</surname><given-names>NV</given-names> </name><name name-style="western"><surname>Bowyer</surname><given-names>KW</given-names> </name><name name-style="western"><surname>Hall</surname><given-names>LO</given-names> </name><name name-style="western"><surname>Kegelmeyer</surname><given-names>WP</given-names> </name></person-group><article-title>SMOTE: synthetic minority over-sampling technique</article-title><source>JAIR</source><year>2002</year><volume>16</volume><fpage>321</fpage><lpage>357</lpage><pub-id pub-id-type="doi">10.1613/jair.953</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kim</surname><given-names>J</given-names> 
</name></person-group><article-title>Cancer screenee cohort study of the national cancer center in South Korea</article-title><source>Epidemiol Health</source><year>2014</year><volume>36</volume><fpage>e2014013</fpage><pub-id pub-id-type="doi">10.4178/epih/e2014013</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Breu</surname><given-names>AC</given-names> </name><name name-style="western"><surname>Patwardhan</surname><given-names>VR</given-names> </name><name name-style="western"><surname>Nayor</surname><given-names>J</given-names> </name><etal/></person-group><article-title>A multicenter study into causes of severe acute liver injury</article-title><source>Clin Gastroenterol Hepatol</source><year>2019</year><month>05</month><volume>17</volume><issue>6</issue><fpage>1201</fpage><lpage>1203</lpage><pub-id pub-id-type="doi">10.1016/j.cgh.2018.08.016</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Xing</surname><given-names>M</given-names> </name><name name-style="western"><surname>Gao</surname><given-names>M</given-names> </name><name name-style="western"><surname>Li</surname><given-names>J</given-names> </name><name name-style="western"><surname>Han</surname><given-names>P</given-names> </name><name name-style="western"><surname>Mei</surname><given-names>L</given-names> </name><name name-style="western"><surname>Zhao</surname><given-names>L</given-names> </name></person-group><article-title>Characteristics of peripheral blood Gamma-glutamyl transferase in different liver diseases</article-title><source>Medicine (Baltimore)</source><year>2022</year><month>01</month><day>7</day><volume>101</volume><issue>1</issue><fpage>e28443</fpage><pub-id pub-id-type="doi">10.1097/MD.0000000000028443</pub-id><pub-id 
pub-id-type="medline">35029891</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Parhofer</surname><given-names>KG</given-names> </name><name name-style="western"><surname>Laufs</surname><given-names>U</given-names> </name></person-group><article-title>The diagnosis and treatment of hypertriglyceridemia</article-title><source>Dtsch Arztebl Int</source><year>2019</year><month>12</month><day>6</day><volume>116</volume><issue>49</issue><fpage>825</fpage><lpage>832</lpage><pub-id pub-id-type="doi">10.3238/arztebl.2019.0825</pub-id><pub-id pub-id-type="medline">31888796</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jung</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ha</surname><given-names>E</given-names> </name><name name-style="western"><surname>Kwon</surname><given-names>O</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>H</given-names> </name></person-group><article-title>Development of a semi-quantitative food frequency questionnaire for dietary intake of elementary school children: data from the Seventh Korea National Health and Nutrition Examination Survey</article-title><source>Nutr Res Pract</source><year>2023</year><volume>17</volume><issue>4</issue><fpage>747</fpage><pub-id pub-id-type="doi">10.4162/nrp.2023.17.4.747</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yun</surname><given-names>SH</given-names> </name><name name-style="western"><surname>Shim</surname><given-names>JS</given-names> </name><name name-style="western"><surname>Kweon</surname><given-names>S</given-names> </name><name 
name-style="western"><surname>Oh</surname><given-names>K</given-names> </name></person-group><article-title>Development of a food frequency questionnaire for the Korea National Health and Nutrition Examination Survey: data from the Fourth Korea National Health and Nutrition Examination Survey (KNHANES IV)</article-title><source>Korean J Nutr</source><year>2013</year><volume>46</volume><issue>2</issue><fpage>186</fpage><pub-id pub-id-type="doi">10.4163/kjn.2013.46.2.186</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Willett</surname><given-names>WC</given-names> </name><name name-style="western"><surname>Howe</surname><given-names>GR</given-names> </name><name name-style="western"><surname>Kushi</surname><given-names>LH</given-names> </name></person-group><article-title>Adjustment for total energy intake in epidemiologic studies</article-title><source>Am J Clin Nutr</source><year>1997</year><month>04</month><volume>65</volume><issue>4</issue><fpage>1220S</fpage><lpage>1228S</lpage><pub-id pub-id-type="doi">10.1093/ajcn/65.4.1220S</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Friedewald</surname><given-names>WT</given-names> </name><name name-style="western"><surname>Levy</surname><given-names>RI</given-names> </name><name name-style="western"><surname>Fredrickson</surname><given-names>DS</given-names> </name></person-group><article-title>Estimation of the concentration of low-density lipoprotein cholesterol in plasma, without use of the preparative ultracentrifuge</article-title><source>Clin Chem</source><year>1972</year><month>06</month><volume>18</volume><issue>6</issue><fpage>499</fpage><lpage>502</lpage><pub-id pub-id-type="medline">4337382</pub-id></nlm-citation></ref><ref 
id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jankovic</surname><given-names>S</given-names> </name></person-group><article-title>Tests for comparison of two groups: student&#x2019;s T-test, Mann-Whitney U-test and chi-square test</article-title><source>Int J Biomed Healthc</source><year>2022</year><volume>10</volume><issue>2</issue><fpage>134</fpage><pub-id pub-id-type="doi">10.5455/ijbh.2022.10.134-136</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Vittinghoff</surname><given-names>E</given-names> </name><name name-style="western"><surname>McCulloch</surname><given-names>CE</given-names> </name></person-group><article-title>Relaxing the rule of ten events per variable in logistic and Cox regression</article-title><source>Am J Epidemiol</source><year>2007</year><month>03</month><day>15</day><volume>165</volume><issue>6</issue><fpage>710</fpage><lpage>718</lpage><pub-id pub-id-type="doi">10.1093/aje/kwk052</pub-id><pub-id pub-id-type="medline">17182981</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Peng</surname><given-names>CYJ</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>KL</given-names> </name><name name-style="western"><surname>Ingersoll</surname><given-names>GM</given-names> </name></person-group><article-title>An introduction to logistic regression analysis and reporting</article-title><source>J Educ Res</source><year>2002</year><month>09</month><volume>96</volume><issue>1</issue><fpage>3</fpage><lpage>14</lpage><pub-id pub-id-type="doi">10.1080/00220670209598786</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="book"><person-group 
person-group-type="author"><name name-style="western"><surname>Bottou</surname><given-names>L</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Montavon</surname><given-names>G</given-names> </name><name name-style="western"><surname>Orr</surname><given-names>GB</given-names> </name><name name-style="western"><surname>M&#x00FC;ller</surname><given-names>KR</given-names> </name></person-group><article-title>Stochastic gradient descent tricks</article-title><source>Neural Networks: Tricks of the Trade</source><year>2012</year><publisher-name>Springer</publisher-name><fpage>421</fpage><lpage>436</lpage><pub-id pub-id-type="doi">10.1007/978-3-642-35289-8_25</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Breiman</surname><given-names>L</given-names> </name></person-group><article-title>Random forests</article-title><source>Mach Learn</source><year>2001</year><month>10</month><volume>45</volume><issue>1</issue><fpage>5</fpage><lpage>32</lpage><pub-id pub-id-type="doi">10.1023/A:1010933404324</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>T</given-names> </name><name name-style="western"><surname>Guestrin</surname><given-names>C</given-names> </name></person-group><article-title>XGBoost: a scalable tree boosting system</article-title><conf-name>Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining</conf-name><conf-date>Aug 13-17, 2016</conf-date><conf-loc>San Francisco, USA</conf-loc><fpage>785</fpage><lpage>794</lpage><pub-id pub-id-type="doi">10.1145/2939672.2939785</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Hearst</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Dumais</surname><given-names>ST</given-names> </name><name name-style="western"><surname>Osuna</surname><given-names>E</given-names> </name><name name-style="western"><surname>Platt</surname><given-names>J</given-names> </name><name name-style="western"><surname>Sch&#x00F6;lkopf</surname><given-names>B</given-names> </name></person-group><article-title>Support vector machines</article-title><source>IEEE Intell Syst Their Appl</source><year>1998</year><volume>13</volume><issue>4</issue><fpage>18</fpage><lpage>28</lpage><pub-id pub-id-type="doi">10.1109/5254.708428</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Lundberg</surname><given-names>SM</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>SI</given-names> </name></person-group><article-title>A unified approach to interpreting model predictions</article-title><access-date>2026-05-13</access-date><conf-name>31st Conference on Neural Information Processing Systems (NIPS 2017)</conf-name><conf-date>Dec 4-9, 2017</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://proceedings.neurips.cc/paper/2017/file/8a20a8621978632d76c43dfd28b67767-Paper.pdf">https://proceedings.neurips.cc/paper/2017/file/8a20a8621978632d76c43dfd28b67767-Paper.pdf</ext-link></comment></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chicco</surname><given-names>D</given-names> </name><name name-style="western"><surname>Jurman</surname><given-names>G</given-names> </name></person-group><article-title>The advantages of the Matthews correlation coefficient (MCC) over F1 score and accuracy in binary classification 
evaluation</article-title><source>BMC Genomics</source><year>2020</year><month>01</month><day>2</day><volume>21</volume><issue>1</issue><fpage>6</fpage><pub-id pub-id-type="doi">10.1186/s12864-019-6413-7</pub-id><pub-id pub-id-type="medline">31898477</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bekkar</surname><given-names>M</given-names> </name><name name-style="western"><surname>Djemaa</surname><given-names>HK</given-names> </name><name name-style="western"><surname>Alitouche</surname><given-names>TA</given-names> </name></person-group><article-title>Evaluation measures for models assessment over imbalanced data sets</article-title><source>J Inf Eng Appl</source><year>2013</year><access-date>2026-05-13</access-date><volume>3</volume><issue>10</issue><fpage>10</fpage><lpage>24</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://www.aircconline.com/ijdkp/V3N4/3413ijdkp02.pdf">https://www.aircconline.com/ijdkp/V3N4/3413ijdkp02.pdf</ext-link></comment><pub-id pub-id-type="doi">10.5121/ijdkp.2013.3402</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Dal Pozzolo</surname><given-names>A</given-names> </name><name name-style="western"><surname>Caelen</surname><given-names>O</given-names> </name><name name-style="western"><surname>Johnson</surname><given-names>RA</given-names> </name><name name-style="western"><surname>Bontempi</surname><given-names>G</given-names> </name></person-group><article-title>Calibrating probability with undersampling for unbalanced classification</article-title><conf-name>2015 IEEE Symposium Series on Computational Intelligence (SSCI)</conf-name><conf-date>Dec 7-10, 2015</conf-date><conf-loc>Cape Town, South Africa</conf-loc><fpage>159</fpage><lpage>166</lpage><pub-id 
pub-id-type="doi">10.1109/SSCI.2015.33</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Everitt</surname><given-names>B</given-names> </name></person-group><article-title>Principal components analysis</article-title><source>An Introduction to Applied Multivariate Analysis with R</source><year>2011</year><fpage>61</fpage><lpage>103</lpage><pub-id pub-id-type="doi">10.1007/978-1-4419-9650-3_3</pub-id></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Read</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>W</given-names> </name><name name-style="western"><surname>Saini</surname><given-names>SD</given-names> </name><name name-style="western"><surname>Zhu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Waljee</surname><given-names>AK</given-names> </name></person-group><article-title>Prediction of gastrointestinal tract cancers using longitudinal electronic health record data</article-title><source>Cancers (Basel)</source><year>2023</year><month>02</month><day>22</day><volume>15</volume><issue>5</issue><fpage>1399</fpage><pub-id pub-id-type="doi">10.3390/cancers15051399</pub-id><pub-id pub-id-type="medline">36900192</pub-id></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jung</surname><given-names>KM</given-names> </name></person-group><article-title>Support vector machines for unbalanced multicategory classification</article-title><source>Math Probl Eng</source><year>2015</year><volume>2015</volume><fpage>1</fpage><lpage>7</lpage><pub-id pub-id-type="doi">10.1155/2015/294985</pub-id></nlm-citation></ref><ref 
id="ref49"><label>49</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yan</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Wen</surname><given-names>H</given-names> </name></person-group><article-title>Electricity theft detection base on extreme gradient boosting in AMI</article-title><source>IEEE Trans Instrum Meas</source><year>2021</year><volume>70</volume><fpage>1</fpage><lpage>9</lpage><pub-id pub-id-type="doi">10.1109/TIM.2020.3048784</pub-id></nlm-citation></ref><ref id="ref50"><label>50</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Muraru</surname><given-names>MM</given-names> </name><name name-style="western"><surname>Sim&#x00F3;</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Iantovics</surname><given-names>LB</given-names> </name></person-group><article-title>Cervical cancer prediction based on imbalanced data using machine learning algorithms with a variety of sampling methods</article-title><source>Appl Sci (Basel)</source><year>2024</year><volume>14</volume><issue>22</issue><fpage>10085</fpage><pub-id pub-id-type="doi">10.3390/app142210085</pub-id></nlm-citation></ref><ref id="ref51"><label>51</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Alsmariy</surname><given-names>R</given-names> </name><name name-style="western"><surname>Healy</surname><given-names>G</given-names> </name><name name-style="western"><surname>Abdelhafez</surname><given-names>H</given-names> </name></person-group><article-title>Predicting cervical cancer using machine learning methods</article-title><source>Int J Adv Comput Sci Appl</source><year>2020</year><volume>11</volume><issue>7</issue><fpage>173</fpage><lpage>184</lpage><pub-id pub-id-type="doi">10.14569/IJACSA.2020.0110723</pub-id></nlm-citation></ref><ref 
id="ref52"><label>52</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Park</surname><given-names>B</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>CH</given-names> </name><name name-style="western"><surname>Jun</surname><given-names>JK</given-names> </name><etal/></person-group><article-title>A machine learning risk prediction model for gastric cancer with Shapley additive explanations</article-title><source>Cancer Res Treat</source><year>2025</year><month>07</month><volume>57</volume><issue>3</issue><fpage>821</fpage><lpage>829</lpage><pub-id pub-id-type="doi">10.4143/crt.2024.843</pub-id></nlm-citation></ref><ref id="ref53"><label>53</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Alkhawaldeh</surname><given-names>IM</given-names> </name><name name-style="western"><surname>Albalkhi</surname><given-names>I</given-names> </name><name name-style="western"><surname>Naswhan</surname><given-names>AJ</given-names> </name></person-group><article-title>Challenges and limitations of synthetic minority oversampling techniques in machine learning</article-title><source>World J Methodol</source><year>2023</year><month>12</month><day>20</day><volume>13</volume><issue>5</issue><fpage>373</fpage><lpage>378</lpage><pub-id pub-id-type="doi">10.5662/wjm.v13.i5.373</pub-id><pub-id pub-id-type="medline">38229946</pub-id></nlm-citation></ref><ref id="ref54"><label>54</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Barre&#x00F1;ada</surname><given-names>L</given-names> </name><name name-style="western"><surname>Dhiman</surname><given-names>P</given-names> </name><name name-style="western"><surname>Timmerman</surname><given-names>D</given-names> </name><name 
name-style="western"><surname>Boulesteix</surname><given-names>AL</given-names> </name><name name-style="western"><surname>Van Calster</surname><given-names>B</given-names> </name></person-group><article-title>Understanding overfitting in random forest for probability estimation: a visualization and simulation study</article-title><source>Diagn Progn Res</source><year>2024</year><month>09</month><day>27</day><volume>8</volume><issue>1</issue><fpage>14</fpage><pub-id pub-id-type="doi">10.1186/s41512-024-00177-1</pub-id><pub-id pub-id-type="medline">39334348</pub-id></nlm-citation></ref><ref id="ref55"><label>55</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Koren</surname><given-names>T</given-names> </name><name name-style="western"><surname>Livni</surname><given-names>R</given-names> </name><name name-style="western"><surname>Mansour</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Sherman</surname><given-names>U</given-names> </name></person-group><article-title>Benign underfitting of stochastic gradient descent</article-title><year>2022</year><access-date>2026-05-13</access-date><conf-name>Advances in Neural Information Processing Systems 35</conf-name><conf-date>Nov 28 to Dec 9, 2022</conf-date><conf-loc>New Orleans, Louisiana, USA</conf-loc><fpage>19605</fpage><lpage>19617</lpage><comment><ext-link ext-link-type="uri" xlink:href="http://www.proceedings.com/68431.html">http://www.proceedings.com/68431.html</ext-link></comment><pub-id pub-id-type="doi">10.52202/068431-1425</pub-id></nlm-citation></ref><ref id="ref56"><label>56</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>van den Goorbergh</surname><given-names>R</given-names> </name><name name-style="western"><surname>van Smeden</surname><given-names>M</given-names> </name><name 
name-style="western"><surname>Timmerman</surname><given-names>D</given-names> </name><name name-style="western"><surname>Van Calster</surname><given-names>B</given-names> </name></person-group><article-title>The harm of class imbalance corrections for risk prediction models: illustration and simulation using logistic regression</article-title><source>J Am Med Inform Assoc</source><year>2022</year><month>08</month><day>16</day><volume>29</volume><issue>9</issue><fpage>1525</fpage><lpage>1534</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocac093</pub-id><pub-id pub-id-type="medline">35686364</pub-id></nlm-citation></ref><ref id="ref57"><label>57</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Esposito</surname><given-names>C</given-names> </name><name name-style="western"><surname>Landrum</surname><given-names>GA</given-names> </name><name name-style="western"><surname>Schneider</surname><given-names>N</given-names> </name><name name-style="western"><surname>Stiefl</surname><given-names>N</given-names> </name><name name-style="western"><surname>Riniker</surname><given-names>S</given-names> </name></person-group><article-title>GHOST: adjusting the decision threshold to handle imbalanced data in machine learning</article-title><source>J Chem Inf Model</source><year>2021</year><month>06</month><day>28</day><volume>61</volume><issue>6</issue><fpage>2623</fpage><lpage>2640</lpage><pub-id pub-id-type="doi">10.1021/acs.jcim.1c00160</pub-id><pub-id pub-id-type="medline">34100609</pub-id></nlm-citation></ref><ref id="ref58"><label>58</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Biesbroek</surname><given-names>S</given-names> </name><name name-style="western"><surname>Kneepkens</surname><given-names>MC</given-names> </name><name name-style="western"><surname>van den Berg</surname><given-names>SW</given-names> 
</name><etal/></person-group><article-title>Dietary patterns within educational groups and their association with CHD and stroke in the European prospective investigation into cancer and nutrition-Netherlands cohort</article-title><source>Br J Nutr</source><year>2018</year><month>04</month><volume>119</volume><issue>8</issue><fpage>949</fpage><lpage>956</lpage><pub-id pub-id-type="doi">10.1017/S0007114518000569</pub-id><pub-id pub-id-type="medline">29644959</pub-id></nlm-citation></ref><ref id="ref59"><label>59</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>da Costa</surname><given-names>GG</given-names> </name><name name-style="western"><surname>da Concei&#x00E7;&#x00E3;o Nepomuceno</surname><given-names>G</given-names> </name><name name-style="western"><surname>da Silva Pereira</surname><given-names>A</given-names> </name><name name-style="western"><surname>Sim&#x00F5;es</surname><given-names>BFT</given-names> </name></person-group><article-title>Worldwide dietary patterns and their association with socioeconomic data: an ecological exploratory study</article-title><source>Global Health</source><year>2022</year><month>03</month><day>12</day><volume>18</volume><issue>1</issue><fpage>31</fpage><pub-id pub-id-type="doi">10.1186/s12992-022-00820-w</pub-id><pub-id pub-id-type="medline">35279165</pub-id></nlm-citation></ref><ref id="ref60"><label>60</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Stultz</surname><given-names>CM</given-names> </name></person-group><article-title>Machine learning for risk prediction: does one size really fit all?</article-title><source>JACC Adv</source><year>2023</year><month>09</month><volume>2</volume><issue>7</issue><fpage>100552</fpage><pub-id pub-id-type="doi">10.1016/j.jacadv.2023.100552</pub-id><pub-id pub-id-type="medline">38939502</pub-id></nlm-citation></ref><ref 
id="ref61"><label>61</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jung</surname><given-names>AW</given-names> </name><name name-style="western"><surname>Holm</surname><given-names>PC</given-names> </name><name name-style="western"><surname>Gaurav</surname><given-names>K</given-names> </name><etal/></person-group><article-title>Multi-cancer risk stratification based on national health data: a retrospective modelling and validation study</article-title><source>Lancet Digit Health</source><year>2024</year><month>06</month><volume>6</volume><issue>6</issue><fpage>e396</fpage><lpage>e406</lpage><pub-id pub-id-type="doi">10.1016/S2589-7500(24)00062-1</pub-id><pub-id pub-id-type="medline">38789140</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Patient-centered undersampling technique (PCUSTe) pseudocode.</p><media xlink:href="medinform_v14i1e78931_app1.pdf" xlink:title="PDF File, 244 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Supplementary tables and figures.</p><media xlink:href="medinform_v14i1e78931_app2.pdf" xlink:title="PDF File, 746 KB"/></supplementary-material></app-group></back></article>