<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id><journal-id journal-id-type="publisher-id">medinform</journal-id><journal-id journal-id-type="index">7</journal-id><journal-title>JMIR Medical Informatics</journal-title><abbrev-journal-title>JMIR Med Inform</abbrev-journal-title><issn pub-type="epub">2291-9694</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v14i1e80156</article-id><article-id pub-id-type="doi">10.2196/80156</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Research on the Prediction of Coal Workers&#x2019; Pneumoconiosis Based on Easily Detectable Clinical Data: Machine Learning Model Development and Validation Study</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Li</surname><given-names>Haiquan</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Jia</surname><given-names>Jiaqi</given-names></name><degrees>ME</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Shi</surname><given-names>Xu</given-names></name><degrees>MM</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Dong</surname><given-names>Yudie</given-names></name><degrees>ME</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Wang</surname><given-names>Songquan</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Cui</surname><given-names>Yuming</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Hang</surname><given-names>Wenlu</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Zhang</surname><given-names>Dekun</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>School of Chemical Engineering &#x0026; Technology, China University of Mining and Technology</institution><addr-line>Xuzhou</addr-line><addr-line>Jiangsu</addr-line><country>China</country></aff><aff id="aff2"><institution>Department of Respiratory and Critical Care Medicine, Second Affiliated Hospital of Xuzhou Medical University</institution><addr-line>32 Meijian Road</addr-line><addr-line>Xuzhou</addr-line><addr-line>Jiangsu</addr-line><country>China</country></aff><aff id="aff3"><institution>School of Mechatronic Engineering, Jiangsu Normal University</institution><addr-line>Xuzhou</addr-line><addr-line>Jiangsu</addr-line><country>China</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Coristine</surname><given-names>Andrew</given-names></name></contrib><contrib contrib-type="editor"><name name-style="western"><surname>Benis</surname><given-names>Arriel</given-names></name></contrib></contrib-group><contrib-group><contrib 
contrib-type="reviewer"><name name-style="western"><surname>Wang</surname><given-names>X C</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Liang</surname><given-names>Xiaolong</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Yang</surname><given-names>Ziyi</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Wenlu Hang, PhD, Department of Respiratory and Critical Care Medicine, Second Affiliated Hospital of Xuzhou Medical University, 32 Meijian Road, Xuzhou, Jiangsu, China, 86 13813477830; <email>wenluhangkz@163.com</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>13</day><month>2</month><year>2026</year></pub-date><volume>14</volume><elocation-id>e80156</elocation-id><history><date date-type="received"><day>05</day><month>07</month><year>2025</year></date><date date-type="rev-recd"><day>04</day><month>01</month><year>2026</year></date><date date-type="accepted"><day>08</day><month>01</month><year>2026</year></date></history><copyright-statement>&#x00A9; Haiquan Li, Jiaqi Jia, Xu Shi, Yudie Dong, Songquan Wang, Yuming Cui, Wenlu Hang, Dekun Zhang. Originally published in JMIR Medical Informatics (<ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org">https://medinform.jmir.org</ext-link>), 13.2.2026. 
</copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org/">https://medinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://medinform.jmir.org/2026/1/e80156"/><abstract><sec><title>Background</title><p>Coal workers&#x2019; pneumoconiosis (CWP) is the most prevalent occupational disease that causes irreversible lung damage. Early prediction of CWP is the key to blocking the irreversible process of pulmonary fibrosis. The prediction of CWP based on imaging data and biomarker detection is constrained due to high cost and poor convenience.</p></sec><sec><title>Objective</title><p>The study aimed to use easily detectable clinical data to construct a prediction model for CWP through machine learning (ML) methods.</p></sec><sec sec-type="methods"><title>Methods</title><p>A prediction framework was established using a moderate-sized dataset and multidimensional clinical features, including occupational information, lung function parameters, and blood indicators. 
Six ML algorithms (light gradient boosting machine, random forest, extreme gradient boosting, categorical boosting, support vector machine, and logistic regression) were trained and evaluated using a stratified 5-fold cross-validation and a held-out test set. Hyperparameter optimization was performed using a unified Optuna-based strategy to ensure fair comparison across models. Model interpretability was assessed using Shapley Additive Explanation on top-performing models. In addition, an ablation analysis was conducted by retraining models after excluding job type to assess the independent predictive value of clinical biomarkers.</p></sec><sec sec-type="results"><title>Results</title><p>All 6 models achieved consistently high predictive performance, and the differences among the top-performing models were small on the test set. After Optuna-based optimization, light gradient boosting machine and categorical boosting achieved high test-set area under curve values (0.974 and 0.975, respectively), while extreme gradient boosting achieved the highest recall (0.926) and <italic>F</italic><sub>1</sub>-score (0.952). Compared with the baseline models, hyperparameter optimization resulted in only minor performance changes, indicating robust prediction under the current feature set and evaluation protocol. Shapley Additive Explanation analysis consistently identified age, forced expiratory volume/forced vital capacity, and platelet count as key contributors to CWP risk prediction. 
The ablation analysis further showed that model performance remained strong after removing job type, supporting the independent predictive value of clinical features beyond occupational history.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>The research results have confirmed the potential of combining simple multidimensional features with ML algorithms for predicting CWP and provided new ideas for early diagnosis and intervention of patients with CWP.</p></sec></abstract><kwd-group><kwd>coal workers&#x2019; pneumoconiosis</kwd><kwd>disease prediction</kwd><kwd>machine learning</kwd><kwd>clinical data</kwd><kwd>job-type</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>As a traditional fossil energy source, coal has long held an important position in the global energy system. A large amount of respirable coal dust can be generated during coal mining, processing, loading, and transportation and enter the human lungs through diffusion and sedimentation, inducing the occurrence of coal workers&#x2019; pneumoconiosis (CWP) [<xref ref-type="bibr" rid="ref1">1</xref>]. The pathogenesis of CWP is complex, characterized by strong concealment in the early stage, high mortality rate in the later stage, and poor quality of life for patients. Once diagnosed, the course of the disease is irreversible and there is currently no effective cure in clinical practice [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref3">3</xref>]. Early identification of CWP can delay the deterioration of the condition and prevent it from developing into progressive massive fibrosis or respiratory failure.</p><p>High kilovoltage X-ray examination is the gold standard for CWP diagnosis. 
To avoid the problem of overlapping and occlusion of tissue and organ images, computed tomography detection technology has also been used for CWP diagnosis [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>]. However, the imaging differences among early patients with CWP are not significant, and there are also issues such as high cost, high radiation risk, and inconvenient equipment use, which collectively constrain the early identification of patients with CWP. At present, the development of biomarker detection technology has significantly improved the clinical feasibility of CWP early screening [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>]. As measurable biological indicators, biomarkers such as proteins, genes, and metabolites can objectively reflect the physiological and pathological status of the body. The research on the expression levels of biomarkers in the serum of patients with CWP is the most extensive, including transforming growth factor-&#x03B1; [<xref ref-type="bibr" rid="ref8">8</xref>], interleukin-8 [<xref ref-type="bibr" rid="ref9">9</xref>], noncoding RNA (such as microRNA) [<xref ref-type="bibr" rid="ref10">10</xref>], and common lipid metabolites such as phosphatidylethanolamines and free fatty acids [<xref ref-type="bibr" rid="ref11">11</xref>], all of which have been proven to have important guiding significance for early identification of CWP. On the other hand, the occurrence and development of lung diseases usually have an impact on changes in lung microbiota and respiratory flora. 
MicroRNA expression profiles [<xref ref-type="bibr" rid="ref12">12</xref>], surfactant-associated protein A and surfactant-associated protein D [<xref ref-type="bibr" rid="ref13">13</xref>] in bronchoalveolar lavage fluid, and transforming growth factor-&#x03B2;, interleukin-1&#x03B2;, and matrix metalloproteinase-9 in sputum [<xref ref-type="bibr" rid="ref14">14</xref>], as well as benzene and aldehydes in volatile organic compounds of exhaled breath [<xref ref-type="bibr" rid="ref15">15</xref>], are also commonly used for early identification of CWP. However, metabolic processes are regulated by multiple factors. The lower specificity and sensitivity reduce the reliability of early screening of CWP through a single biomarker. Meanwhile, the high cost of detecting specific biomarkers also limits the early identification of patients with CWP.</p><p>Previous studies have shown that blood routine examination, as an economical, efficient, and easy-to-operate screening method in clinical practice, has important guiding significance for early identification and risk assessment of diseases, especially in mining areas where medical resources are relatively scarce [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref17">17</xref>]. CWP usually leads to lung infections or the occurrence of inflammatory diseases, which are often reflected in lung function, coagulation function, inflammatory markers, etc. This provides the possibility for early identification of patients with CWP [<xref ref-type="bibr" rid="ref18">18</xref>]. At present, there is a relative lack of research on CWP prediction based on routine clinical blood data. This study aims to develop a low-cost CWP early screening tool based on machine learning (ML) models. 
A 3D feature space of occupational exposure history, lung function parameters, and routine blood indicators was established, and 6 algorithms, including light gradient boosting machine (LightGBM), random forest (RF), extreme gradient boosting (XGBoost), categorical boosting (CatBoost), support vector machine (SVM), and logistic regression (LR), were combined for comparative analysis of predictive performance. In addition, an Optuna-based hyperparameter optimization strategy was applied to tune the models under a unified evaluation protocol. Finally, the Shapley Additive Explanation (SHAP) method was used to interpret model predictions and analyze the contributions of key parameters such as lung function indicators and blood indicators. The high-precision and interpretable prediction model constructed can provide a theoretical basis for early screening of CWP.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Ethical Considerations</title><p>This study was approved by the Second Affiliated Hospital of Xuzhou Medical University ([2024] 082701). Due to the retrospective nature of the study and the use of deidentified data, the requirement for informed consent was waived by the institutional review board. To ensure privacy and confidentiality, all personal identifiers, such as names and national identification numbers, were removed and replaced with unique study IDs before data analysis. No financial compensation was provided to the participants as the data were extracted from routine clinical and physical examination records. Furthermore, we confirm that no identifiable information or images of individual participants are included in this manuscript or its supplementary materials.</p></sec><sec id="s2-2"><title>Data Sources</title><p>Two hundred eighty-seven patients with CWP were admitted to a large tertiary hospital from June 28, 2022, to September 20, 2024. 
Dust-exposed workers undergoing annual occupational health examinations at the same hospital from 2022 to 2024 were considered as controls. Because some workers attended examinations in multiple years, records were deduplicated using a unique personal identifier, and only the most recent examination record per worker was retained, yielding 2446 unique controls. These data were retrospectively extracted from the hospital&#x2019;s electronic medical records system and physical examination database. All participants were male, aged between 22 and 90 years, and were employees of a certain mining group. The testing report included common demographic information, job types, and routine biochemical indicators.</p><p>The clinical test data of dust-exposed workers and patients with CWP were intersected, and 17 indicators were selected as candidate features. These indicators included job type, age, forced expiratory volume/forced vital capacity (FEV1/FVC), white blood cell count (WBC), absolute neutrophil count (ANC), absolute lymphocyte count (ALC), absolute monocyte count (AMC), absolute eosinophil count (AEC), red blood cell count, hemoglobin, platelet count (PLT), alanine aminotransferase (ALT), glucose, triglycerides, cholesterol, high-density lipoprotein, and low-density lipoprotein. The raw dataset initially comprised 36 job types. However, when categorized by disease status, a pronounced class imbalance was observed: the number of healthy individuals exposed to dust was approximately 8 times that of patients with CWP. Such a severe imbalance can cause the model to lean toward the majority class during training, thereby reducing its ability to recognize diseased samples and affecting the model&#x2019;s generalization performance [<xref ref-type="bibr" rid="ref19">19</xref>]. Therefore, the original 36 job titles were first merged into 11 broader job categories based on similarity in work environment and job tasks. 
We then examined the distribution of CWP cases across these categories and found substantial imbalance (eg, only 1 CWP case among vehicle drivers vs 218 cases among mixed excavation and coal mining workers). To avoid unstable estimates driven by rare categories and to ensure adequate case representation for modeling, we restricted the analytic cohort to 5 job categories with sufficient CWP case counts, including mixed excavation and coal mining workers, excavation workers, coal miners, winch operators, and conveyor operators. After this restriction, the final dataset included 1085 dust-exposed healthy individuals and 271 participants with CWP.</p></sec><sec id="s2-3"><title>Data Preprocessing</title><p>After verification, it was found that FEV1/FVC and low-density lipoprotein had missing values, accounting for 3.68% (50/1356) and 1.48% (20/1356), respectively. In order to avoid the impact of missing values on subsequent analysis and model training, the <italic>k</italic>-nearest neighbor (KNN) algorithm was used to impute the missing values. The KNN imputation was performed within the training data for each fold in cross-validation, ensuring that the test data remained unseen during preprocessing. 
The distance used by the KNN algorithm is calculated as shown in equation (1).</p><disp-formula id="E1"><label>(1)</label><mml:math id="eqn1"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mi>d</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:msqrt><mml:munderover><mml:mo movablelimits="false">&#x2211;</mml:mo><mml:mrow><mml:mi>k</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>p</mml:mi></mml:mrow></mml:munderover><mml:msup><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>k</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>j</mml:mi><mml:mi>k</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:msqrt></mml:mstyle></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>Here, <italic>x<sub>i</sub></italic> and <italic>x<sub>j</sub></italic> represent the feature vectors of 2 samples, <italic>x<sub>ik</sub></italic> and <italic>x<sub>jk</sub></italic> represent the <italic>k</italic>th feature of these 2 samples, and <italic>p</italic> is the number of features.</p><p>The categorical variable job type was processed using one-hot encoding, which converts each category into a binary feature column. Categories include mixed excavation and coal mining, excavation workers, winch operators, conveyor operators, and coal miners. The remaining 15 continuous feature variables were standardized, and each feature was transformed into a distribution with a mean of 0 (SD 1) for the model to analyze. 
The standardization parameters were computed from the training data only, and the same scaling was then applied to the validation and test sets. The calculation method for the standard score of each feature is shown in equation (2).</p><disp-formula id="E2"><label>(2)</label><mml:math id="eqn2"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mi>Z</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>x</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mi>&#x03BC;</mml:mi></mml:mrow><mml:mi>&#x03C3;</mml:mi></mml:mfrac></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>Here, <italic>x</italic> is the original value of the feature, <italic>&#x03BC;</italic> is the mean of the feature, and <italic>&#x03C3;</italic> is the SD of the feature.</p></sec><sec id="s2-4"><title>Statistical Analysis and Feature Selection</title><p>Statistical analysis was conducted using SPSS Statistics (version 26.0; IBM Corp). The normality of quantitative data was tested using the Kolmogorov-Smirnov test, and the homogeneity of variance was tested using the Levene test. Data with normal distribution were represented by mean (SD), and intergroup comparison was performed using the independent-samples <italic>t</italic> test. Data with nonnormal distribution were represented by median (IQR), and the Mann-Whitney <italic>U</italic> test was used for intergroup comparison. Categorical variables were represented by the number of cases (%), and comparison between groups was conducted using the chi-square test. Differences were considered statistically significant at <italic>P</italic>&#x003C;.05. The Least Absolute Shrinkage and Selection Operator (LASSO) algorithm, an embedded feature selection method, was used for regression analysis to identify key feature variables associated with CWP, thereby reducing model complexity and enhancing generalization capability. 
LASSO regression achieved feature selection by applying L1 regularization to the coefficients, shrinking the coefficients of less important features to 0 [<xref ref-type="bibr" rid="ref20">20</xref>]. To ensure no information leakage, LASSO feature selection was performed within each fold&#x2019;s training data, and the same selected features were applied to the validation data within each fold. This embedded method was combined with correlation analysis, a filter method, to select features comprehensively.</p></sec><sec id="s2-5"><title>Construction and Evaluation of ML Models</title><p>Six representative ML models including tree-based ensemble learning models (LightGBM, XGBoost, RF, and CatBoost) and traditional classification algorithms (LR and SVM) were used for constructing CWP prediction models. A brief overview of each model&#x2019;s key characteristics and its relevance to this study is provided below.</p><p>For tree-based ensemble learning models, XGBoost uses second-order Taylor expansion for high accuracy and speed, incorporates regularization to prevent overfitting, and supports parallel computing for efficient training [<xref ref-type="bibr" rid="ref21">21</xref>]. LightGBM uses a leaf-wise growth strategy and histogram-based feature discretization for efficiency, with built-in class weight adjustments beneficial for imbalanced datasets [<xref ref-type="bibr" rid="ref22">22</xref>]. CatBoost uses an ordered boosting strategy for better generalization, directly handles categorical features, and uses a symmetric tree structure to reduce overfitting [<xref ref-type="bibr" rid="ref23">23</xref>]. RF builds multiple decision trees from bootstrapped samples, randomly selects features at each split, and aggregates predictions through voting for robust classification [<xref ref-type="bibr" rid="ref24">24</xref>].</p><p>To compare model performance on imbalanced datasets, 2 traditional models were also selected for comparison with the ensemble models. 
LR, a generalized linear model, predicts probabilities using a sigmoid function [<xref ref-type="bibr" rid="ref25">25</xref>]. SVM finds an optimal hyperplane to separate classes, using slack variables and a radial basis function kernel for inseparable data. Class weights were also incorporated into its objective function for imbalance handling [<xref ref-type="bibr" rid="ref26">26</xref>]. In this study, class weights were applied in the loss functions of both traditional models to effectively handle the class imbalance issue.</p><p>Python (version 3.8.0; Python Software Foundation) was used for model training and evaluation. The dataset was randomly divided into training and test sets in an 8:2 ratio. To further assess model stability, 5-fold stratified cross-validation within the training data was used. In this procedure, the training data were randomly divided into 5 nonoverlapping subsets. In each fold, 4 subsets were used as the training set, and the remaining 1 subset was used for validation. This process was repeated 5 times, and the average performance across all folds was taken as the evaluation metric. The test set was only used for the final model evaluation, ensuring it remained unseen during model training and hyperparameter tuning. In the process of model training and evaluation, based on the confusion matrix, the performance of the model was comprehensively judged through accuracy, precision, recall, <italic>F</italic><sub>1</sub>-score, and the area under the receiver operating characteristic curve (AUC). 
The corresponding calculation formulas are as follows:</p><disp-formula id="E3"><label>(3)</label><mml:math id="eqn3"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mi>A</mml:mi><mml:mi>c</mml:mi><mml:mi>c</mml:mi><mml:mi>u</mml:mi><mml:mi>r</mml:mi><mml:mi>a</mml:mi><mml:mi>c</mml:mi><mml:mi>y</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi><mml:mo>+</mml:mo><mml:mi>T</mml:mi><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi><mml:mo>+</mml:mo><mml:mi>T</mml:mi><mml:mi>N</mml:mi><mml:mo>+</mml:mo><mml:mi>F</mml:mi><mml:mi>P</mml:mi><mml:mo>+</mml:mo><mml:mi>F</mml:mi><mml:mi>N</mml:mi></mml:mrow></mml:mfrac></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><disp-formula id="E4"><label>(4)</label><mml:math id="eqn4"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mi>P</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi><mml:mi>i</mml:mi><mml:mi>s</mml:mi><mml:mi>i</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi><mml:mo>+</mml:mo><mml:mi>F</mml:mi><mml:mi>P</mml:mi></mml:mrow></mml:mfrac></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><disp-formula id="E5"><label>(5)</label><mml:math id="eqn5"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mi>R</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi><mml:mi>l</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi><mml:mo>+</mml:mo><mml:mi>F</mml:mi><mml:mi>N</mml:mi></mml:mrow></mml:mfrac></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><disp-formula id="E6"><label>(6)</label><mml:math id="eqn6"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" 
scriptlevel="0"><mml:msub><mml:mi>F</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>2</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mi>p</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi><mml:mi>i</mml:mi><mml:mi>s</mml:mi><mml:mi>i</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mi>p</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi><mml:mi>i</mml:mi><mml:mi>s</mml:mi><mml:mi>i</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mo>+</mml:mo><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:mfrac></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>Here, <italic>TP</italic> represents true positive, <italic>TN</italic> represents true negative, <italic>FP</italic> represents false positive, and <italic>FN</italic> represents false negative. <xref ref-type="fig" rid="figure1">Figure 1</xref> shows the technical roadmap of this study.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>The technical roadmap of this study. 
AUC: area under the curve; CatBoost: categorical boosting; LightGBM: light gradient boosting machine; LR: logistic regression; RF: random forest; SHAP: Shapley Additive Explanation; SVM: support vector machine; XGBoost: extreme gradient boosting.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e80156_fig01.png"/></fig></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Results of Statistical Analysis and Feature Selection</title><p><xref ref-type="table" rid="table1">Table 1</xref> shows the comparison of basic characteristics between the dust-exposed workers and the patients with CWP. It is found that 13 indicators, including job type, age, FEV1/FVC, WBC, ANC, ALC, AMC, AEC, hemoglobin, PLT, ALT, cholesterol, and glucose, have statistical significance (<italic>P</italic>&#x003C;.05) compared between the 2 groups.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Comparison of basic characteristics between the dust-exposed workers and the patients with coal workers&#x2019; pneumoconiosis.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Healthy</td><td align="left" valign="bottom">Disease</td><td align="left" valign="bottom">Test</td><td align="left" valign="bottom">Statistic</td><td align="left" valign="bottom"><italic>P</italic> value</td></tr></thead><tbody><tr><td align="left" valign="top">Job type/Case</td><td align="left" valign="top">1085</td><td align="left" valign="top">271</td><td align="left" valign="top">Pearson &#x03C7;<sup>2</sup></td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">Mixed workers for excavation and coal mining, n (%)</td><td align="left" valign="top">160 (14.7)</td><td 
align="left" valign="top">219 (80.8)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2003;&#x2014;</td></tr><tr><td align="left" valign="top">Excavator workers, n (%)</td><td align="left" valign="top">253 (23.3)</td><td align="left" valign="top">35 (12.9)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2003;&#x2014;</td></tr><tr><td align="left" valign="top">Winch operator, n (%)</td><td align="left" valign="top">260 (24)</td><td align="left" valign="top">3 (1.1)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2003;&#x2014;</td></tr><tr><td align="left" valign="top">Conveyor operator, n (%)</td><td align="left" valign="top">234 (21.6)</td><td align="left" valign="top">6 (2.2)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2003;&#x2014;</td></tr><tr><td align="left" valign="top">Coal miners, n (%)</td><td align="left" valign="top">178 (16.4)</td><td align="left" valign="top">8 (3)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2003;&#x2014;</td></tr><tr><td align="left" valign="top">Age, median (IQR)</td><td align="left" valign="top">48 (39-52)</td><td align="left" valign="top">69 (62-77)</td><td align="left" valign="top">Mann-Whitney <italic>U</italic></td><td align="left" valign="top"><italic>U</italic>=11,900, <italic>Z</italic>=&#x2212;23.43</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">FEV1/FVC<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup> (%), median (IQR)</td><td align="left" valign="top">90 (86-97)</td><td align="left" valign="top">75.86 (68-89)</td><td align="left" valign="top">Mann-Whitney <italic>U</italic></td><td align="left" 
valign="top"><italic>U</italic>=177,046, <italic>Z</italic>=12.02</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">WBC<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> (&#x00D7;10<sup>9</sup> /L), median (IQR)</td><td align="left" valign="top">6.36 (5.42-7.60)</td><td align="left" valign="top">5.84 (4.92-6.96)</td><td align="left" valign="top">Mann-Whitney <italic>U</italic></td><td align="left" valign="top"><italic>U</italic>=177,670, <italic>Z</italic>=5.32</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">ANC<sup><xref ref-type="table-fn" rid="table1fn4">d</xref></sup> (&#x00D7;10<sup>9</sup> /L), median (IQR)</td><td align="left" valign="top">3.53 (2.86-4.39)</td><td align="left" valign="top">2.97 (2.21-3.98)</td><td align="left" valign="top">Mann-Whitney <italic>U</italic></td><td align="left" valign="top"><italic>U</italic>=185,504, <italic>Z</italic>=6.67</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">ALC<sup><xref ref-type="table-fn" rid="table1fn5">e</xref></sup> (&#x00D7;10<sup>9</sup> /L), median (IQR)</td><td align="left" valign="top">2.21 (1.82-2.69)</td><td align="left" valign="top">2.01 (1.58-2.72)</td><td align="left" valign="top">Mann-Whitney <italic>U</italic></td><td align="left" valign="top"><italic>U</italic>=163,138, <italic>Z</italic>=2.80</td><td align="left" valign="top">.005</td></tr><tr><td align="left" valign="top">AMC<sup><xref ref-type="table-fn" rid="table1fn6">f</xref></sup> (&#x00D7;10<sup>9</sup> /L), median (IQR)</td><td align="left" valign="top">0.39 (0.33-0.48)</td><td align="left" valign="top">0.47 (0.38-0.68)</td><td align="left" valign="top">Mann-Whitney <italic>U</italic></td><td align="left" valign="top"><italic>U</italic>=94,124, <italic>Z</italic>=&#x2212;9.17</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">AEC<sup><xref ref-type="table-fn" 
rid="table1fn7">g</xref></sup> (&#x00D7;10<sup>9</sup> /L), median (IQR)</td><td align="left" valign="top">0.16 (0.10-0.25)</td><td align="left" valign="top">0.13 (0.07-0.19)</td><td align="left" valign="top">Mann-Whitney <italic>U</italic></td><td align="left" valign="top"><italic>U</italic>=174,854, <italic>Z</italic>=4.83</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">RBC<sup><xref ref-type="table-fn" rid="table1fn8">h</xref></sup> (&#x00D7;10<sup>12</sup> /L), median (IQR)</td><td align="left" valign="top">4.89 (4.64-5.12)</td><td align="left" valign="top">4.85 (4.46-5.33)</td><td align="left" valign="top">Mann-Whitney <italic>U</italic></td><td align="left" valign="top"><italic>U</italic>=150,982, <italic>Z</italic>=0.69</td><td align="left" valign="top">.49</td></tr><tr><td align="left" valign="top">HB<sup><xref ref-type="table-fn" rid="table1fn9">i</xref></sup> (g/L), median (IQR)</td><td align="left" valign="top">151 (144-158)</td><td align="left" valign="top">148 (136-160)</td><td align="left" valign="top">Mann-Whitney <italic>U</italic></td><td align="left" valign="top"><italic>U</italic>=161,103, <italic>Z</italic>=2.44</td><td align="left" valign="top">.01</td></tr><tr><td align="left" valign="top">PLT<sup><xref ref-type="table-fn" rid="table1fn10">j</xref></sup> (&#x00D7;10<sup>9</sup> /L), median (IQR)</td><td align="left" valign="top">244 (211-276)</td><td align="left" valign="top">200 (158.50-240)</td><td align="left" valign="top">Mann-Whitney <italic>U</italic></td><td align="left" valign="top"><italic>U</italic>=208,796, <italic>Z</italic>=10.71</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">ALT<sup><xref ref-type="table-fn" rid="table1fn11">k</xref></sup> (U/L), median (IQR)</td><td align="left" valign="top">20 (15-27)</td><td align="left" valign="top">17 (12-24)</td><td align="left" valign="top">Mann-Whitney <italic>U</italic></td><td align="left" 
valign="top"><italic>U</italic>=180,185, <italic>Z</italic>=5.75</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">GLU<sup><xref ref-type="table-fn" rid="table1fn12">l</xref></sup> (mmol/L), median (IQR)</td><td align="left" valign="top">5.25 (4.85-5.75)</td><td align="left" valign="top">4.91 (4.41-5.62)</td><td align="left" valign="top">Mann-Whitney <italic>U</italic></td><td align="left" valign="top"><italic>U</italic>=180,125, <italic>Z</italic>=6.14</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">TG<sup><xref ref-type="table-fn" rid="table1fn13">m</xref></sup> (mmol/L), median (IQR)</td><td align="left" valign="top">1.42 (0.99-2.33)</td><td align="left" valign="top">1.28 (0.96-2.01)</td><td align="left" valign="top">Mann-Whitney <italic>U</italic></td><td align="left" valign="top"><italic>U</italic>=157,897, <italic>Z</italic>=1.89</td><td align="left" valign="top">.06</td></tr><tr><td align="left" valign="top">CHOL<sup><xref ref-type="table-fn" rid="table1fn14">n</xref></sup> (mmol/L), median (IQR)</td><td align="left" valign="top">4.86 (4.29-5.52)</td><td align="left" valign="top">4.48 (3.67-5.22)</td><td align="left" valign="top">Mann-Whitney <italic>U</italic></td><td align="left" valign="top"><italic>U</italic>=181,528, <italic>Z</italic>=5.98</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">HDL<sup><xref ref-type="table-fn" rid="table1fn15">o</xref></sup> (mmol/L), median (IQR)</td><td align="left" valign="top">1.27 (1.12-1.47)</td><td align="left" valign="top">1.23 (1.05-1.54)</td><td align="left" valign="top">Mann-Whitney <italic>U</italic></td><td align="left" valign="top"><italic>U</italic>=150,479, <italic>Z</italic>=0.60</td><td align="left" valign="top">.55</td></tr><tr><td align="left" valign="top">LDL<sup><xref ref-type="table-fn" rid="table1fn16">p</xref></sup> (mmol/L), median (IQR)</td><td align="left" valign="top">2.67 
(2.29-3.11)</td><td align="left" valign="top">2.69 (2.13-3.21)</td><td align="left" valign="top">Mann-Whitney <italic>U</italic></td><td align="left" valign="top"><italic>U</italic>=129,398, <italic>Z</italic>=0.67</td><td align="left" valign="top">.50</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>Not available.</p></fn><fn id="table1fn2"><p><sup>b</sup>FEV1/FVC: forced expiratory volume/forced vital capacity.</p></fn><fn id="table1fn3"><p><sup>c</sup>WBC: white blood cell count.</p></fn><fn id="table1fn4"><p><sup>d</sup>ANC: absolute neutrophil count.</p></fn><fn id="table1fn5"><p><sup>e</sup>ALC: absolute lymphocyte count.</p></fn><fn id="table1fn6"><p><sup>f</sup>AMC: absolute monocyte count.</p></fn><fn id="table1fn7"><p><sup>g</sup>AEC: absolute eosinophil count.</p></fn><fn id="table1fn8"><p><sup>h</sup>RBC: red blood cell count.</p></fn><fn id="table1fn9"><p><sup>i</sup>HB: hemoglobin.</p></fn><fn id="table1fn10"><p><sup>j</sup>PLT: platelet count.</p></fn><fn id="table1fn11"><p><sup>k</sup>ALT: alanine aminotransferase.</p></fn><fn id="table1fn12"><p><sup>l</sup>GLU: glucose.</p></fn><fn id="table1fn13"><p><sup>m</sup>TG: triglycerides.</p></fn><fn id="table1fn14"><p><sup>n</sup>CHOL: cholesterol.</p></fn><fn id="table1fn15"><p><sup>o</sup>HDL: high-density lipoprotein.</p></fn><fn id="table1fn16"><p><sup>p</sup>LDL: low-density lipoprotein.</p></fn></table-wrap-foot></table-wrap><p>Figure S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> showed the cross-validation curve of LASSO regression. When the &#x03B1; value is low, the model may contain too many irrelevant features, resulting in significant errors (overfitting). When the &#x03B1; value is large, the model may remove too many important features, which also leads to an increase in error (underfitting). At the optimal &#x03B1; value, the cross-validation error is minimized. 
The LASSO coefficient plot was shown in Figure S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, which showed that the coefficients of 9 features, including job-type mixed excavation coal, job-type excavation worker, job-type conveyor operator, job-type winch operator, age, FEV1/FVC, AMC, PLT, and ANC, were not 0 at the optimal &#x03B1; value. This indicated the criticality of these features and their significant explanatory power for the target variable; therefore, they should be retained in the final model for prediction.</p><p>In order to avoid the problem of multicollinearity caused by strong correlation between features, this study used correlation analysis in a filtering method to comprehensively select features based on the 17 features selected through statistical analysis and LASSO regression screening in the early stage. By calculating the Spearman correlation coefficient matrix (<xref ref-type="fig" rid="figure2">Figure 2</xref>), the threshold was set to an absolute value of <italic>r</italic> greater than 0.8, and highly correlated terms in the feature pairs were selected. Based on the absolute value of LASSO regression weights, the features that contribute more to the target were retained, thereby eliminating redundant variables. Specifically, a Spearman correlation matrix was computed on the training data, and pairs with an absolute value of <italic>r</italic> greater than 0.8 were considered highly correlated. For each highly correlated pair, the feature with the larger absolute LASSO coefficient was retained and the other feature was removed. In the current dataset, WBC and ANC showed high correlation (<italic>r</italic>=0.833); thus, WBC was removed and ANC was retained. 
After redundancy filtering, 16 nonredundant features were used as inputs for subsequent model development, including job-type mixed excavation coal, job-type excavation worker, job-type conveyor operator, job-type winch operator, job-type coal miner, age, FEV1/FVC, ANC, ALC, AMC, AEC, hemoglobin, PLT, ALT, cholesterol, and glucose.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Spearman correlation coefficient matrix. AEC: absolute eosinophil count; ALC: absolute lymphocyte count; ALT: alanine aminotransferase; AMC: absolute monocyte count; ANC: absolute neutrophil count; CHOL: cholesterol; FEV1/FVC: forced expiratory volume/forced vital capacity; GLU: glucose; HB: hemoglobin; PLT: platelet count; WBC: white blood cell count.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e80156_fig02.png"/></fig></sec><sec id="s3-2"><title>Evaluation of the CWP Prediction Model</title><p>The selected 16 clinical features were input as feature variables into 6 ML models, and 5-fold cross-validation was performed for each model during training. The performance of each model was comprehensively evaluated based on the test-set data, and the output results of each model were organized and are summarized in <xref ref-type="table" rid="table2">Table 2</xref>. Meanwhile, the visualized results of the data in <xref ref-type="table" rid="table2">Table 2</xref> are shown in Figure S3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. From the figure, it can be seen that the AUC of each fold of the 6 models in cross-validation was consistently high (all folds &#x003E;0.90), indicating that the generalization ability of the 6 models is strong and stable. The AUC values of the 6 models were ranked in descending order as CatBoost (0.979), LightGBM (0.978), XGBoost (0.976), RF (0.972), SVM (0.968), and LR (0.967). 
This result indicated that the performance of ensemble learning models on imbalanced datasets was superior to traditional models, verifying the advantages of ensemble learning models in dealing with such problems [<xref ref-type="bibr" rid="ref27">27</xref>]. Taking into account accuracy, precision, recall, and <italic>F</italic><sub>1</sub>-score, both the LightGBM and RF models achieved the highest accuracy (0.982), precision (1), and <italic>F</italic><sub>1</sub>-score (0.951), while the LR model had the highest recall (0.926). Therefore, based on the overall performance across multiple evaluation indicators, LightGBM and RF were preliminarily considered as top-performing predictive models.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Evaluation of predictive performance of different models<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup>.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Accuracy</td><td align="left" valign="bottom">Precision</td><td align="left" valign="bottom">Recall</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score</td><td align="left" valign="bottom">AUC<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top">LightGBM<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="char" char="." valign="top">0.982<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="char" char="." valign="top">1.000<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="char" char="." valign="top">0.907</td><td align="char" char="." valign="top">0.951<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="char" char="." 
valign="top">0.978</td></tr><tr><td align="left" valign="top">CatBoost<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup></td><td align="char" char="." valign="top">0.978</td><td align="char" char="." valign="top">0.980</td><td align="char" char="." valign="top">0.907</td><td align="char" char="." valign="top">0.942</td><td align="char" char="." valign="top">0.979<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td></tr><tr><td align="left" valign="top">XGBoost<sup><xref ref-type="table-fn" rid="table2fn5">e</xref></sup></td><td align="char" char="." valign="top">0.978</td><td align="char" char="." valign="top">0.980</td><td align="char" char="." valign="top">0.907</td><td align="char" char="." valign="top">0.942</td><td align="char" char="." valign="top">0.976</td></tr><tr><td align="left" valign="top">RF<sup><xref ref-type="table-fn" rid="table2fn6">f</xref></sup></td><td align="char" char="." valign="top">0.982<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="char" char="." valign="top">1.000<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="char" char="." valign="top">0.907</td><td align="char" char="." valign="top">0.951<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="char" char="." valign="top">0.972</td></tr><tr><td align="left" valign="top">LR<sup><xref ref-type="table-fn" rid="table2fn7">g</xref></sup></td><td align="char" char="." valign="top">0.956</td><td align="char" char="." valign="top">0.862</td><td align="char" char="." valign="top">0.926<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="char" char="." valign="top">0.893</td><td align="char" char="." valign="top">0.967</td></tr><tr><td align="left" valign="top">SVM<sup><xref ref-type="table-fn" rid="table2fn8">h</xref></sup></td><td align="char" char="." valign="top">0.963</td><td align="char" char="." valign="top">0.907</td><td align="char" char="." 
valign="top">0.907</td><td align="char" char="." valign="top">0.907</td><td align="char" char="." valign="top">0.968</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>AUC: area under the curve.</p></fn><fn id="table2fn2"><p><sup>b</sup> Significant values.</p></fn><fn id="table2fn3"><p><sup>c</sup>LightGBM: light gradient boosting machine.</p></fn><fn id="table2fn4"><p><sup>d</sup>CatBoost: categorical boosting.</p></fn><fn id="table2fn5"><p><sup>e</sup>XGBoost: extreme gradient boosting.</p></fn><fn id="table2fn6"><p><sup>f</sup>RF: random forest.</p></fn><fn id="table2fn7"><p><sup>g</sup>LR: logistic regression.</p></fn><fn id="table2fn8"><p><sup>h</sup>SVM: support vector machine.</p></fn></table-wrap-foot></table-wrap><p>In order to further improve the predictive performance and to ensure a fair comparison among candidate models, this study used the Optuna algorithm to optimize the hyperparameters of all 6 ML models (LightGBM, CatBoost, XGBoost, RF, LR, and SVM) under the same optimization budget, and also conducted 5-fold cross-validation during training. The output results of each optimized model were summarized in <xref ref-type="table" rid="table3">Table 3</xref>, and the visualized results were shown in Figure S4 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. The results showed that the AUC of each fold in cross-validation was consistently high (all folds &#x003E;0.90), indicating that the generalization ability of the 6 models was strong and stable. After applying an equivalent hyperparameter optimization strategy, the overall performance of the 6 models remained high and the differences among the top-performing models were small. Specifically, CatBoost and LightGBM achieved high test-set AUC values (0.975 and 0.974, respectively). In addition, XGBoost achieved the highest recall (0.926) and <italic>F</italic><sub>1</sub>-score (0.952) on the test set. 
Compared with the baseline results, hyperparameter optimization led to only small changes in performance. Overall, the 6 models maintained consistently high performance under the current evaluation protocol.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Performance evaluation of optimized models.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Accuracy</td><td align="left" valign="bottom">Precision</td><td align="left" valign="bottom">Recall</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score</td><td align="left" valign="bottom">AUC<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top">LightGBM<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup>-Optuna</td><td align="char" char="." valign="top">0.982</td><td align="char" char="." valign="top">1</td><td align="char" char="." valign="top">0.907</td><td align="char" char="." valign="top">0.951</td><td align="char" char="." valign="top">0.974</td></tr><tr><td align="left" valign="top">CatBoost<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup>-Optuna</td><td align="char" char="." valign="top">0.982</td><td align="char" char="." valign="top">1</td><td align="char" char="." valign="top">0.907</td><td align="char" char="." valign="top">0.951</td><td align="char" char="." valign="top">0.975</td></tr><tr><td align="left" valign="top">XGBoost<sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup>-Optuna</td><td align="char" char="." valign="top">0.982</td><td align="char" char="." valign="top">0.98</td><td align="char" char="." valign="top">0.926</td><td align="char" char="." valign="top">0.952</td><td align="char" char="." valign="top">0.969</td></tr><tr><td align="left" valign="top">RF<sup><xref ref-type="table-fn" rid="table3fn5">e</xref></sup>-Optuna</td><td align="char" char="." 
valign="top">0.974</td><td align="char" char="." valign="top">0.961</td><td align="char" char="." valign="top">0.907</td><td align="char" char="." valign="top">0.933</td><td align="char" char="." valign="top">0.968</td></tr><tr><td align="left" valign="top">LR<sup><xref ref-type="table-fn" rid="table3fn6">f</xref></sup>-Optuna</td><td align="char" char="." valign="top">0.952</td><td align="char" char="." valign="top">0.847</td><td align="char" char="." valign="top">0.926</td><td align="char" char="." valign="top">0.885</td><td align="char" char="." valign="top">0.968</td></tr><tr><td align="left" valign="top">SVM<sup><xref ref-type="table-fn" rid="table3fn7">g</xref></sup>-Optuna</td><td align="char" char="." valign="top">0.967</td><td align="char" char="." valign="top">0.925</td><td align="char" char="." valign="top">0.907</td><td align="char" char="." valign="top">0.916</td><td align="char" char="." valign="top">0.962</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>AUC: area under the curve.</p></fn><fn id="table3fn2"><p><sup>b</sup>LightGBM: light gradient boosting machine.</p></fn><fn id="table3fn3"><p><sup>c</sup>CatBoost: categorical boosting.</p></fn><fn id="table3fn4"><p><sup>d</sup>XGBoost: extreme gradient boosting.</p></fn><fn id="table3fn5"><p><sup>e</sup>RF: random forest.</p></fn><fn id="table3fn6"><p><sup>f</sup>LR: logistic regression.</p></fn><fn id="table3fn7"><p><sup>g</sup>SVM: support vector machine.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-3"><title>Model Interpretability</title><p>In order to gain a deeper understanding of the impact of various clinical features on the model&#x2019;s prediction results, this study used the SHAP method to conduct interpretability analysis on the 2 representative top-performing models LightGBM-Optuna and CatBoost-Optuna. 
The calculation method was shown in Equation 7.</p><disp-formula id="E7"><label>(7)</label><mml:math id="eqn7"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mi>S</mml:mi><mml:mi>H</mml:mi><mml:mi>A</mml:mi><mml:mi>P</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>y</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mtext>&#x00A0;</mml:mtext><mml:mi>S</mml:mi><mml:mi>H</mml:mi><mml:mi>A</mml:mi><mml:mi>P</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>b</mml:mi><mml:mi>a</mml:mi><mml:mi>s</mml:mi><mml:mi>e</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>+</mml:mo><mml:munderover><mml:mo movablelimits="false">&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:munderover><mml:mrow><mml:mi>S</mml:mi><mml:mi>H</mml:mi><mml:mi>A</mml:mi><mml:mi>P</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mstyle></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>Among them, <italic>SHAP(base)</italic> is the baseline value of the entire model, and <italic>SHAP(x<sub>i</sub>)</italic> is the contribution of each feature to the final prediction result.</p><p>The summary results of SHAP values are shown in <xref ref-type="fig" rid="figure3">Figure 3</xref>, which displayed the distribution of SHAP values for 16 input feature variables. Each point in the figure represented one sample for a given feature, and the position of the point represented the SHAP value of that feature for that sample, which was the contribution of the feature to the model output. If the SHAP value is positive, it indicates that the feature increases the risk of disease and has a positive impact on the output results. 
Conversely, if it is negative, it indicates that the feature reduces the risk of disease and has a negative impact on the output results. In addition, the color range from blue to red reflects the actual value of the feature, with red indicating high values and blue indicating low values. The darker the color, the stronger the impact of the feature on the target variable. Overall, both models showed consistent patterns in feature effects. Age is the most influential variable, and higher age values were mainly associated with positive SHAP values, suggesting that older individuals tended to have a higher predicted disease risk. In contrast, higher values of FEV1/FVC were mostly distributed on the negative side, indicating that better lung function (higher FEV1/FVC) was related to a lower predicted risk. PLT showed a similar tendency, with higher values generally corresponding to negative SHAP values. On the other hand, higher AMC values tended to correspond to positive SHAP values, indicating a positive association with increased predicted risk. These results suggested that the model predictions were largely driven by age-related factors and lung function indicators, together with selected hematological and biochemical variables.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Summary chart of SHAP values. (a) LightGBM-Optuna; (b) CatBoost-Optuna. 
AEC: absolute eosinophil count; ALC: absolute lymphocyte count; ALT: alanine aminotransferase; AMC: absolute monocyte count; ANC: absolute neutrophil count; CHOL: cholesterol; FEV1/FVC: forced expiratory volume/forced vital capacity; GLU: glucose; HB: hemoglobin; PLT: platelet count; SHAP: Shapley Additive Explanations.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e80156_fig03.png"/></fig><p><xref ref-type="fig" rid="figure4">Figure 4</xref> showed the SHAP feature importance matrix, which arranged the average SHAP absolute value of each feature from high to low. The horizontal axis represented the contribution value, and the larger the value, the greater the contribution to the model results. In both the Optuna-tuned LightGBM and CatBoost models, age showed the highest contribution, followed by FEV1/FVC and PLT, which indicates that these variables play the most important roles in the prediction of CWP risk.</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Matrix diagram of SHAP feature importance. (a) LightGBM-Optuna; (b) CatBoost-Optuna. AEC: absolute eosinophil count; ALC: absolute lymphocyte count; ALT: alanine aminotransferase; AMC: absolute monocyte count; ANC: absolute neutrophil count; CHOL: cholesterol; FEV1/FVC: forced expiratory volume/forced vital capacity; GLU: glucose; HB: hemoglobin; PLT: platelet count; SHAP: Shapley Additive Explanations.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e80156_fig04.png"/></fig><p>To further examine how the top 3 influential features affect the model output, SHAP dependence plots were generated (Figure S5 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). 
As shown in the dependence plot for age, the SHAP values generally increased with age, and the risk contribution became more pronounced after approximately 55&#x2010;60 years, suggesting that older age was associated with a higher predicted disease risk. For FEV1/FVC, lower values corresponded to higher SHAP values, indicating an increased risk, whereas higher values (approximately 85%&#x2010;100%) were associated with SHAP values close to zero or negative, suggesting a lower predicted risk. Similarly, the PLT dependence plot showed that lower PLT levels tended to contribute positively to disease risk, while higher PLT values (approximately 250&#x2010;350&#x00D7;10<sup>9</sup>/L) were more often associated with negative SHAP values, indicating a reduced predicted risk.</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>In clinical research, it is very common for medical records to have missing values in a certain examination due to the complexity of data collection and individual differences among different patients. By calculating the Euclidean distance between samples, KNN imputation can identify the <italic>K</italic> nearest-neighbor samples that are most similar to the sample with missing values and then use the average feature information of neighboring samples to fill in the missing values, effectively restoring the integrity of the data [<xref ref-type="bibr" rid="ref28">28</xref>]. As a categorical variable, the values of job type do not have a sequential relationship, and categories with larger values do not necessarily have greater weights than categories with smaller values. In order to enable ML models to better capture the relationship between feature variables and target variables, one-hot encoding was applied to categorical features in this study [<xref ref-type="bibr" rid="ref29">29</xref>]. 
The remaining 15 characteristic variables are all continuous features, but their units and ranges of values vary greatly. For example, PLT is measured in unit &#x00D7;10<sup>9</sup>/L and has a wide range of variable values, while glucose is measured in unit mmol/L and has a smaller range of values. This inconsistent scale may lead to the model being more sensitive to certain features with larger numerical ranges during training and ignoring other features with smaller scales, thereby affecting the training effectiveness of the model. Standardizing irregular continuous variables is often the key to solving such problems [<xref ref-type="bibr" rid="ref30">30</xref>].</p><p>Feature selection is a crucial step in ML applications, aimed at selecting the most relevant features to the target variable in order to improve model performance and interpretability [<xref ref-type="bibr" rid="ref31">31</xref>]. The common feature selection methods mainly include embedded, wrapper, and filter methods [<xref ref-type="bibr" rid="ref32">32</xref>]. As an embedded feature selection method, LASSO uses regression analysis to screen out key feature variables related to CWP, reducing model complexity and improving model generalization ability. In this study, LASSO regression analysis is applied to compress the coefficients of some unimportant features to 0 by performing L1 regularization (with penalty parameter &#x03B1;) on the coefficients, thereby achieving feature selection [<xref ref-type="bibr" rid="ref20">20</xref>]. The preprocessed dataset still suffers from class imbalance, with the number of dust-exposed workers without CWP being about 4 times that of patients with CWP. Regarding the issue of class imbalance, commonly used methods in model construction include data sampling and ensemble learning [<xref ref-type="bibr" rid="ref33">33</xref>]. 
In order to preserve the distribution characteristics of the original data as much as possible and avoid the bias and noise that may be introduced by data oversampling methods [<xref ref-type="bibr" rid="ref34">34</xref>], ensemble learning models were used to handle imbalanced data in this study. Specifically, we selected decision tree-based ensemble learning models such as LightGBM, RF, CatBoost, and XGBoost and compared them with traditional LR and SVM models.</p><p>To ensure a fair comparison among candidate models, this study adopted a unified hyperparameter optimization strategy based on Optuna. Optuna is a Bayesian optimization framework that uses a tree-structured Parzen estimator to efficiently explore the hyperparameter space by prioritizing promising regions. Under the same optimization budget and the same stratified <italic>k</italic>-fold cross-validation protocol, all 6 models were tuned and evaluated consistently. The results show that after optimization, all models achieved consistently high cross-validated performance, while the differences among the top-performing models remained small on the held-out test set. This finding suggests that the current feature set and evaluation setting already provide strong predictive ability, and further improvements are more likely to depend on feature refinement or decision strategy rather than extensive hyperparameter tuning. In addition, the top-performing models show comparable overall performance, but each presents advantages under different evaluation priorities. Specifically, models such as LightGBM and CatBoost demonstrate stronger overall discrimination, whereas XGBoost tends to perform better when recall- or <italic>F</italic><sub>1</sub>-related sensitivity is emphasized. Therefore, LightGBM and CatBoost were both retained as top-performing models for subsequent interpretability analysis. 
Job type reflects different dust exposure scenarios in coal mining and therefore contributes to CWP risk prediction, which is consistent with epidemiological evidence [<xref ref-type="bibr" rid="ref35">35</xref>]. The concentration, particle size, and composition of coal dust have a significant impact on the pathogenesis and prevalence of CWP [<xref ref-type="bibr" rid="ref36">36</xref>,<xref ref-type="bibr" rid="ref37">37</xref>]. The different working scenarios in coal mines in the same region also have a significant impact on the prevalence of CWP. The excavator workers are mainly responsible for developing tunnels, and the cut rocks are rich in free silica. The pathogenicity of silica dust is much higher than that of coal dust, which can lead to more severe pulmonary fibrosis (silicosis) and a shorter onset period [<xref ref-type="bibr" rid="ref38">38</xref>,<xref ref-type="bibr" rid="ref39">39</xref>]. Coal miners mainly come into contact with coal dust (carbon-based dust), which has relatively weaker pathogenicity compared to silica dust and slower disease progression. At the same time, the excavation face is a temporary work site, and the ventilation and dust removal facilities are usually not as complete as those in the coal mining face, resulting in greater difficulty in dust control [<xref ref-type="bibr" rid="ref40">40</xref>]. Mixed workers for excavation and coal mining are exposed to silica dust and coal dust simultaneously, and the synergistic effect of the 2 types of dust may accelerate lung damage [<xref ref-type="bibr" rid="ref41">41</xref>,<xref ref-type="bibr" rid="ref42">42</xref>].</p><p>To examine whether the clinical variables provide predictive value beyond occupational history, we conducted an ablation analysis by retraining the baseline models after removing the job-type variable. 
Because hyperparameter tuning led to only minor changes in performance, using baseline models for this analysis was sufficient to evaluate the independent contribution of clinical variables. The results of each model are summarized in <xref ref-type="table" rid="table4">Table 4</xref>, which showed that model performance remained highly robust. For example, the AUC of LightGBM only slightly changed from 0.978 (with job type) to 0.973 (without job type), indicating that the physiological signals captured by clinical features and biomarkers are major contributors to the model&#x2019;s predictive capability.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Ablation analysis of model performance without job type.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Accuracy</td><td align="left" valign="bottom">Precision</td><td align="left" valign="bottom">Recall</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score</td><td align="left" valign="bottom">AUC<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top">LightGBM<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></td><td align="char" char="." valign="top">0.982</td><td align="char" char="." valign="top">1</td><td align="char" char="." valign="top">0.907</td><td align="char" char="." valign="top">0.951</td><td align="char" char="." valign="top">0.973</td></tr><tr><td align="left" valign="top">CatBoost<sup><xref ref-type="table-fn" rid="table4fn3">c</xref></sup></td><td align="char" char="." valign="top">0.978</td><td align="char" char="." valign="top">0.98</td><td align="char" char="." valign="top">0.907</td><td align="char" char="." valign="top">0.942</td><td align="char" char="." 
valign="top">0.977</td></tr><tr><td align="left" valign="top">XGBoost<sup><xref ref-type="table-fn" rid="table4fn4">d</xref></sup></td><td align="char" char="." valign="top">0.982</td><td align="char" char="." valign="top">1</td><td align="char" char="." valign="top">0.907</td><td align="char" char="." valign="top">0.951</td><td align="char" char="." valign="top">0.978</td></tr><tr><td align="left" valign="top">RF<sup><xref ref-type="table-fn" rid="table4fn5">e</xref></sup></td><td align="char" char="." valign="top">0.982</td><td align="char" char="." valign="top">1</td><td align="char" char="." valign="top">0.907</td><td align="char" char="." valign="top">0.951</td><td align="char" char="." valign="top">0.976</td></tr><tr><td align="left" valign="top">LR<sup><xref ref-type="table-fn" rid="table4fn6">f</xref></sup></td><td align="char" char="." valign="top">0.963</td><td align="char" char="." valign="top">0.893</td><td align="char" char="." valign="top">0.926</td><td align="char" char="." valign="top">0.909</td><td align="char" char="." valign="top">0.969</td></tr><tr><td align="left" valign="top">SVM<sup><xref ref-type="table-fn" rid="table4fn7">g</xref></sup></td><td align="char" char="." valign="top">0.974</td><td align="char" char="." valign="top">0.98</td><td align="char" char="." valign="top">0.889</td><td align="char" char="." valign="top">0.932</td><td align="char" char="." 
valign="top">0.971</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>AUC: area under the curve.</p></fn><fn id="table4fn2"><p><sup>b</sup>LightGBM: light gradient boosting machine.</p></fn><fn id="table4fn3"><p><sup>c</sup>CatBoost: categorical boosting.</p></fn><fn id="table4fn4"><p><sup>d</sup>XGBoost: extreme gradient boosting.</p></fn><fn id="table4fn5"><p><sup>e</sup>RF: random forest.</p></fn><fn id="table4fn6"><p><sup>f</sup>LR: logistic regression.</p></fn><fn id="table4fn7"><p><sup>g</sup>SVM: support vector machine.</p></fn></table-wrap-foot></table-wrap><p>This finding is also supported by the SHAP interpretation results. The decrease in FEV1/FVC reflects impaired lung function, while the corresponding increase in SHAP value suggests that ventilation function may be an important feature of CWP. This is consistent with the results of existing studies suggesting that alveolar-arterial oxygen gradient in lung function can be used as a predictor of CWP [<xref ref-type="bibr" rid="ref43">43</xref>]. Platelets, as important cells for hemostasis and coagulation, play a role by participating in systemic inflammatory and immune responses, providing new therapeutic targets for inflammatory diseases [<xref ref-type="bibr" rid="ref44">44</xref>]. For example, a previous study found that lower PLT levels were associated with a higher risk of developing severe <italic>Mycoplasma pneumoniae</italic> pneumonia [<xref ref-type="bibr" rid="ref45">45</xref>]. Due to the important role of platelets in inflammation and tissue repair, this phenomenon may be related to inflammation or weakened immune function leading to lung damage. These findings emphasize the importance of clinical features in CWP risk assessment and provide new perspectives for a deeper understanding of the pathogenesis of CWP.</p><p>Despite its contributions, this study has several limitations. 
First, the cohort was derived from a single center and a specific occupational group, which may introduce regional or selection bias and limit generalizability to other settings. Second, although model performance was evaluated rigorously, interpretability remains limited and warrants further investigation. Most importantly, smoking history was not available in the retrospective physical examination records. Because smoking is a major confounder for both lung function and inflammatory biomarkers, part of the observed discrimination may reflect unmeasured differences in smoking behavior rather than CWP status alone. Furthermore, differences in physical demands and lifestyle factors associated with distinct job roles could potentially influence certain biomarkers. While our analysis indicates strong independent predictive value for the biomarkers, future studies should consider more granular lifestyle adjustments.</p></sec><sec id="s4-2"><title>Conclusions</title><p>This study developed an ML-based model for CWP prediction using multidimensional clinical features. The 6 candidate models achieved consistently high performance, and Optuna-based tuning resulted in only small changes, suggesting robust prediction under the current protocol. SHAP analysis identified age, FEV1/FVC, and PLT as key contributors to CWP risk prediction. Moreover, ablation analysis showed that the models remained highly accurate even without job type, indicating that clinical biomarkers provide strong predictive signals beyond occupational information. 
These results support the potential of routine clinical data for early CWP screening and intervention.</p></sec></sec></body><back><notes><sec><title>Funding</title><p>This study was supported by the National Natural Science Foundation of China (82405130) and the Natural Science Foundation of Jiangsu Province (BK20220236).</p></sec><sec><title>Data Availability</title><p>The datasets analyzed during this study are available from the corresponding author upon reasonable request.</p></sec></notes><fn-group><fn fn-type="con"><p>HL, YC, WH, and DZ were responsible for the conceptualization and design of the study. JJ, XS, and YD performed the data extraction and the data analysis. HL and WH provided clinical advice and recommendations on usability and clinical relevance. HL, JJ, and SW drafted the original manuscript. All authors critically reviewed and approved the final manuscript.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AEC</term><def><p>absolute eosinophil count</p></def></def-item><def-item><term id="abb2">ALC</term><def><p>absolute lymphocyte count</p></def></def-item><def-item><term id="abb3">ALT</term><def><p>alanine aminotransferase</p></def></def-item><def-item><term id="abb4">AMC</term><def><p>absolute monocyte count</p></def></def-item><def-item><term id="abb5">ANC</term><def><p>absolute neutrophil count</p></def></def-item><def-item><term id="abb6">AUC</term><def><p>area under the curve</p></def></def-item><def-item><term id="abb7">CatBoost</term><def><p>categorical boosting</p></def></def-item><def-item><term id="abb8">CWP</term><def><p>coal workers&#x2019; pneumoconiosis</p></def></def-item><def-item><term id="abb9">FEV1/FVC</term><def><p>forced expiratory volume in 1 second/forced vital capacity</p></def></def-item><def-item><term id="abb10">KNN</term><def><p>k-nearest neighbor</p></def></def-item><def-item><term id="abb11">LASSO</term><def><p>Least Absolute Shrinkage 
and Selection Operator</p></def></def-item><def-item><term id="abb12">LightGBM</term><def><p>light gradient boosting machine</p></def></def-item><def-item><term id="abb13">LR</term><def><p>logistic regression</p></def></def-item><def-item><term id="abb14">ML</term><def><p>machine learning</p></def></def-item><def-item><term id="abb15">PLT</term><def><p>platelet count</p></def></def-item><def-item><term id="abb16">RF</term><def><p>random forest</p></def></def-item><def-item><term id="abb17">SHAP</term><def><p>Shapley Additive Explanation</p></def></def-item><def-item><term id="abb18">SVM</term><def><p>support vector machine</p></def></def-item><def-item><term id="abb19">WBC</term><def><p>white blood cell count</p></def></def-item><def-item><term id="abb20">XGBoost</term><def><p>extreme gradient boosting</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Vanka</surname><given-names>KS</given-names> </name><name name-style="western"><surname>Shukla</surname><given-names>S</given-names> </name><name name-style="western"><surname>Gomez</surname><given-names>HM</given-names> </name><etal/></person-group><article-title>Understanding the pathogenesis of occupational coal and silica dust-associated lung disease</article-title><source>Eur Respir Rev</source><year>2022</year><month>09</month><day>30</day><volume>31</volume><issue>165</issue><fpage>210250</fpage><pub-id pub-id-type="doi">10.1183/16000617.0250-2021</pub-id><pub-id pub-id-type="medline">35831008</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Blackley</surname><given-names>DJ</given-names> </name><name name-style="western"><surname>Hall</surname><given-names>NB</given-names> </name><name 
name-style="western"><surname>Flattery</surname><given-names>J</given-names> </name><name name-style="western"><surname>Harris</surname><given-names>DA</given-names> </name><name name-style="western"><surname>Cummings</surname><given-names>KJ</given-names> </name><name name-style="western"><surname>Laney</surname><given-names>AS</given-names> </name></person-group><article-title>Rise in lung transplants for coal workers&#x2019; pneumoconiosis and silicosis</article-title><source>Am J Respir Crit Care Med</source><year>2025</year><month>04</month><volume>211</volume><issue>4</issue><fpage>642</fpage><lpage>644</lpage><pub-id pub-id-type="doi">10.1164/rccm.202409-1767RL</pub-id><pub-id pub-id-type="medline">39805092</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Current status, trends, and predictions in the burden of coal worker&#x2019;s pneumoconiosis in 204 countries and territories from 1990 to 2019</article-title><source>Heliyon</source><year>2024</year><month>10</month><day>15</day><volume>10</volume><issue>19</issue><fpage>e37940</fpage><pub-id pub-id-type="doi">10.1016/j.heliyon.2024.e37940</pub-id><pub-id pub-id-type="medline">39381106</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Akira</surname><given-names>M</given-names> </name><name name-style="western"><surname>Suganuma</surname><given-names>N</given-names> </name></person-group><article-title>Imaging diagnosis of pneumoconiosis with predominant nodular pattern: HRCT and pathologic 
findings</article-title><source>Clin Imaging</source><year>2023</year><month>05</month><volume>97</volume><fpage>28</fpage><lpage>33</lpage><pub-id pub-id-type="doi">10.1016/j.clinimag.2023.02.010</pub-id><pub-id pub-id-type="medline">36878176</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mandrioli</surname><given-names>D</given-names> </name><name name-style="western"><surname>Schl&#x00FC;nssen</surname><given-names>V</given-names> </name><name name-style="western"><surname>&#x00C1;d&#x00E1;m</surname><given-names>B</given-names> </name><etal/></person-group><article-title>WHO/ILO work-related burden of disease and injury: protocol for systematic reviews of occupational exposure to dusts and/or fibres and of the effect of occupational exposure to dusts and/or fibres on pneumoconiosis</article-title><source>Environ Int</source><year>2018</year><month>10</month><volume>119</volume><fpage>174</fpage><lpage>185</lpage><pub-id pub-id-type="doi">10.1016/j.envint.2018.06.005</pub-id><pub-id pub-id-type="medline">29958118</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Deng</surname><given-names>H</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>M</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>T</given-names> </name></person-group><article-title>ERCC1 which affects lipids metabolism and actin dynamics in coal workers&#x2019; pneumoconiosis is a candidate biomarker for early warning and diagnosis</article-title><source>PLoS One</source><year>2024</year><volume>19</volume><issue>9</issue><fpage>e0308082</fpage><pub-id 
pub-id-type="doi">10.1371/journal.pone.0308082</pub-id><pub-id pub-id-type="medline">39283905</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Xiong</surname><given-names>T</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Gut microbiome and metabolome profiling in coal workers&#x2019; pneumoconiosis: potential links to pulmonary function</article-title><source>Microbiol Spectr</source><year>2024</year><month>11</month><day>5</day><volume>12</volume><issue>11</issue><fpage>e0004924</fpage><pub-id pub-id-type="doi">10.1128/spectrum.00049-24</pub-id><pub-id pub-id-type="medline">39283109</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>JS</given-names> </name><name name-style="western"><surname>Shin</surname><given-names>JH</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>KM</given-names> </name><etal/></person-group><article-title>Serum levels of TGF-&#x03B2;1 and MCP-1 as biomarkers for progressive coal workers&#x2019; pneumoconiosis in retired coal workers: a three-year follow-up study</article-title><source>Ind Health</source><year>2014</year><volume>52</volume><issue>2</issue><fpage>129</fpage><lpage>136</lpage><pub-id pub-id-type="doi">10.2486/indhealth.2013-0112</pub-id><pub-id pub-id-type="medline">24464026</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>JS</given-names> </name><name 
name-style="western"><surname>Shin</surname><given-names>JH</given-names> </name><name name-style="western"><surname>Choi</surname><given-names>BS</given-names> </name></person-group><article-title>Serum levels of IL-8 and ICAM-1 as biomarkers for progressive massive fibrosis in coal workers&#x2019; pneumoconiosis</article-title><source>J Korean Med Sci</source><year>2015</year><month>02</month><volume>30</volume><issue>2</issue><fpage>140</fpage><lpage>144</lpage><pub-id pub-id-type="doi">10.3346/jkms.2015.30.2.140</pub-id><pub-id pub-id-type="medline">25653483</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Huang</surname><given-names>R</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>T</given-names> </name><name name-style="western"><surname>Li</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Hu</surname><given-names>J</given-names> </name></person-group><article-title>Upregulated has-miR-4516 as a potential biomarker for early diagnosis of dust-induced pulmonary fibrosis in patients with pneumoconiosis</article-title><source>Toxicol Res</source><year>2018</year><volume>7</volume><issue>3</issue><fpage>415</fpage><lpage>422</lpage><pub-id pub-id-type="doi">10.1039/C8TX00031J</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Shi</surname><given-names>J</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Lipidomics profiles and lipid metabolite biomarkers in serum of coal workers&#x2019; 
pneumoconiosis</article-title><source>Toxics</source><year>2022</year><month>08</month><day>26</day><volume>10</volume><issue>9</issue><fpage>496</fpage><pub-id pub-id-type="doi">10.3390/toxics10090496</pub-id><pub-id pub-id-type="medline">36136461</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Song</surname><given-names>J</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>M</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Exosomal miRNAs contribute to coal dust particle-induced pulmonary fibrosis in rats</article-title><source>Ecotoxicol Environ Saf</source><year>2023</year><month>01</month><day>1</day><volume>249</volume><fpage>114454</fpage><pub-id pub-id-type="doi">10.1016/j.ecoenv.2022.114454</pub-id><pub-id pub-id-type="medline">38321673</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhou</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Xing</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Expression levels of surfactant-associated proteins and inflammation cytokines in serum and bronchoalveolar lavage fluid among coal miners: a case-control study</article-title><source>J Occup Environ Med</source><year>2014</year><month>05</month><volume>56</volume><issue>5</issue><fpage>484</fpage><lpage>488</lpage><pub-id pub-id-type="doi">10.1097/JOM.0000000000000169</pub-id><pub-id pub-id-type="medline">24806560</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Prince</surname><given-names>P</given-names> </name><name name-style="western"><surname>Boulay</surname><given-names>ME</given-names> </name><name name-style="western"><surname>Pag&#x00E9;</surname><given-names>N</given-names> </name><name name-style="western"><surname>Desmeules</surname><given-names>M</given-names> </name><name name-style="western"><surname>Boulet</surname><given-names>LP</given-names> </name></person-group><article-title>Induced sputum markers of fibrosis and decline in pulmonary function in asbestosis and silicosis: a pilot study</article-title><source>Int J Tuberc Lung Dis</source><year>2008</year><month>07</month><volume>12</volume><issue>7</issue><fpage>813</fpage><lpage>819</lpage><pub-id pub-id-type="medline">18544209</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Xuan</surname><given-names>W</given-names> </name><name name-style="western"><surname>Zheng</surname><given-names>L</given-names> </name><name name-style="western"><surname>Bunes</surname><given-names>BR</given-names> </name><name name-style="western"><surname>Crane</surname><given-names>N</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>F</given-names> </name><name name-style="western"><surname>Zang</surname><given-names>L</given-names> </name></person-group><article-title>Engineering solutions to breath tests based on an e-nose system for silicosis screening and early detection in miners</article-title><source>J Breath Res</source><year>2022</year><month>04</month><day>7</day><volume>16</volume><issue>3</issue><pub-id pub-id-type="doi">10.1088/1752-7163/ac5f13</pub-id><pub-id pub-id-type="medline">35303733</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Shi</surname><given-names>H</given-names> </name><name name-style="western"><surname>You</surname><given-names>M</given-names> </name><name name-style="western"><surname>Li</surname><given-names>X</given-names> </name><etal/></person-group><article-title>Evaluation of factors associated with adult skeletal fluorosis in coal-burning type of endemic fluorosis and initial screening model based on machine learning in Guizhou, Southwest China</article-title><source>Ecotoxicol Environ Saf</source><year>2025</year><month>03</month><day>15</day><volume>293</volume><fpage>118018</fpage><pub-id pub-id-type="doi">10.1016/j.ecoenv.2025.118018</pub-id><pub-id pub-id-type="medline">40073783</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Qian</surname><given-names>X</given-names> </name><name name-style="western"><surname>Li</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>X</given-names> </name><etal/></person-group><article-title>A cardiovascular disease prediction model based on routine physical examination indicators using machine learning methods: a cohort study</article-title><source>Front Cardiovasc Med</source><year>2022</year><volume>9</volume><fpage>854287</fpage><pub-id pub-id-type="doi">10.3389/fcvm.2022.854287</pub-id><pub-id pub-id-type="medline">35783868</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hang</surname><given-names>W</given-names> </name><name name-style="western"><surname>Bu</surname><given-names>C</given-names> </name><name name-style="western"><surname>Cui</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Research progress on the pathogenesis and prediction of pneumoconiosis 
among coal miners</article-title><source>Environ Geochem Health</source><year>2024</year><month>07</month><day>16</day><volume>46</volume><issue>9</issue><fpage>319</fpage><pub-id pub-id-type="doi">10.1007/s10653-024-02114-z</pub-id><pub-id pub-id-type="medline">39012521</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mrad</surname><given-names>AB</given-names> </name><name name-style="western"><surname>Lahiani</surname><given-names>A</given-names> </name><name name-style="western"><surname>Mefteh-Wali</surname><given-names>S</given-names> </name><name name-style="western"><surname>Mselmi</surname><given-names>N</given-names> </name></person-group><article-title>Correction: a comparative analysis of machine learning techniques for imbalanced data</article-title><source>Ann Oper Res</source><year>2024</year><month>10</month><volume>341</volume><issue>2-3</issue><fpage>1349</fpage><lpage>1349</lpage><pub-id pub-id-type="doi">10.1007/s10479-024-06079-1</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zheng</surname><given-names>H</given-names> </name><name name-style="western"><surname>Zhu</surname><given-names>W</given-names> </name><name name-style="western"><surname>Niu</surname><given-names>Z</given-names> </name><etal/></person-group><article-title>A novel nutrition-based nomogram to predict prognosis after curative resection of gastric cancer</article-title><source>Front Nutr</source><year>2021</year><volume>8</volume><fpage>664620</fpage><pub-id pub-id-type="doi">10.3389/fnut.2021.664620</pub-id><pub-id pub-id-type="medline">34760907</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Bent&#x00E9;jac</surname><given-names>C</given-names> </name><name name-style="western"><surname>Cs&#x00F6;rg&#x0151;</surname><given-names>A</given-names> </name><name name-style="western"><surname>Mart&#x00ED;nez-Mu&#x00F1;oz</surname><given-names>G</given-names> </name></person-group><article-title>A comparative analysis of gradient boosting algorithms</article-title><source>Artif Intell Rev</source><year>2021</year><month>03</month><volume>54</volume><issue>3</issue><fpage>1937</fpage><lpage>1967</lpage><pub-id pub-id-type="doi">10.1007/s10462-020-09896-5</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hajihosseinlou</surname><given-names>M</given-names> </name><name name-style="western"><surname>Maghsoudi</surname><given-names>A</given-names> </name><name name-style="western"><surname>Ghezelbash</surname><given-names>R</given-names> </name></person-group><article-title>A novel scheme for mapping of MVT-type Pb&#x2013;Zn prospectivity: LightGBM, a highly efficient gradient boosting decision tree machine learning algorithm</article-title><source>Nat Resour Res</source><year>2023</year><month>12</month><volume>32</volume><issue>6</issue><fpage>2417</fpage><lpage>2438</lpage><pub-id pub-id-type="doi">10.1007/s11053-023-10249-6</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hancock</surname><given-names>JT</given-names> </name><name name-style="western"><surname>Khoshgoftaar</surname><given-names>TM</given-names> </name></person-group><article-title>CatBoost for big data: an interdisciplinary review</article-title><source>J Big Data</source><year>2020</year><volume>7</volume><issue>1</issue><fpage>94</fpage><pub-id pub-id-type="doi">10.1186/s40537-020-00369-8</pub-id><pub-id 
pub-id-type="medline">33169094</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Breiman</surname><given-names>L</given-names> </name></person-group><article-title>Random forests</article-title><source>Mach Learn</source><year>2001</year><month>10</month><volume>45</volume><issue>1</issue><fpage>5</fpage><lpage>32</lpage><pub-id pub-id-type="doi">10.1023/A:1010933404324</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mirbagheri</surname><given-names>B</given-names> </name><name name-style="western"><surname>Alimohammadi</surname><given-names>A</given-names> </name></person-group><article-title>Improving urban cellular automata performance by integrating global and geographically weighted logistic regression models</article-title><source>Trans GIS</source><year>2017</year><month>12</month><volume>21</volume><issue>6</issue><fpage>1280</fpage><lpage>1297</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://onlinelibrary.wiley.com/toc/14679671/21/6">https://onlinelibrary.wiley.com/toc/14679671/21/6</ext-link></comment><pub-id pub-id-type="doi">10.1111/tgis.12278</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>W</given-names> </name><name name-style="western"><surname>Pourghasemi</surname><given-names>HR</given-names> </name><name name-style="western"><surname>Naghibi</surname><given-names>SA</given-names> </name></person-group><article-title>A comparative study of landslide susceptibility maps produced using support vector machine with different kernel functions and entropy data mining models in China</article-title><source>Bull Eng Geol 
Environ</source><year>2018</year><month>05</month><volume>77</volume><issue>2</issue><fpage>647</fpage><lpage>664</lpage><pub-id pub-id-type="doi">10.1007/s10064-017-1010-y</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rahmatinejad</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Dehghani</surname><given-names>T</given-names> </name><name name-style="western"><surname>Hoseini</surname><given-names>B</given-names> </name><etal/></person-group><article-title>A comparative study of explainable ensemble learning and logistic regression for predicting in-hospital mortality in the emergency department</article-title><source>Sci Rep</source><year>2024</year><month>02</month><day>10</day><volume>14</volume><issue>1</issue><fpage>3406</fpage><pub-id pub-id-type="doi">10.1038/s41598-024-54038-4</pub-id><pub-id pub-id-type="medline">38337000</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Xiao</surname><given-names>JL</given-names> </name></person-group><article-title>SVM and KNN ensemble learning for traffic incident detection</article-title><source>Physica A Stat Mech Its Appl</source><year>2019</year><month>03</month><volume>517</volume><fpage>29</fpage><lpage>35</lpage><pub-id pub-id-type="doi">10.1016/j.physa.2018.10.060</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Budholiya</surname><given-names>K</given-names> </name><name name-style="western"><surname>Shrivastava</surname><given-names>SK</given-names> </name><name name-style="western"><surname>Sharma</surname><given-names>V</given-names> </name></person-group><article-title>An optimized XGBoost based 
diagnostic system for effective prediction of heart disease</article-title><source>J King Saud Univ Comput Inf Sci</source><year>2022</year><month>07</month><volume>34</volume><issue>7</issue><fpage>4514</fpage><lpage>4523</lpage><pub-id pub-id-type="doi">10.1016/j.jksuci.2020.10.013</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Thorjussen</surname><given-names>CBH</given-names> </name><name name-style="western"><surname>Liland</surname><given-names>KH</given-names> </name><name name-style="western"><surname>M&#x00E5;ge</surname><given-names>I</given-names> </name><name name-style="western"><surname>Solberg</surname><given-names>LE</given-names> </name></person-group><article-title>Computational test for conditional independence</article-title><source>Algorithms</source><year>2024</year><volume>17</volume><issue>8</issue><fpage>323</fpage><pub-id pub-id-type="doi">10.3390/a17080323</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Khan</surname><given-names>F</given-names> </name><name name-style="western"><surname>Tarimer</surname><given-names>I</given-names> </name><name name-style="western"><surname>Alwageed</surname><given-names>HS</given-names> </name><etal/></person-group><article-title>Effect of feature selection on the accuracy of music popularity classification using machine learning algorithms</article-title><source>Electronics (Basel)</source><year>2022</year><volume>11</volume><issue>21</issue><fpage>3518</fpage><pub-id pub-id-type="doi">10.3390/electronics11213518</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Senbagamalar</surname><given-names>L</given-names> </name><name 
name-style="western"><surname>Logeswari</surname><given-names>S</given-names> </name></person-group><article-title>Genetic clustering algorithm-based feature selection and divergent random forest for multiclass cancer classification using gene expression data</article-title><source>Int J Comput Intell Syst</source><year>2024</year><volume>17</volume><issue>1</issue><fpage>23</fpage><pub-id pub-id-type="doi">10.1007/s44196-024-00416-9</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yuan</surname><given-names>X</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>C</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>S</given-names> </name></person-group><article-title>A clustering-based adaptive undersampling ensemble method for highly unbalanced data classification</article-title><source>Appl Soft Comput</source><year>2024</year><month>07</month><volume>159</volume><fpage>111659</fpage><pub-id pub-id-type="doi">10.1016/j.asoc.2024.111659</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>van den Goorbergh</surname><given-names>R</given-names> </name><name name-style="western"><surname>van Smeden</surname><given-names>M</given-names> </name><name name-style="western"><surname>Timmerman</surname><given-names>D</given-names> </name><name name-style="western"><surname>Van Calster</surname><given-names>B</given-names> </name></person-group><article-title>The harm of class imbalance corrections for risk prediction models: illustration and simulation using logistic regression</article-title><source>J Am Med Inform Assoc</source><year>2022</year><month>08</month><day>16</day><volume>29</volume><issue>9</issue><fpage>1525</fpage><lpage>1534</lpage><pub-id 
pub-id-type="doi">10.1093/jamia/ocac093</pub-id><pub-id pub-id-type="medline">35686364</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cool</surname><given-names>CD</given-names> </name><name name-style="western"><surname>Murray</surname><given-names>J</given-names> </name><name name-style="western"><surname>Vorajee</surname><given-names>NI</given-names> </name><etal/></person-group><article-title>Pathologic findings in severe coal workers&#x2019; pneumoconiosis in contemporary US coal miners</article-title><source>Arch Pathol Lab Med</source><year>2024</year><month>07</month><day>1</day><volume>148</volume><issue>7</issue><fpage>805</fpage><lpage>817</lpage><pub-id pub-id-type="doi">10.5858/arpa.2022-0491-OA</pub-id><pub-id pub-id-type="medline">37852172</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Peng</surname><given-names>H</given-names> </name><name name-style="western"><surname>Tang</surname><given-names>A</given-names> </name><name name-style="western"><surname>Cheng</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Study of dust deposition pattern in the respiratory tract of dust particles less than 10 &#x03BC;m in size</article-title><source>Powder Technol</source><year>2024</year><month>08</month><volume>444</volume><fpage>120033</fpage><pub-id pub-id-type="doi">10.1016/j.powtec.2024.120033</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sun</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Kinsela</surname><given-names>AS</given-names> </name><name 
name-style="western"><surname>Cen</surname><given-names>X</given-names> </name><etal/></person-group><article-title>Impact of reactive iron in coal mine dust on oxidant generation and epithelial lung cell viability</article-title><source>Sci Total Environ</source><year>2022</year><month>03</month><day>1</day><volume>810</volume><fpage>152277</fpage><pub-id pub-id-type="doi">10.1016/j.scitotenv.2021.152277</pub-id><pub-id pub-id-type="medline">34902414</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>B</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Peng</surname><given-names>C</given-names> </name><name name-style="western"><surname>Meng</surname><given-names>X</given-names> </name><name name-style="western"><surname>Jia</surname><given-names>Q</given-names> </name></person-group><article-title>Silicosis: from pathogenesis to therapeutics</article-title><source>Front Pharmacol</source><year>2025</year><volume>16</volume><fpage>1516200</fpage><pub-id pub-id-type="doi">10.3389/fphar.2025.1516200</pub-id><pub-id pub-id-type="medline">39944632</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Keles</surname><given-names>C</given-names> </name><name name-style="western"><surname>Sarver</surname><given-names>E</given-names> </name></person-group><article-title>A study of respirable silica in underground coal mines: particle characteristics</article-title><source>Minerals</source><year>2022</year><volume>12</volume><issue>12</issue><fpage>1555</fpage><pub-id pub-id-type="doi">10.3390/min12121555</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Li</surname><given-names>S</given-names> </name><name name-style="western"><surname>Ren</surname><given-names>T</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Lin</surname><given-names>H</given-names> </name><name name-style="western"><surname>Shuang</surname><given-names>H</given-names> </name></person-group><article-title>Respirable dust pollution characteristics within an underground heading face driven with continuous miner: a CFD modelling approach</article-title><source>J Clean Prod</source><year>2019</year><month>04</month><volume>217</volume><fpage>267</fpage><lpage>283</lpage><pub-id pub-id-type="doi">10.1016/j.jclepro.2019.01.273</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Yan</surname><given-names>J</given-names> </name><name name-style="western"><surname>Ren</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Coal dust particles can upregulate the expression of NLRP3 inflammasome components in rat alveolar macrophages through phagocytosis</article-title><source>Sci Rep</source><year>2025</year><month>03</month><day>15</day><volume>15</volume><issue>1</issue><fpage>8989</fpage><pub-id pub-id-type="doi">10.1038/s41598-025-93946-x</pub-id><pub-id pub-id-type="medline">40089559</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sun</surname><given-names>Y</given-names> </name><name 
name-style="western"><surname>Kinsela</surname><given-names>AS</given-names> </name><name name-style="western"><surname>Waite</surname><given-names>TD</given-names> </name></person-group><article-title>Elucidation of alveolar macrophage cell response to coal dusts: role of ferroptosis in pathogenesis of coal workers&#x2019; pneumoconiosis</article-title><source>Sci Total Environ</source><year>2022</year><month>06</month><day>1</day><volume>823</volume><fpage>153727</fpage><pub-id pub-id-type="doi">10.1016/j.scitotenv.2022.153727</pub-id><pub-id pub-id-type="medline">35149061</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Dong</surname><given-names>H</given-names> </name><name name-style="western"><surname>Zhu</surname><given-names>B</given-names> </name><name name-style="western"><surname>Kong</surname><given-names>X</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>X</given-names> </name></person-group><article-title>Efficient clinical data analysis for prediction of coal workers&#x2019; pneumoconiosis using machine learning algorithms</article-title><source>Clin Respir J</source><year>2023</year><month>07</month><volume>17</volume><issue>7</issue><fpage>684</fpage><lpage>693</lpage><pub-id pub-id-type="doi">10.1111/crj.13657</pub-id><pub-id pub-id-type="medline">37380332</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bo</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Lu</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Li</surname><given-names>B</given-names> </name><name name-style="western"><surname>Sha</surname><given-names>R</given-names> </name><name 
name-style="western"><surname>Yu</surname><given-names>H</given-names> </name><name name-style="western"><surname>Miao</surname><given-names>C</given-names> </name></person-group><article-title>The role of platelets in central hubs of inflammation: a literature review</article-title><source>Medicine (Baltimore)</source><year>2024</year><month>05</month><day>10</day><volume>103</volume><issue>19</issue><fpage>e38115</fpage><pub-id pub-id-type="doi">10.1097/MD.0000000000038115</pub-id><pub-id pub-id-type="medline">38728509</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ye</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Gao</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>J</given-names> </name><name name-style="western"><surname>Chu</surname><given-names>C</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>W</given-names> </name></person-group><article-title>A machine learning model for predicting severe Mycoplasma pneumoniae pneumonia in school-aged children</article-title><source>BMC Infect Dis</source><year>2025</year><month>04</month><day>21</day><volume>25</volume><issue>1</issue><fpage>570</fpage><pub-id pub-id-type="doi">10.1186/s12879-025-10958-8</pub-id><pub-id pub-id-type="medline">40259232</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Cross-validation curve of least absolute shrinkage and selection operator regression, least absolute shrinkage and selection operator coefficient chart, cross-validation of 6 models and receiver operating characteristic curves of the final model, receiver operating characteristic curves of the optimized model, and 
Shapley Additive Explanations dependency graph.</p><media xlink:href="medinform_v14i1e80156_app1.docx" xlink:title="DOCX File, 17478 KB"/></supplementary-material></app-group></back></article>