<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id><journal-id journal-id-type="publisher-id">medinform</journal-id><journal-id journal-id-type="index">7</journal-id><journal-title>JMIR Medical Informatics</journal-title><abbrev-journal-title>JMIR Med Inform</abbrev-journal-title><issn pub-type="epub">2291-9694</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v14i1e86700</article-id><article-id pub-id-type="doi">10.2196/86700</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Leveraging Large Language Models to Integrate Clinical Knowledge and Machine Learning Predictions for Lymph Node Metastasis Prediction: Development of a Knowledge-Augmented Framework</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Yu</surname><given-names>Hongying</given-names></name><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Liu</surname><given-names>Bing</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Zeng</surname><given-names>Xian</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Ren</surname><given-names>Mucheng</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Cao</surname><given-names>Zheng</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Zhu</surname><given-names>Xiaofeng</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Lu</surname><given-names>Xudong</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Xu</surname><given-names>Jun</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Wu</surname><given-names>Nan</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Hu</surname><given-names>Danqing</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Jiangsu Key Laboratory of Intelligent Medical Image Computing, School of Artificial Intelligence, Nanjing University of Information Science and Technology</institution><addr-line>Linjiang Building, No.219, Ningliu Road</addr-line><addr-line>Nanjing</addr-line><country>China</country></aff><aff id="aff2"><institution>Key Laboratory of Carcinogenesis and Translational Research (Ministry of Education), Department of Thoracic Surgery II, Peking University Cancer Hospital and Institute</institution><addr-line>Beijing</addr-line><country>China</country></aff><aff id="aff3"><institution>Zhejiang 
Lab</institution><addr-line>Hangzhou</addr-line><country>China</country></aff><aff id="aff4"><institution>College of Biomedical Engineering and Instrument Science, Zhejiang University</institution><addr-line>Hangzhou</addr-line><country>China</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Benis</surname><given-names>Arriel</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Ismayilov</surname><given-names>Rashad</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Xu</surname><given-names>Weilin</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Danqing Hu, PhD, Jiangsu Key Laboratory of Intelligent Medical Image Computing, School of Artificial Intelligence, Nanjing University of Information Science and Technology, Linjiang Building, No.219, Ningliu Road, Nanjing, 210044, China, 1 13291879390; <email>danqinghu@nuist.edu.cn</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>22</day><month>6</month><year>2026</year></pub-date><volume>14</volume><elocation-id>e86700</elocation-id><history><date date-type="received"><day>29</day><month>10</month><year>2025</year></date><date date-type="rev-recd"><day>25</day><month>05</month><year>2026</year></date><date date-type="accepted"><day>03</day><month>06</month><year>2026</year></date></history><copyright-statement>&#x00A9; Hongying Yu, Bing Liu, Xian Zeng, Mucheng Ren, Zheng Cao, Xiaofeng Zhu, Xudong Lu, Jun Xu, Nan Wu, Danqing Hu. Originally published in JMIR Medical Informatics (<ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org">https://medinform.jmir.org</ext-link>), 22.6.2026. 
</copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org/">https://medinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://medinform.jmir.org/2026/1/e86700"/><abstract><sec><title>Background</title><p>Lymph node metastasis (LNM) is a critical clinical indicator for determining the initial treatment strategy for patients with lung cancer. However, accurately diagnosing LNM preoperatively remains a significant challenge. Data-driven predictive modeling has become a mainstream approach to address this issue, yet it often overlooks existing clinical knowledge. Large language models (LLMs) have demonstrated the potential to predict clinical risks in a zero-shot manner based on the extensive clinical knowledge learned from large-scale corpora.</p></sec><sec><title>Objective</title><p>LLMs have demonstrated the potential to predict clinical risks in a zero-shot manner based on the extensive clinical knowledge learned from large-scale corpora. 
This study aims to investigate the integration of LLM-derived knowledge with data-driven patterns to enhance the accuracy of LNM prediction.</p></sec><sec sec-type="methods"><title>Methods</title><p>We propose a novel ensemble framework that combines the strengths of LLMs and machine learning (ML) models for LNM prediction in lung cancer. Specifically, 3 ML models were trained using clinical data, and their predicted probabilities, along with the original clinical features, were incorporated into prompts for LLMs. Three LLMs&#x2014;GPT-5.4, GPT-5.4-nano, and DeepSeek-V3.2&#x2014;were used to independently predict LNM risk 5 times, and 4 ensemble strategies were applied to aggregate their predictions into a final outcome.</p></sec><sec sec-type="results"><title>Results</title><p>The proposed approach was evaluated on clinical data from 767 patients with lung cancer at Peking University Cancer Hospital. Experimental results show that our proposed framework significantly outperforms base ML models, achieving an area under the curve of 0.781 and an average precision of 0.420. 
Compared with the nonreasoning English setting, both the reasoning English setting and nonreasoning Chinese setting showed a lower area under the curve but higher average precision.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>This study presents a novel knowledge-augmented strategy for integrating the clinical knowledge embedded in LLMs with the statistical patterns captured by ML models to improve the LNM prediction of lung cancer, offering a new paradigm for integrating medical knowledge and patient data in clinical predictions.</p></sec></abstract><kwd-group><kwd>large language models</kwd><kwd>machine learning models</kwd><kwd>lymph node metastasis</kwd><kwd>lung cancer</kwd><kwd>clinical risk prediction</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Lung cancer remains the leading cause of cancer-related mortality worldwide [<xref ref-type="bibr" rid="ref1">1</xref>]. For patients with early-stage lung cancer, surgical resection represents the only potentially curative treatment [<xref ref-type="bibr" rid="ref2">2</xref>]. The determination of lymph node metastasis (LNM) is critical in assessing surgical eligibility and the need for additional neoadjuvant therapy. However, accurately diagnosing LNM preoperatively through noninvasive examinations and tests poses significant challenges in clinical practice, often leading to suboptimal treatment decisions and adversely affecting patient outcomes [<xref ref-type="bibr" rid="ref3">3</xref>].</p><p>To achieve an accurate preoperative diagnosis of LNM, data-driven approaches have become the most commonly used methods for developing LNM prediction models. Initially, researchers used patient clinical characteristics in combination with statistical methods to construct predictive models [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>]. 
To leverage imaging data, the radiomics approach was introduced, allowing the extraction of first-order, second-order, texture, and other features from the image data, which were then integrated with clinical characteristics to enhance predictive precision [<xref ref-type="bibr" rid="ref6">6</xref>-<xref ref-type="bibr" rid="ref8">8</xref>]. To further explore the nonlinear relationships among these features, machine learning (ML) methods such as random forest (RF), support vector machine (SVM), and multilayer perceptron were used, resulting in improved model performance [<xref ref-type="bibr" rid="ref9">9</xref>-<xref ref-type="bibr" rid="ref12">12</xref>]. With the rapid advancement of deep learning, researchers began to use deep learning techniques to automatically extract deep features from images for LNM prediction [<xref ref-type="bibr" rid="ref13">13</xref>-<xref ref-type="bibr" rid="ref17">17</xref>]. Unlike radiomics methods, deep learning approaches do not require manual delineation of regions of interest in the images. Instead, they can directly extract deep image features related to the prediction target through error backpropagation, making deep learning the most popular and effective approach for LNM prediction.</p><p>Recently, large language models (LLMs), such as ChatGPT [<xref ref-type="bibr" rid="ref18">18</xref>] and GPT-4 [<xref ref-type="bibr" rid="ref19">19</xref>], have captured global attention due to their impressive text generation capabilities. These models, pretrained on vast corpora, demonstrate remarkable performance on previously unseen tasks using zero-shot, one-shot, or few-shot prompts without parameter updates [<xref ref-type="bibr" rid="ref20">20</xref>]. By incorporating reinforcement learning from human feedback [<xref ref-type="bibr" rid="ref21">21</xref>], LLMs are further refined to produce content that is safe and aligns with human expectations. 
This success has led to a paradigm shift in natural language processing research and is gradually influencing clinical prediction research [<xref ref-type="bibr" rid="ref22">22</xref>-<xref ref-type="bibr" rid="ref26">26</xref>].</p><p>Leveraging the medical knowledge learned from extensive corpora, LLMs show potential in diagnosing and evaluating patient prognoses. Many studies have investigated the capabilities of LLMs in predicting clinical outcomes such as readmission, length of stay, and hospital mortality [<xref ref-type="bibr" rid="ref27">27</xref>-<xref ref-type="bibr" rid="ref30">30</xref>]. These studies typically develop prompts using patient data and instruct LLMs to provide answers for specific tasks. Although LLMs can generate predictive results based on patient information and instructions prompted, their predictive performance rarely surpasses that of traditional data-driven ML models when they only use the medical knowledge they learned from the corpora [<xref ref-type="bibr" rid="ref27">27</xref>,<xref ref-type="bibr" rid="ref28">28</xref>].</p><p>In this study, we propose a novel knowledge-augmented method that integrates the medical knowledge of LLMs with the statistical patterns identified by data-driven models to predict LNM in lung cancer. This method first combines the clinical characteristics of patients with the risk probabilities predicted by ML models using prompt engineering, then ensembles the multiple responses of LLMs as the final predictions. When evaluated on real clinical data, our approach demonstrates that by combining the strengths of both knowledge-based and data-driven models, we can achieve superior predictive performance compared to using either model alone.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Patients</title><p>We collected data from 767 patients with lung cancer treated at Peking University Cancer Hospital. 
All patients underwent pulmonary resection with systematic mediastinal lymphadenectomy between 2010 and 2018 and received contrast-enhanced computed tomography (CT) scans and tumor biomarker tests within 2 months before surgery. Patients who received preoperative chemotherapy or radiotherapy were excluded to avoid potential confounding due to complete responses to these treatments.</p><p>The data collected included structured clinical information such as demographics and tumor biomarkers, as well as unstructured data such as disease history, CT scan, and pathological reports. A clinician annotated LNM statuses based on postoperative pathological reports, which were processed using our previously developed information extraction models [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref31">31</xref>], followed by a manual review by a clinician to ensure accuracy, which served as the gold standard labels. Data quality was further ensured through consistency checks and verification of missing values before model training.</p></sec><sec id="s2-2"><title>Ethical Considerations</title><p>Ethical approval for this study was granted by the ethics committee of Peking University Cancer Hospital (2022KT128) prior to its commencement. Informed consent was waived due to the retrospective design of this study. All data were stored securely. Identifiable information was removed prior to analysis, and no personally identifiable information was included in the study or its supplementary materials.</p></sec><sec id="s2-3"><title>Study Design</title><p>This study aims to integrate the advantages of LLMs and ML models to accurately predict LNM in patients with lung cancer. 
The overall study design is depicted in <xref ref-type="fig" rid="figure1">Figure 1</xref>.</p><p>First, unstructured clinical data were collected, and key features were extracted using information extraction models previously developed by our team [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref31">31</xref>]. The extracted features were then reviewed by the clinicians. Next, we combined the extracted features with the tabular clinical data to develop ML models to predict the risk of LNM in patients. We then constructed prompts for LLMs using the predicted probabilities and patient clinical features and gathered several responses from LLMs using the same prompt. Finally, we integrated the various predicted results of the LLMs to generate the final ensemble results.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Overall study design. LLM: large language model.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e86700_fig01.png"/></fig></sec><sec id="s2-4"><title>ML Models</title><p>In this study, we selected 3 classical ML methods, that is, logistic regression (LR), RF, and SVM, as well as a transformer-based deep learning model, to identify latent patterns between patient clinical data and LNM status. We trained and tested these ML models on our clinical data. We used the test results and predicted probabilities to construct the prompt, aiming to achieve the integration of data and knowledge.</p></sec><sec id="s2-5"><title>Prompt Design</title><p>The prompt template used in this study is shown in <xref ref-type="fig" rid="figure2">Figure 2</xref>.</p><p>The proposed prompt template consists of the following 5 elements:</p><list list-type="bullet"><list-item><p>Role: this element defines the role that LLMs should assume to generate responses for specific tasks. 
In this study, we instructed the LLMs to act as thoracic surgeons, who generally assess a patient&#x2019;s LNM and determine whether the patient can directly receive surgical resection.</p></list-item><list-item><p>Task: this element specifies the clinical prediction task assigned to LLMs. We instructed the LLMs to predict the likelihood that a patient would have N2 LNM.</p></list-item><list-item><p>Patient data: this element outlines the patient&#x2019;s clinical data used for the evaluation by LLMs. We provided patient demographics, disease history, tumor biomarkers, and CT reports. It is important to note that the original disease history and CT reports were in Chinese free-text format; therefore, we used the Google Translate Application Programming Interface (API) via googletrans to translate them into English. Additionally, for tumor biomarkers, we supplied the reference ranges as external knowledge.</p></list-item><list-item><p>Machine learning model result: this element is used to integrate the predicted result from the data-driven model as a reference for the LLMs. We only provide the predicted probability and the model type to prevent any potential data leakage issues.</p></list-item><list-item><p>Instruction: in this element, we instructed the LLMs to initially estimate the likelihood of N2 LNM based solely on the patient data. Subsequently, they were instructed to reestimate the likelihood by considering the N2 LNM rate and the predicted probability provided by the ML model. We also used the chain-of-thought strategy to require the LLMs to reason step by step. Additionally, the LLMs were instructed to provide their responses in JSON format with key-value pairs, such as &#x201C;Step by Step Explanation&#x201D;:&#x201C;&#x003C;string&#x003E;&#x201D; and &#x201C;Answer&#x201D;:&#x201C;&#x003C;float&#x003E;.&#x201D;</p></list-item></list><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Proposed prompt template. 
CEA: carcinoembryonic antigen; NSE: neuron-specific enolase.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e86700_fig02.png"/></fig></sec><sec id="s2-6"><title>Ensemble Models</title><p>Using the designed prompt template, we developed individualized prompts for each patient sample. We selected 3 LLMs&#x2014;GPT-5.4, GPT-5.4-nano, and DeepSeek-V3.2&#x2014;to generate these responses through the official APIs.</p><p>Considering that LLMs can produce varying outputs even with identical prompts, we input the same prompt 5 times for each patient to obtain 5 distinct responses. We then applied 4 strategies&#x2014;maximum value, minimum value, median value, and mean value&#x2014;to process these 5 responses and derive the ensemble results. The complete prompt template is shown in <xref ref-type="fig" rid="figure2">Figure 2</xref>.</p></sec><sec id="s2-7"><title>Experimental Setup</title><p>Before model training, missing values in the dataset were imputed. For categorical features (eg, smoking history, drinking history, family tumor history, gender, and comorbidities), the mode was used for imputation. For continuous features (eg, age, height, weight, tumor size, carcinoembryonic antigen [CEA], carbohydrate antigen 19-9 [CA19-9], carbohydrate antigen 125 [CA125], neuron-specific enolase [NSE], cytokeratin 19-fragments [Cyfra21-1], and squamous cell carcinoma antigen [SCCAg]), the median was used.</p><p>When developing ML models, a 10-fold cross-validation strategy was used to train and test the models. During each fold iteration, we used an additional 5-fold cross-validation to optimize hyperparameters, subsequently retraining the model on the entire training set using the best hyperparameters. The trained model was then tested on the test set to obtain the final test results. 
After the completion of all 10-fold iterations, we obtained 10 test results for each fold and the predicted probability of LNM for each patient. To ensure reproducibility, we set 30 as the random seed for the 10-fold stratified cross-validation and model training. Hyperparameters were optimized via grid search combined with 5-fold cross-validation. We set the class weight as &#x201C;balanced&#x201D; to address class imbalance for LR, RF, and SVM models.</p><p>All LLMs were accessed through the official APIs, and we used the default hyperparameters for response generation. Specifically, the default temperature value is 1 for DeepSeek-V3.2, GPT-5.4 (version: gpt-5.4-2026-03-05), and GPT-5.4-nano (version: gpt-5.4-nano-2026-03-17). No reasoning effort was enabled. We extracted the float values of the key &#x201C;Answer&#x201D; from the JSON format responses as the predicted probabilities of the LLMs. Then, we calculated the ensemble results based on the 5-time predictions as the final results.</p><p>In addition to the proposed approach that uses the predictions of ML models, we also evaluate the performance of LLMs alone in predicting N2 LNM. The corresponding prompt template is provided in <xref ref-type="fig" rid="figure3">Figure 3</xref>.</p><p>The performance of the models was evaluated using 2 metrics: the area under the receiver operating characteristic curve (AUC) and the average precision value (AP). To test the differences in performance between models, we used the paired 2-tailed <italic>t</italic> test. A <italic>P</italic> value of less than .05 was considered statistically significant.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Baseline prompt template. 
CEA: carcinoembryonic antigen; NSE: neuron-specific enolase.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e86700_fig03.png"/></fig></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Clinical Data</title><p>Among the 767 patients, 104 (13.6%) were confirmed to have N2 LNM according to their postoperative pathology reports. In this study, a total of 26 types of clinical features were included. Features such as spiculation, lobulation, mediastinal lymph node short axis, hilar lymph node short axis, tumor location, and tumor density were extracted from CT reports and reviewed by a clinician. <xref ref-type="table" rid="table1">Table 1</xref> presents the statistics of the clinical data.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>The statistics of the clinical data.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top">Clinical feature</td><td align="left" valign="top">Positive (n=104)</td><td align="left" valign="top">Negative (n=663)</td></tr></thead><tbody><tr><td align="left" valign="top">Age, mean (SD)</td><td align="left" valign="top">60.82 (9.02)</td><td align="left" valign="top">60.79 (9.53)</td></tr><tr><td align="left" valign="top">Height, mean (SD)</td><td align="left" valign="top">164.57 (6.93)</td><td align="left" valign="top">164.50 (7.92)</td></tr><tr><td align="left" valign="top">Weight, mean (SD)</td><td align="left" valign="top">66.93 (9.47)</td><td align="left" valign="top">65.59 (9.50)</td></tr><tr><td align="left" valign="top">Tumor long size, mean (SD)</td><td align="left" valign="top">3.01 (1.38)</td><td align="left" valign="top">2.56 (1.40)</td></tr><tr><td align="left" valign="top">Tumor short size, mean (SD)</td><td align="left" valign="top">2.38 (1.11)</td><td align="left" valign="top">1.99 (1.16)</td></tr><tr><td align="left" valign="top">CEA<sup><xref 
ref-type="table-fn" rid="table1fn1">a</xref></sup>, mean (SD)</td><td align="left" valign="top">12.76 (21.18)</td><td align="left" valign="top">4.24 (9.53)</td></tr><tr><td align="left" valign="top">CA19-9<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup>, mean (SD)</td><td align="left" valign="top">15.89 (20.96)</td><td align="left" valign="top">13.95 (15.39)</td></tr><tr><td align="left" valign="top">CA125<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup>, mean (SD)</td><td align="left" valign="top">19.96 (25.55)</td><td align="left" valign="top">13.47 (10.18)</td></tr><tr><td align="left" valign="top">NSE<sup><xref ref-type="table-fn" rid="table1fn4">d</xref></sup>, mean (SD)</td><td align="left" valign="top">16.25 (6.10)</td><td align="left" valign="top">15.68 (7.02)</td></tr><tr><td align="left" valign="top">Cyfra21-1<sup><xref ref-type="table-fn" rid="table1fn5">e</xref></sup>, mean (SD)</td><td align="left" valign="top">3.57 (4.21)</td><td align="left" valign="top">3.18 (3.34)</td></tr><tr><td align="left" valign="top">SCCAg<sup><xref ref-type="table-fn" rid="table1fn6">f</xref></sup>, mean (SD)</td><td align="left" valign="top">1.19 (1.81)</td><td align="left" valign="top">0.93 (0.97)</td></tr><tr><td align="left" valign="top" colspan="3">Gender, n</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Male</td><td align="left" valign="top">62</td><td align="left" valign="top">322</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Female</td><td align="left" valign="top">42</td><td align="left" valign="top">341</td></tr><tr><td align="left" valign="top" colspan="3">Smoking history, n</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Yes</td><td align="left" valign="top">55</td><td align="left" 
valign="top">272</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>No</td><td align="left" valign="top">49</td><td align="left" valign="top">391</td></tr><tr><td align="left" valign="top" colspan="3">Drinking history, n</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Yes</td><td align="left" valign="top">25</td><td align="left" valign="top">151</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>No</td><td align="left" valign="top">79</td><td align="left" valign="top">512</td></tr><tr><td align="left" valign="top" colspan="3">Family tumor history, n</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Yes</td><td align="left" valign="top">14</td><td align="left" valign="top">116</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>No</td><td align="left" valign="top">90</td><td align="left" valign="top">547</td></tr><tr><td align="left" valign="top" colspan="3">Hypertension, n</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Yes</td><td align="left" valign="top">37</td><td align="left" valign="top">184</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>No</td><td align="left" valign="top">67</td><td align="left" valign="top">479</td></tr><tr><td align="left" valign="top" colspan="3">Diabetes, n</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Yes</td><td align="left" valign="top">14</td><td align="left" valign="top">65</td></tr><tr><td align="left" 
valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>No</td><td align="left" valign="top">90</td><td align="left" valign="top">598</td></tr><tr><td align="left" valign="top" colspan="3">Tuberculosis history, n</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Yes</td><td align="left" valign="top">2</td><td align="left" valign="top">29</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>No</td><td align="left" valign="top">102</td><td align="left" valign="top">634</td></tr><tr><td align="left" valign="top" colspan="3">Cardiovascular diseases, n</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Yes</td><td align="left" valign="top">9</td><td align="left" valign="top">27</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>No</td><td align="left" valign="top">95</td><td align="left" valign="top">636</td></tr><tr><td align="left" valign="top" colspan="3">Cerebrovascular diseases, n</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Yes</td><td align="left" valign="top">6</td><td align="left" valign="top">23</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>No</td><td align="left" valign="top">98</td><td align="left" valign="top">640</td></tr><tr><td align="left" valign="top" colspan="3">Spiculation, n</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Yes</td><td align="left" valign="top">39</td><td align="left" valign="top">171</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>No</td><td align="left" valign="top">65</td><td align="left" valign="top">492</td></tr><tr><td align="left" valign="top" colspan="3">Lobulation, n</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Yes</td><td align="left" valign="top">52</td><td align="left" valign="top">174</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>No</td><td align="left" valign="top">52</td><td align="left" valign="top">489</td></tr><tr><td align="left" valign="top" colspan="3">MLNSA<sup><xref ref-type="table-fn" rid="table1fn7">g</xref></sup>, n</td></tr><tr><td align="char" char="." valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>&#x2265;10 mm</td><td align="left" valign="top">34</td><td align="left" valign="top">80</td></tr><tr><td align="char" char="." valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>&#x003C;10 mm</td><td align="left" valign="top">70</td><td align="left" valign="top">583</td></tr><tr><td align="left" valign="top" colspan="3">HLNSA<sup><xref ref-type="table-fn" rid="table1fn8">h</xref></sup>, n</td></tr><tr><td align="char" char="." valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>&#x2265;10 mm</td><td align="left" valign="top">23</td><td align="left" valign="top">71</td></tr><tr><td align="char" char="." 
valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>&#x003C;10 mm</td><td align="left" valign="top">81</td><td align="left" valign="top">592</td></tr><tr><td align="left" valign="top" colspan="3">Tumor location, n</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RUL<sup><xref ref-type="table-fn" rid="table1fn9">i</xref></sup></td><td align="left" valign="top">27</td><td align="left" valign="top">209</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RML<sup><xref ref-type="table-fn" rid="table1fn10">j</xref></sup></td><td align="left" valign="top">4</td><td align="left" valign="top">54</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RLL<sup><xref ref-type="table-fn" rid="table1fn11">k</xref></sup></td><td align="left" valign="top">18</td><td align="left" valign="top">129</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LUL<sup><xref ref-type="table-fn" rid="table1fn12">l</xref></sup></td><td align="left" valign="top">27</td><td align="left" valign="top">140</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LLL<sup><xref ref-type="table-fn" rid="table1fn13">m</xref></sup></td><td align="left" valign="top">21</td><td align="left" valign="top">100</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Others</td><td align="left" valign="top">7</td><td align="left" valign="top">31</td></tr><tr><td align="left" valign="top" colspan="3">Tumor density, n</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Solid</td><td align="left" valign="top">101</td><td align="left" valign="top">457</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>mGGO<sup><xref ref-type="table-fn" rid="table1fn14">n</xref></sup></td><td align="left" valign="top">3</td><td align="left" valign="top">92</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>GGO<sup><xref ref-type="table-fn" rid="table1fn15">o</xref></sup></td><td align="left" valign="top">0</td><td align="left" valign="top">114</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>CEA: carcinoembryonic antigen.</p></fn><fn id="table1fn2"><p><sup>b</sup>CA19-9: carbohydrate antigen 19-9.</p></fn><fn id="table1fn3"><p><sup>c</sup>CA125: carbohydrate antigen 125.</p></fn><fn id="table1fn4"><p><sup>d</sup>NSE: neuron-specific enolase.</p></fn><fn id="table1fn5"><p><sup>e</sup>Cyfra21-1: cytokeratin 19-fragments.</p></fn><fn id="table1fn6"><p><sup>f</sup>SCCAg: squamous cell carcinoma antigen.</p></fn><fn id="table1fn7"><p><sup>g</sup>MLNSA: mediastinal lymph node short axis.</p></fn><fn id="table1fn8"><p><sup>h</sup>HLNSA: hilar lymph node short axis.</p></fn><fn id="table1fn9"><p><sup>i</sup>RUL: right upper lobe.</p></fn><fn id="table1fn10"><p><sup>j</sup>RML: right middle lobe.</p></fn><fn id="table1fn11"><p><sup>k</sup>RLL: right lower lobe.</p></fn><fn id="table1fn12"><p><sup>l</sup>LUL: left upper lobe.</p></fn><fn id="table1fn13"><p><sup>m</sup>LLL: left lower lobe.</p></fn><fn id="table1fn14"><p><sup>n</sup>mGGO: mixed ground-glass opacity.</p></fn><fn id="table1fn15"><p><sup>o</sup>GGO: ground-glass opacity.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-2"><title>Predictive Performance</title><p><xref ref-type="table" rid="table2">Table 2</xref> presents the predictive 
performance of the baseline ML models and the proposed LLM-based integration framework. Overall, incorporating LLMs with ML predictions consistently improves model performance, particularly in terms of AUC across different base learners.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>The area under the curve (AUC) and average precision (AP) values of the baseline machine learning (ML) models and the proposed models.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Models</td><td align="left" valign="bottom" colspan="2">AUC</td><td align="left" valign="bottom" colspan="2">AP</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Mean (SD)</td><td align="left" valign="top"><italic>P</italic> value</td><td align="left" valign="top">Mean (SD)</td><td align="left" valign="top"><italic>P</italic> value</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="5">LR<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">0.759 (0.038)</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" valign="top">0.387 (0.079)</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top" colspan="5">GPT-5.4 nano+LR</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Max</td><td align="left" valign="top">0.770 (0.041)</td><td align="left" valign="top">.003</td><td align="left" valign="top">0.402 (0.084)</td><td align="left" valign="top">.14</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Min</td><td align="left" valign="top">0.774 (0.048)</td><td align="left" valign="top">.04</td><td align="left" valign="top">0.414 (0.094)</td><td align="left" 
valign="top">.11</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Median</td><td align="left" valign="top">0.768 (0.042)</td><td align="left" valign="top">.02</td><td align="left" valign="top">0.408 (0.099)</td><td align="left" valign="top">.17</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Mean</td><td align="left" valign="top">0.772 (0.044)</td><td align="left" valign="top">.003</td><td align="left" valign="top">0.420 (0.088)</td><td align="left" valign="top">.02</td></tr><tr><td align="left" valign="top" colspan="5">GPT-5.4+LR</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Max</td><td align="left" valign="top">0.775 (0.053)</td><td align="left" valign="top">.08</td><td align="left" valign="top">0.410 (0.092)</td><td align="left" valign="top">.27</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Min</td><td align="left" valign="top">0.777 (0.055)</td><td align="left" valign="top">.07</td><td align="left" valign="top">0.416 (0.094)</td><td align="left" valign="top">.25</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Median</td><td align="left" valign="top">0.778 (0.053)</td><td align="left" valign="top">.05</td><td align="left" valign="top">0.417 (0.094)</td><td align="left" valign="top">.20</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Mean</td><td align="left" valign="top">0.777 (0.054)</td><td align="left" valign="top">.05</td><td align="left" valign="top">0.425 (0.091)</td><td align="left" valign="top">.13</td></tr><tr><td align="left" valign="top" 
colspan="5">Deepseek-v3.2+LR</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Max</td><td align="left" valign="top">0.775 (0.042)</td><td align="left" valign="top">.12</td><td align="left" valign="top">0.407 (0.082)</td><td align="left" valign="top">.08</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Min</td><td align="left" valign="top">0.771 (0.061)</td><td align="left" valign="top">.28</td><td align="left" valign="top">0.415 (0.095)</td><td align="left" valign="top">.32</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Median</td><td align="left" valign="top">0.779 (0.050)</td><td align="left" valign="top">.10</td><td align="left" valign="top">0.416 (0.090)</td><td align="left" valign="top">.13</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Mean</td><td align="left" valign="top">0.776 (0.051)</td><td align="left" valign="top">.13</td><td align="left" valign="top">0.425 (0.096)</td><td align="left" valign="top">.06</td></tr><tr><td align="left" valign="top" colspan="5">RF<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">0.752 (0.057)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.402 (0.113)</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top" colspan="5">GPT-5.4 nano+RF</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Max</td><td align="left" valign="top">0.770 (0.062)</td><td align="left" valign="top">.03</td><td align="left" valign="top">0.395 (0.106)</td><td align="left" valign="top">.69</td></tr><tr><td align="left" 
valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Min</td><td align="left" valign="top">0.773 (0.070)</td><td align="left" valign="top">.02</td><td align="left" valign="top">0.405 (0.112)</td><td align="left" valign="top">.90</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Median</td><td align="left" valign="top">0.782 (0.067)</td><td align="left" valign="top">.002</td><td align="left" valign="top">0.405 (0.113)</td><td align="left" valign="top">.90</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Mean</td><td align="left" valign="top">0.781 (0.064)</td><td align="left" valign="top">.003</td><td align="left" valign="top">0.415 (0.112)</td><td align="left" valign="top">.53</td></tr><tr><td align="left" valign="top" colspan="5">GPT-5.4+RF</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Max</td><td align="left" valign="top">0.770 (0.068)</td><td align="left" valign="top">.18</td><td align="left" valign="top">0.389 (0.093)</td><td align="left" valign="top">.67</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Min</td><td align="left" valign="top">0.768 (0.062)</td><td align="left" valign="top">.27</td><td align="left" valign="top">0.388 (0.091)</td><td align="left" valign="top">.64</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Median</td><td align="left" valign="top">0.773 (0.060)</td><td align="left" valign="top">.12</td><td align="left" valign="top">0.395 (0.094)</td><td align="left" valign="top">.82</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Mean</td><td align="left" valign="top">0.772 (0.063)</td><td align="left" valign="top">.13</td><td align="left" valign="top">0.405 (0.095)</td><td align="left" valign="top">.93</td></tr><tr><td align="left" valign="top" colspan="5">Deepseek-v3.2+RF</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Max</td><td align="left" valign="top">0.764 (0.062)</td><td align="left" valign="top">.32</td><td align="left" valign="top">0.337 (0.091)</td><td align="left" valign="top">.09</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Min</td><td align="left" valign="top">0.756 (0.072)</td><td align="left" valign="top">.81</td><td align="left" valign="top">0.358 (0.079)</td><td align="left" valign="top">.20</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Median</td><td align="left" valign="top">0.757 (0.070)</td><td align="left" valign="top">.70</td><td align="left" valign="top">0.356 (0.101)</td><td align="left" valign="top">.21</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Mean</td><td align="left" valign="top">0.763 (0.066)</td><td align="left" valign="top">.34</td><td align="left" valign="top">0.368 (0.098)</td><td align="left" valign="top">.34</td></tr><tr><td align="left" valign="top" colspan="5">SVM<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">0.749 (0.331)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.379 (0.066)</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top" colspan="5">GPT-5.4 nano+SVM</td></tr><tr><td align="left" 
valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Max</td><td align="left" valign="top">0.674 (0.055)</td><td align="left" valign="top">.17</td><td align="left" valign="top">0.375 (0.094)</td><td align="left" valign="top">.78</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Min</td><td align="left" valign="top">0.764 (0.039)</td><td align="left" valign="top">.01</td><td align="left" valign="top">0.382 (0.074)</td><td align="left" valign="top">.66</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Median</td><td align="left" valign="top">0.770 (0.046)</td><td align="left" valign="top">.02</td><td align="left" valign="top">0.381 (0.073)</td><td align="left" valign="top">.75</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Mean</td><td align="left" valign="top">0.767 (0.047)</td><td align="left" valign="top">.04</td><td align="left" valign="top">0.387 (0.075)</td><td align="left" valign="top">.43</td></tr><tr><td align="left" valign="top" colspan="5">GPT-5.4+SVM</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Max</td><td align="left" valign="top">0.769 (0.045)</td><td align="left" valign="top">.08</td><td align="left" valign="top">0.365 (0.057)</td><td align="left" valign="top">.45</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Min</td><td align="left" valign="top">0.771 (0.044)</td><td align="left" valign="top">.046</td><td align="left" valign="top">0.395 (0.065)</td><td align="left" valign="top">.35</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Median</td><td align="left" valign="top">0.774 (0.047)</td><td align="left" valign="top">.06</td><td align="left" valign="top">0.386 (0.060)</td><td align="left" valign="top">.70</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Mean</td><td align="left" valign="top">0.772 (0.045)</td><td align="left" valign="top">.06</td><td align="left" valign="top">0.389 (0.057)</td><td align="left" valign="top">.60</td></tr><tr><td align="left" valign="top" colspan="5">Deepseek-v3.2+SVM</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Max</td><td align="left" valign="top">0.773 (0.048)</td><td align="left" valign="top">.17</td><td align="left" valign="top">0.358 (0.062)</td><td align="left" valign="top">.40</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Min</td><td align="left" valign="top">0.773 (0.047)</td><td align="left" valign="top">.10</td><td align="left" valign="top">0.382 (0.060)</td><td align="left" valign="top">.88</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Median</td><td align="left" valign="top">0.771 (0.050)</td><td align="left" valign="top">.20</td><td align="left" valign="top">0.364 (0.062)</td><td align="left" valign="top">.51</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Mean</td><td align="left" valign="top">0.774 (0.049)</td><td align="left" valign="top">.15</td><td align="left" valign="top">0.388 (0.069)</td><td align="left" valign="top">.73</td></tr><tr><td align="left" valign="top" colspan="5">Transformer</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">0.739 
(0.056)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.332 (0.070)</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top" colspan="5">GPT-5.4 nano+Transformer</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Max</td><td align="left" valign="top">0.754 (0.047)</td><td align="left" valign="top">.16</td><td align="left" valign="top">0.346 (0.064)</td><td align="left" valign="top">.16</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Min</td><td align="left" valign="top">0.752 (0.051)</td><td align="left" valign="top">.04</td><td align="left" valign="top">0.356 (0.085)</td><td align="left" valign="top">.06</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Median</td><td align="left" valign="top">0.751 (0.046)</td><td align="left" valign="top">.11</td><td align="left" valign="top">0.346 (0.077)</td><td align="left" valign="top">.14</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Mean</td><td align="left" valign="top">0.755 (0.046)</td><td align="left" valign="top">.06</td><td align="left" valign="top">0.357 (0.072)</td><td align="left" valign="top">.01</td></tr><tr><td align="left" valign="top" colspan="5">GPT-5.4+Transformer</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Max</td><td align="left" valign="top">0.760 (0.050)</td><td align="left" valign="top">.09</td><td align="left" valign="top">0.375 (0.073)</td><td align="left" valign="top">.02</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Min</td><td align="left" valign="top">0.767 
(0.045)</td><td align="left" valign="top">.02</td><td align="left" valign="top">0.389 (0.061)</td><td align="left" valign="top">.002</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Median</td><td align="left" valign="top">0.762 (0.046)</td><td align="left" valign="top">.05</td><td align="left" valign="top">0.375 (0.067)</td><td align="left" valign="top">.02</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Mean</td><td align="left" valign="top">0.763 (0.047)</td><td align="left" valign="top">.06</td><td align="left" valign="top">0.378 (0.069)</td><td align="left" valign="top">.01</td></tr><tr><td align="left" valign="top" colspan="5">Deepseek-v3.2+Transformer</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Max</td><td align="left" valign="top">0.754 (0.050)</td><td align="left" valign="top">.48</td><td align="left" valign="top">0.367 (0.082)</td><td align="left" valign="top">.23</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Min</td><td align="left" valign="top">0.765 (0.050)</td><td align="left" valign="top">.14</td><td align="left" valign="top">0.371 (0.071)</td><td align="left" valign="top">.07</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Median</td><td align="left" valign="top">0.758 (0.037)</td><td align="left" valign="top">.23</td><td align="left" valign="top">0.360 (0.064)</td><td align="left" valign="top">.24</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Mean</td><td align="left" valign="top">0.759 (0.044)</td><td align="left" valign="top">.25</td><td align="left" valign="top">0.373 
(0.063)</td><td align="left" valign="top">.09</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>LR: logistic regression.</p></fn><fn id="table2fn2"><p><sup>b</sup>Not applicable.</p></fn><fn id="table2fn3"><p><sup>c</sup>RF: random forest.</p></fn><fn id="table2fn4"><p><sup>d</sup>SVM: support vector machine.</p></fn></table-wrap-foot></table-wrap><p>When leveraging predictions from the LR model, the proposed approach achieved statistically significant improvements in AUC across multiple ensemble strategies. For example, GPT-5.4 nano combined with LR achieved higher AUC values under the max, min, median, and mean strategies (all <italic>P</italic>&#x003C;.05), with the mean ensemble also showing a significant improvement in AP (<italic>P</italic>=.02). Similar trends were observed for GPT-5.4 and Deepseek-v3.2, where consistent improvements in AUC and AP were achieved, although not all reached statistical significance. Using predictions from the RF model, GPT-5.4 nano demonstrated the most notable improvements, with significant gains in AUC under max, min, median, and mean ensemble strategies (all <italic>P</italic>&#x003C;.05), achieving the highest AUC of 0.782. However, improvements in AP were generally limited and not statistically significant. For the SVM model, the proposed framework again improved AUC, particularly for GPT-5.4 nano under the min, median, and mean ensemble strategies (all <italic>P</italic>&#x003C;.05). In contrast, improvements in AP were modest and did not reach statistical significance. When using the transformer model as the base learner, the LLM-based approach also led to consistent improvements in both AUC and AP. Notably, GPT-5.4 achieved statistically significant gains in AP across multiple ensemble strategies (eg, min, median, and mean), and GPT-5.4 nano with the mean ensemble also showed a significant improvement in AP (<italic>P</italic>=.01). 
The AUC and AP values of each iteration of the baseline ML models and proposed models are listed in Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. The sensitivity, specificity, positive predictive value, and negative predictive value of the base ML models and the proposed models are listed in Table S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><p>To further evaluate the effectiveness of the ensemble strategy, we compared the proposed models with the stand-alone LLMs, ML models, and the conventional stacking model. As shown in <xref ref-type="table" rid="table3">Table 3</xref>, stand-alone LLMs exhibited relatively unstable performance, with noticeable variability between the worst and best responses (eg, GPT-5.4 nano AUC: 0.737&#x2010;0.750; AP: 0.296&#x2010;0.321), and overall inferior performance compared to ML models.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>The area under the curve (AUC) and average precision (AP) values of the baseline machine learning (ML) models, stand-alone large language models (LLMs), stacking model, and the proposed models.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Models</td><td align="left" valign="bottom">AUC, mean (SD)</td><td align="left" valign="bottom">AP, mean (SD)</td></tr></thead><tbody><tr><td align="left" valign="top">LR<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td><td align="left" valign="top">0.759 (0.038)</td><td align="left" valign="top">0.387 (0.079)</td></tr><tr><td align="left" valign="top">RF<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td><td align="left" valign="top">0.752 (0.057)</td><td align="left" valign="top">0.402 (0.113)</td></tr><tr><td align="left" valign="top">SVM<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup></td><td align="left" valign="top">0.749 (0.331)</td><td 
align="left" valign="top">0.379 (0.066)</td></tr><tr><td align="left" valign="top">Transformer</td><td align="left" valign="top">0.739 (0.056)</td><td align="left" valign="top">0.332 (0.070)</td></tr><tr><td align="left" valign="top">Stacking (LR+RF+SVM+Transformer)</td><td align="left" valign="top">0.767 (0.052)</td><td align="left" valign="top">0.386 (0.082)</td></tr><tr><td align="left" valign="top" colspan="3">GPT-5.4 nano</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Worst</td><td align="left" valign="top">0.737 (0.065)</td><td align="left" valign="top">0.296 (0.056)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Best</td><td align="left" valign="top">0.750 (0.060)</td><td align="left" valign="top">0.321 (0.060)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Max</td><td align="left" valign="top">0.744 (0.064)</td><td align="left" valign="top">0.325 (0.081)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Min</td><td align="left" valign="top">0.739 (0.060)</td><td align="left" valign="top">0.299 (0.060)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Median</td><td align="left" valign="top">0.744 (0.059)</td><td align="left" valign="top">0.310 (0.071)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Mean</td><td align="left" valign="top">0.749 (0.061)</td><td align="left" valign="top">0.335 (0.078)</td></tr><tr><td align="left" valign="top" colspan="3">GPT-5.4</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Worst</td><td align="left" valign="top">0.749 (0.053)</td><td align="left" valign="top">0.333 (0.065)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Best</td><td align="left" valign="top">0.764 (0.047)</td><td align="left" valign="top">0.350 (0.070)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Max</td><td align="left" valign="top">0.758 (0.053)</td><td align="left" valign="top">0.345 (0.067)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Min</td><td align="left" valign="top">0.756 (0.049)</td><td align="left" valign="top">0.347 (0.067)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Median</td><td align="left" valign="top">0.756 (0.053)</td><td align="left" valign="top">0.346 (0.071)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Mean</td><td align="left" valign="top">0.756 (0.052)</td><td align="left" valign="top">0.349 (0.068)</td></tr><tr><td align="left" valign="top" colspan="3">Deepseek-v3.2</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Worst</td><td align="left" valign="top">0.725 (0.058)</td><td align="left" valign="top">0.293 (0.062)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Best</td><td align="left" valign="top">0.746 (0.060)</td><td align="left" valign="top">0.315 (0.078)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Max</td><td align="left" 
valign="top">0.732 (0.064)</td><td align="left" valign="top">0.291 (0.063)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Min</td><td align="left" valign="top">0.735 (0.061)</td><td align="left" valign="top">0.301 (0.084)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Median</td><td align="left" valign="top">0.742 (0.060)</td><td align="left" valign="top">0.312 (0.080)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Mean</td><td align="left" valign="top">0.747 (0.061)</td><td align="left" valign="top">0.334 (0.092)</td></tr><tr><td align="left" valign="top">GPT-5.4 nano+LR mean</td><td align="left" valign="top">0.772 (0.044)</td><td align="left" valign="top">0.420 (0.088)</td></tr><tr><td align="left" valign="top">GPT-5.4 nano+RF mean</td><td align="left" valign="top">0.781 (0.064)</td><td align="left" valign="top">0.415 (0.112)</td></tr><tr><td align="left" valign="top">GPT-5.4+SVM min</td><td align="left" valign="top">0.771 (0.044)</td><td align="left" valign="top">0.395 (0.065)</td></tr><tr><td align="left" valign="top">GPT-5.4+Transformer min</td><td align="left" valign="top">0.767 (0.045)</td><td align="left" valign="top">0.389 (0.061)</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>LR: logistic regression.</p></fn><fn id="table3fn2"><p><sup>b</sup>RF: random forest.</p></fn><fn id="table3fn3"><p><sup>c</sup>SVM: support vector machine.</p></fn></table-wrap-foot></table-wrap><p>Applying simple aggregation strategies (eg, max, min, median, and mean) slightly improved the stability of LLM predictions, but their performance remained below that of traditional ML baselines. 
In contrast, the conventional stacking approach combining LR, RF, SVM, and transformer achieved moderate improvement (AUC: 0.767) but did not consistently outperform the best individual models in terms of AP.</p><p>Notably, when integrating LLMs with ML predictions, the proposed framework achieved further performance gains. We selected the models with the best AUC and AP values to compare with the baselines. GPT-5.4 nano combined with RF (mean ensemble) achieved the highest AUC among the selected models (0.781) and improved AP (0.415), while GPT-5.4 nano+LR (mean) also showed substantial gains (AUC: 0.772, AP: 0.420). Similar improvements were observed for GPT-5.4+SVM and transformer-based combinations. Additionally, we presented the calibration curves and decision curve analysis of the selected models in Figure S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><p>These results indicate that, while stand-alone LLM predictions are unstable and conventional stacking provides limited gains, the proposed LLM-based integration framework can more effectively leverage complementary information from both data-driven models and LLM reasoning, resulting in more robust and improved predictive performance. We also provide the sensitivity, specificity, positive predictive value, and negative predictive value of these models in Table S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><p>To further investigate the impact of reasoning and language settings, we compared GPT-5.4 nano under 3 configurations: nonreasoning (English), reasoning (English), and nonreasoning (Chinese) across different base models and ensemble strategies. Since the original clinical data were in Chinese, the use of English prompts required translation, which may have introduced potential errors. 
To rigorously assess this risk, we conducted a human evaluation in which a clinician reviewed 100 translated prompts using a 5-point Likert scale (1=&#x201C;incorrect or unusable&#x201D; to 5=&#x201C;fully accurate and clinically appropriate&#x201D;). The results showed that 67% (67/100) of the samples were rated as 5, 30% (30/100) as 4, and only 3% (3/100) as 3, with no samples rated below 3, yielding an average score of 4.64. This indicates that the translated prompts generally preserved the original clinical meaning with high fidelity. The experimental results are summarized in <xref ref-type="table" rid="table4">Table 4</xref>.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>The area under the curve (AUC) and average precision (AP) values of the proposed models with different reasoning and language configurations.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">GPT-5.4 nano</td><td align="left" valign="bottom" colspan="4">Nonreasoning (English), mean (SD)</td><td align="left" valign="bottom" colspan="4">Reasoning (English), mean (SD)</td><td align="left" valign="bottom" colspan="4">Nonreasoning (Chinese), mean (SD)</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top" colspan="2">AUC</td><td align="left" valign="top" colspan="2">AP</td><td align="left" valign="top" colspan="2">AUC</td><td align="left" valign="top" colspan="2">AP</td><td align="left" valign="top" colspan="2">AUC</td><td align="left" valign="top" colspan="2">AP</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="13">LLM<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup>+LR<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Max</td><td align="left" valign="top" colspan="2">0.770 (0.041)</td><td align="left" valign="top" 
colspan="2">0.402 (0.084)</td><td align="left" valign="top" colspan="2">0.771 (0.042)</td><td align="left" valign="top" colspan="2">0.413 (0.094)</td><td align="left" valign="top" colspan="2">0.762 (0.051)</td><td align="left" valign="top" colspan="2">0.410 (0.075)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Min</td><td align="left" valign="top" colspan="2">0.774 (0.048)</td><td align="left" valign="top" colspan="2">0.414 (0.094)</td><td align="left" valign="top" colspan="2">0.774 (0.045)</td><td align="left" valign="top" colspan="2">0.418 (0.104)</td><td align="left" valign="top" colspan="2">0.779 (0.053)</td><td align="left" valign="top" colspan="2">0.431 (0.100)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Median</td><td align="left" valign="top" colspan="2">0.768 (0.042)</td><td align="left" valign="top" colspan="2">0.408 (0.099)</td><td align="left" valign="top" colspan="2">0.777 (0.042)</td><td align="left" valign="top" colspan="2">0.433 (0.088)</td><td align="left" valign="top" colspan="2">0.769 (0.053)</td><td align="left" valign="top" colspan="2">0.418 (0.089)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Mean</td><td align="left" valign="top" colspan="2">0.772 (0.044)</td><td align="left" valign="top" colspan="2">0.420 (0.088)</td><td align="left" valign="top" colspan="2">0.773 (0.040)</td><td align="left" valign="top" colspan="2">0.430 (0.088)</td><td align="left" valign="top" colspan="2">0.772 (0.054)</td><td align="left" valign="top" colspan="2">0.428 (0.091)</td></tr><tr><td align="left" valign="top" colspan="13">LLM+RF<sup><xref ref-type="table-fn" rid="table4fn3">c</xref></sup></td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Max</td><td align="left" valign="top" colspan="2">0.770 (0.062)</td><td align="left" valign="top" colspan="2">0.395 (0.106)</td><td align="left" valign="top" colspan="2">0.778 (0.051)</td><td align="left" valign="top" colspan="2">0.386 (0.099)</td><td align="left" valign="top" colspan="2">0.772 (0.066)</td><td align="left" valign="top" colspan="2">0.410 (0.105)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Min</td><td align="left" valign="top" colspan="2">0.773 (0.070)</td><td align="left" valign="top" colspan="2">0.405 (0.112)</td><td align="left" valign="top" colspan="2">0.775 (0.070)</td><td align="left" valign="top" colspan="2">0.399 (0.094)</td><td align="left" valign="top" colspan="2">0.770 (0.063)</td><td align="left" valign="top" colspan="2">0.422 (0.109)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Median</td><td align="left" valign="top" colspan="2">0.782 (0.067)</td><td align="left" valign="top" colspan="2">0.405 (0.113)</td><td align="left" valign="top" colspan="2">0.774 (0.057)</td><td align="left" valign="top" colspan="2">0.395 (0.083)</td><td align="left" valign="top" colspan="2">0.769 (0.066)</td><td align="left" valign="top" colspan="2">0.411 (0.109)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Mean</td><td align="left" valign="top" colspan="2">0.781 (0.064)</td><td align="left" valign="top" colspan="2">0.415 (0.112)</td><td align="left" valign="top" colspan="2">0.776 (0.056)</td><td align="left" valign="top" colspan="2">0.400 (0.082)</td><td align="left" valign="top" colspan="2">0.773 (0.069)</td><td align="left" valign="top" colspan="2">0.426 (0.112)</td></tr><tr><td align="left" valign="top" colspan="13">LLM+SVM<sup><xref 
ref-type="table-fn" rid="table4fn4">d</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Max</td><td align="left" valign="top" colspan="2">0.674 (0.055)</td><td align="left" valign="top" colspan="2">0.375 (0.094)</td><td align="left" valign="top" colspan="2">0.769 (0.051)</td><td align="left" valign="top" colspan="2">0.382 (0.078)</td><td align="left" valign="top" colspan="2">0.756 (0.064)</td><td align="left" valign="top" colspan="2">0.364 (0.101)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Min</td><td align="left" valign="top" colspan="2">0.764 (0.039)</td><td align="left" valign="top" colspan="2">0.382 (0.074)</td><td align="left" valign="top" colspan="2">0.777 (0.043)</td><td align="left" valign="top" colspan="2">0.413 (0.091)</td><td align="left" valign="top" colspan="2">0.752 (0.046)</td><td align="left" valign="top" colspan="2">0.370 (0.071)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Median</td><td align="left" valign="top" colspan="2">0.770 (0.046)</td><td align="left" valign="top" colspan="2">0.381 (0.073)</td><td align="left" valign="top" colspan="2">0.770 (0.046)</td><td align="left" valign="top" colspan="2">0.387 (0.092)</td><td align="left" valign="top" colspan="2">0.766 (0.070)</td><td align="left" valign="top" colspan="2">0.397 (0.103)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Mean</td><td align="left" valign="top" colspan="2">0.767 (0.047)</td><td align="left" valign="top" colspan="2">0.387 (0.075)</td><td align="left" valign="top" colspan="2">0.773 (0.047)</td><td align="left" valign="top" colspan="2">0.400 (0.095)</td><td align="left" valign="top" colspan="2">0.765 (0.065)</td><td align="left" valign="top" 
colspan="2">0.394 (0.097)</td></tr><tr><td align="left" valign="top" colspan="13">LLM+Transformer</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Max</td><td align="left" valign="top" colspan="2">0.754 (0.047)</td><td align="left" valign="top" colspan="2">0.346 (0.064)</td><td align="left" valign="top" colspan="2">0.752 (0.056)</td><td align="left" valign="top" colspan="2">0.357 (0.072)</td><td align="left" valign="top" colspan="2">0.752 (0.053)</td><td align="left" valign="top" colspan="2">0.354 (0.074)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Min</td><td align="left" valign="top" colspan="2">0.752 (0.051)</td><td align="left" valign="top" colspan="2">0.356 (0.085)</td><td align="left" valign="top" colspan="2">0.762 (0.046)</td><td align="left" valign="top" colspan="2">0.381 (0.077)</td><td align="left" valign="top" colspan="2">0.756 (0.051)</td><td align="left" valign="top" colspan="2">0.369 (0.083)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Median</td><td align="left" valign="top" colspan="2">0.751 (0.046)</td><td align="left" valign="top" colspan="2">0.346 (0.077)</td><td align="left" valign="top" colspan="2">0.759 (0.044)</td><td align="left" valign="top" colspan="2">0.367 (0.065)</td><td align="left" valign="top" colspan="2">0.755 (0.052)</td><td align="left" valign="top" colspan="2">0.366 (0.075)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Mean</td><td align="left" valign="top" colspan="2">0.755 (0.046)</td><td align="left" valign="top" colspan="2">0.357 (0.072)</td><td align="left" valign="top" colspan="2">0.758 (0.048)</td><td align="left" valign="top" colspan="2">0.371 (0.070)</td><td align="left" valign="top" 
colspan="2">0.756 (0.050)</td><td align="left" valign="top" colspan="2">0.364 (0.076)</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>LLM: large language model.</p></fn><fn id="table4fn2"><p><sup>b</sup>LR: logistic regression.</p></fn><fn id="table4fn3"><p><sup>c</sup>RF: random forest.</p></fn><fn id="table4fn4"><p><sup>d</sup>SVM: support vector machine.</p></fn></table-wrap-foot></table-wrap><p>To provide a clearer comparison, we identified the best-performing models under each setting based on both AUC and AP. Specifically, the optimal model in the nonreasoning (English) setting was LLM+RF with mean ensemble (AUC=0.781; AP=0.415), in the reasoning (English) setting was LLM+LR with median ensemble (AUC=0.777; AP=0.433), and in the nonreasoning (Chinese) setting was LLM+LR with min ensemble (AUC=0.779; AP=0.431). Overall, these best results are highly comparable across the 3 settings. The highest AUC was achieved by the nonreasoning English configuration (0.781), while the highest AP was observed in the reasoning setting (0.433), with the Chinese setting yielding a very similar AP (0.431).</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Results</title><p>In this study, we propose a knowledge-augmented prediction framework that integrates ML model outputs with LLM-derived clinical knowledge for preoperative prediction of N2 LNM in patients with lung cancer. Consistent with our study objective, the results demonstrate that incorporating LLM-based refinement into data-driven predictions leads to consistent improvements in predictive performance across multiple base models and ensemble strategies.
Specifically, the best-performing configurations of the proposed framework achieved an AUC of 0.781 (GPT-5.4 nano+RF, mean ensemble) and an AP of 0.420 (GPT-5.4 nano+LR, mean ensemble), outperforming stand-alone ML models as well as a conventional stacking approach.</p></sec><sec id="s4-2"><title>LLMs as Knowledge-Informed Calibrators</title><p>In contrast to stand-alone LLM predictions, which were relatively unstable and generally inferior to ML models, the integrated framework consistently improved performance. These findings suggest that the primary value of LLMs in this setting lies not in independent prediction, but in post hoc refinement of model outputs through the incorporation of clinical context. Unlike zero-shot or few-shot prediction paradigms used in prior studies, our framework positions LLMs as knowledge-informed calibrators, refining ML predictions based on their own evaluation of patient-specific information.</p><p>The cases presented in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> further support this interpretation. We observed that the LLM can adjust predictions in both directions depending on the clinical context&#x2014;for example, down-weighting overestimated risks when key radiological signs are absent and up-weighting underestimated risks when clinically significant features are present. From a methodological perspective, this behavior can be interpreted as introducing clinical prior knowledge into the prediction process, complementing the cohort-specific statistical patterns learned by ML models.</p><p>To further assess whether the LLM-generated reasoning is clinically meaningful rather than spurious, we conducted a clinician-based evaluation of the step-by-step reasoning traces. Specifically, a clinician reviewed 100 cases and rated the reasoning quality using a 5-point Likert scale, considering logical coherence, factual correctness, and potential hallucinations.
The results showed that 91% (91/100) of the reasoning traces were rated as 5, and the remaining 9% (9/100) as 4, with no cases rated as moderate or poor quality. These findings suggest that the reasoning processes elicited by the &#x201C;step-by-step&#x201D; prompting strategy are generally clinically coherent and medically sound, rather than arbitrary explanations fitted to the final prediction. This supports the interpretation that the LLM contributes meaningful clinical context when refining model outputs, although it does not fully eliminate the possibility of subtle reasoning errors.</p></sec><sec id="s4-3"><title>Effect of Ensemble Strategies, Reasoning Modes, and Language</title><p>In this study, we explored multiple ensemble strategies to identify a robust aggregation approach. Overall, the results demonstrate that the proposed framework can effectively enhance predictive performance across different base models. While statistically significant improvements were more consistently observed with GPT-5.4 nano, both GPT-5.4 and Deepseek-v3.2 achieved comparable AUC and AP values but did not consistently reach statistical significance, likely due to greater variability across cross-validation folds. Importantly, across models and settings, the mean ensemble consistently performed among the best or near-best strategies. This suggests that mean aggregation represents a practical and robust candidate for a universal ensemble strategy, as it provides a favorable balance between performance, stability, and simplicity.</p><p>We also investigated the effects of reasoning mode and input language on predictive performance. The results indicate that enabling reasoning mode and using Chinese prompts tend to slightly improve AP, potentially by better capturing positive cases. From a practical perspective, the reasoning mode introduces additional token consumption and computational cost, which should be carefully considered in real-world deployment.
Therefore, selecting between reasoning and nonreasoning configurations involves a trade-off between predictive performance (especially AP) and computational efficiency. Meanwhile, the comparable performance between English and Chinese prompts indicates that the model is robust to language variations, offering flexibility for practical clinical applications.</p></sec><sec id="s4-4"><title>Limitations</title><p>This study has several limitations that should be acknowledged.</p><p>First, this is a single-center retrospective study for 1 clinical task based on 767 patients from one institution, without external validation. Although we used nested cross-validation to improve internal robustness, the generalizability of the findings remains uncertain. Future work should include multicenter and prospective validation and expand to other clinical tasks. Second, given the multiple comparisons conducted between the baseline and the proposed models, the reported <italic>P</italic> values should be interpreted with caution, and emphasis should be placed on consistent performance trends across models. Third, although clinician evaluation suggests that the generated reasoning and English translation are generally of high quality, this assessment was conducted on a limited sample and may not fully capture all potential failure modes. These evaluations, while providing preliminary evidence for feasibility and acceptability, should be interpreted with caution. Further large-scale and multiexpert evaluation would be needed to more rigorously assess the reliability of LLM-generated clinical reasoning and translation. Fourth, this study did not incorporate image data to create a multimodal prediction task. Some studies have explored the use of LLMs like GPT-4 to diagnose diseases using image data; however, they did not show competitive performance in interpreting real-world medical images [<xref ref-type="bibr" rid="ref32">32</xref>-<xref ref-type="bibr" rid="ref35">35</xref>]. 
Future research should investigate how to integrate image data to further improve the predictive performance of LLMs. Finally, fine-tuning LLMs may be a possible way to further improve their predictive ability for clinical risk prediction. However, designing the ground truth label for fine-tuning is challenging when predicting the probability of a clinical problem, as the real label is binary. We will try to explore this question in the future.</p></sec><sec id="s4-5"><title>Conclusions</title><p>In this study, we propose a knowledge-augmented framework that integrates LLM-derived clinical knowledge with data-driven model predictions for LNM risk estimation. The results suggest that LLMs can act as knowledge-informed calibrators, combining statistical patterns with clinically relevant prior knowledge to improve prediction performance. These findings suggest that LLMs can excel in clinical risk prediction tasks, offering a new paradigm for integrating medical knowledge and patient data in clinical predictions.</p></sec></sec></body><back><ack><p>The authors disclose that a large language model was used only for preliminary English-language editing of the manuscript. 
The authors carefully reviewed and verified the entire manuscript and remain fully responsible for the accuracy, originality, and integrity of all content in the manuscript, including all references and citations.</p></ack><notes><sec><title>Funding</title><p>This work was supported by the National Natural Science Foundation of China (82402447), the Beijing Natural Science Foundation (L222020), the Startup Foundation for Introducing Talent of NUIST (2025r027), the National Key R&#x0026;D Program of China (No.2022YFC2406804), the Capital&#x2019;s funds for health improvement and research (No.2024-1-1023), and the National Ten-thousand Talent Program.</p></sec></notes><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AP</term><def><p>average precision</p></def></def-item><def-item><term id="abb2">API</term><def><p>application programming interface</p></def></def-item><def-item><term id="abb3">AUC</term><def><p>area under the curve</p></def></def-item><def-item><term id="abb4">CA125</term><def><p>carbohydrate antigen 125</p></def></def-item><def-item><term id="abb5">CA19-9</term><def><p>carbohydrate antigen 19-9</p></def></def-item><def-item><term id="abb6">CEA</term><def><p>carcinoembryonic antigen</p></def></def-item><def-item><term id="abb7">CT</term><def><p>computed tomography</p></def></def-item><def-item><term id="abb8">Cyfra211</term><def><p>cytokeratin 19-fragments</p></def></def-item><def-item><term id="abb9">LNM</term><def><p>lymph node metastasis</p></def></def-item><def-item><term id="abb10">LR</term><def><p>logistic regression</p></def></def-item><def-item><term id="abb11">ML</term><def><p>machine learning</p></def></def-item><def-item><term id="abb12">NSE</term><def><p>neuron-specific enolase</p></def></def-item><def-item><term id="abb13">RF</term><def><p>random forest</p></def></def-item><def-item><term id="abb14">SCCAg</term><def><p>squamous cell carcinoma
antigen</p></def></def-item><def-item><term id="abb15">SVM</term><def><p>support vector machine</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sung</surname><given-names>H</given-names> </name><name name-style="western"><surname>Ferlay</surname><given-names>J</given-names> </name><name name-style="western"><surname>Siegel</surname><given-names>RL</given-names> </name><etal/></person-group><article-title>Global Cancer Statistics 2020: GLOBOCAN estimates of incidence and mortality worldwide for 36 cancers in 185 countries</article-title><source>CA Cancer J Clin</source><year>2021</year><month>05</month><volume>71</volume><issue>3</issue><fpage>209</fpage><lpage>249</lpage><pub-id pub-id-type="doi">10.3322/caac.21660</pub-id><pub-id pub-id-type="medline">33538338</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Howington</surname><given-names>JA</given-names> </name><name name-style="western"><surname>Blum</surname><given-names>MG</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>AC</given-names> </name><name name-style="western"><surname>Balekian</surname><given-names>AA</given-names> </name><name name-style="western"><surname>Murthy</surname><given-names>SC</given-names> </name></person-group><article-title>Treatment of stage I and II non-small cell lung cancer: diagnosis and management of lung cancer, 3rd ed: American College of Chest Physicians evidence-based clinical practice guidelines</article-title><source>Chest</source><year>2013</year><month>05</month><volume>143</volume><issue>5 Suppl</issue><fpage>e278S</fpage><lpage>e313S</lpage><pub-id pub-id-type="doi">10.1378/chest.12-2359</pub-id><pub-id 
pub-id-type="medline">23649443</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Navani</surname><given-names>N</given-names> </name><name name-style="western"><surname>Fisher</surname><given-names>DJ</given-names> </name><name name-style="western"><surname>Tierney</surname><given-names>JF</given-names> </name><name name-style="western"><surname>Stephens</surname><given-names>RJ</given-names> </name><name name-style="western"><surname>Burdett</surname><given-names>S</given-names> </name><collab>NSCLC Meta-analysis Collaborative Group</collab></person-group><article-title>The accuracy of clinical staging of stage I-IIIa non-small cell lung cancer: an analysis based on individual participant data</article-title><source>Chest</source><year>2019</year><month>03</month><volume>155</volume><issue>3</issue><fpage>502</fpage><lpage>509</lpage><pub-id pub-id-type="doi">10.1016/j.chest.2018.10.020</pub-id><pub-id pub-id-type="medline">30391190</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Farjah</surname><given-names>F</given-names> </name><name name-style="western"><surname>Lou</surname><given-names>F</given-names> </name><name name-style="western"><surname>Sima</surname><given-names>C</given-names> </name><name name-style="western"><surname>Rusch</surname><given-names>VW</given-names> </name><name name-style="western"><surname>Rizk</surname><given-names>NP</given-names> </name></person-group><article-title>A prediction model for pathologic N2 disease in lung cancer patients with a negative mediastinum by positron emission tomography</article-title><source>J Thorac Oncol</source><year>2013</year><month>09</month><volume>8</volume><issue>9</issue><fpage>1170</fpage><lpage>1180</lpage><pub-id 
pub-id-type="doi">10.1097/JTO.0b013e3182992421</pub-id><pub-id pub-id-type="medline">23945387</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>K</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>F</given-names> </name><name name-style="western"><surname>Jiang</surname><given-names>G</given-names> </name><name name-style="western"><surname>Li</surname><given-names>J</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>J</given-names> </name></person-group><article-title>Development and validation of a clinical prediction model for N2 lymph node metastasis in non-small cell lung cancer</article-title><source>Ann Thorac Surg</source><year>2013</year><month>11</month><volume>96</volume><issue>5</issue><fpage>1761</fpage><lpage>1768</lpage><pub-id pub-id-type="doi">10.1016/j.athoracsur.2013.06.038</pub-id><pub-id pub-id-type="medline">23998401</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>She</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Xie</surname><given-names>D</given-names> </name><etal/></person-group><article-title>A texture analysis-based prediction model for lymph node metastasis in stage Ia lung adenocarcinoma</article-title><source>Ann Thorac Surg</source><year>2018</year><month>07</month><volume>106</volume><issue>1</issue><fpage>214</fpage><lpage>220</lpage><pub-id pub-id-type="doi">10.1016/j.athoracsur.2018.02.026</pub-id><pub-id pub-id-type="medline">29550204</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>He</surname><given-names>L</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Yan</surname><given-names>L</given-names> </name><name name-style="western"><surname>Zheng</surname><given-names>J</given-names> </name><name name-style="western"><surname>Liang</surname><given-names>C</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>Z</given-names> </name></person-group><article-title>Radiomics-based predictive risk score: a scoring system for preoperatively predicting risk of lymph node metastasis in patients with resectable non-small cell lung cancer</article-title><source>Chin J Cancer Res</source><year>2019</year><month>08</month><volume>31</volume><issue>4</issue><fpage>641</fpage><lpage>652</lpage><pub-id pub-id-type="doi">10.21147/j.issn.1000-9604.2019.04.08</pub-id><pub-id pub-id-type="medline">31564807</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Zhao</surname><given-names>X</given-names> </name><name name-style="western"><surname>Li</surname><given-names>Q</given-names> </name><etal/></person-group><article-title>Can peritumoral radiomics increase the efficiency of the prediction for lymph node metastasis in clinical stage T1 lung adenocarcinoma on CT?</article-title><source>Eur Radiol</source><year>2019</year><month>11</month><volume>29</volume><issue>11</issue><fpage>6049</fpage><lpage>6058</lpage><pub-id pub-id-type="doi">10.1007/s00330-019-06084-0</pub-id><pub-id pub-id-type="medline">30887209</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Wang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Nan</surname><given-names>W</given-names> </name><name name-style="western"><surname>Yan</surname><given-names>S</given-names> </name><name name-style="western"><surname>Li</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Guo</surname><given-names>N</given-names> </name><name name-style="western"><surname>Guo</surname><given-names>Z</given-names> </name></person-group><article-title>MA05.11 radiomics analysis using SVM predicts mediastinal lymph nodes status of Squamous Cell Lung Cancer by pre-treatment chest CT scan</article-title><source>J Thorac Oncol</source><year>2018</year><month>10</month><volume>13</volume><issue>10</issue><fpage>S374</fpage><pub-id pub-id-type="doi">10.1016/j.jtho.2018.08.357</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cong</surname><given-names>M</given-names> </name><name name-style="western"><surname>Feng</surname><given-names>H</given-names> </name><name name-style="western"><surname>Ren</surname><given-names>JL</given-names> </name><etal/></person-group><article-title>Development of a predictive radiomics model for lymph node metastases in pre-surgical CT-based stage IA non-small cell lung cancer</article-title><source>Lung Cancer</source><year>2020</year><month>01</month><volume>139</volume><fpage>73</fpage><lpage>79</lpage><pub-id pub-id-type="doi">10.1016/j.lungcan.2019.11.003</pub-id><pub-id pub-id-type="medline">31743889</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yoo</surname><given-names>J</given-names> </name><name name-style="western"><surname>Cheon</surname><given-names>M</given-names> </name><name 
name-style="western"><surname>Park</surname><given-names>YJ</given-names> </name><etal/></person-group><article-title>Machine learning-based diagnostic method of pre-therapeutic <sup>18</sup>F-FDG PET/CT for evaluating mediastinal lymph nodes in non-small cell lung cancer</article-title><source>Eur Radiol</source><year>2021</year><month>06</month><volume>31</volume><issue>6</issue><fpage>4184</fpage><lpage>4194</lpage><pub-id pub-id-type="doi">10.1007/s00330-020-07523-z</pub-id><pub-id pub-id-type="medline">33241521</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hu</surname><given-names>D</given-names> </name><name name-style="western"><surname>Li</surname><given-names>S</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>N</given-names> </name><name name-style="western"><surname>Lu</surname><given-names>X</given-names> </name></person-group><article-title>Using natural language processing and machine learning to preoperatively predict lymph node metastasis for non-small cell lung cancer with electronic medical records: development and validation study</article-title><source>JMIR Med Inform</source><year>2022</year><month>04</month><day>25</day><volume>10</volume><issue>4</issue><fpage>e35475</fpage><pub-id pub-id-type="doi">10.2196/35475</pub-id><pub-id pub-id-type="medline">35468085</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhao</surname><given-names>X</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Xia</surname><given-names>W</given-names> </name><etal/></person-group><article-title>A 
cross-modal 3D deep learning for accurate lymph node metastasis prediction in clinical stage T1 lung adenocarcinoma</article-title><source>Lung Cancer</source><year>2020</year><month>07</month><volume>145</volume><fpage>10</fpage><lpage>17</lpage><pub-id pub-id-type="doi">10.1016/j.lungcan.2020.04.014</pub-id><pub-id pub-id-type="medline">32387813</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Li</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Comparison of machine learning methods for classifying mediastinal lymph node metastasis of non-small cell lung cancer from <sup>18</sup>F-FDG PET/CT images</article-title><source>EJNMMI Res</source><year>2017</year><month>12</month><volume>7</volume><issue>1</issue><fpage>11</fpage><pub-id pub-id-type="doi">10.1186/s13550-017-0260-9</pub-id><pub-id pub-id-type="medline">28130689</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>YW</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>CJ</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>HC</given-names> </name><etal/></person-group><article-title>Dual energy CT image prediction on primary tumor of lung cancer for nodal metastasis using deep learning</article-title><source>Comput Med Imaging Graph</source><year>2021</year><month>07</month><volume>91</volume><fpage>101935</fpage><pub-id pub-id-type="doi">10.1016/j.compmedimag.2021.101935</pub-id><pub-id pub-id-type="medline">34090261</pub-id></nlm-citation></ref><ref 
id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hu</surname><given-names>D</given-names> </name><name name-style="western"><surname>Li</surname><given-names>S</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>N</given-names> </name><name name-style="western"><surname>Lu</surname><given-names>X</given-names> </name></person-group><article-title>A multi-modal heterogeneous graph forest to predict lymph node metastasis of non-small cell lung cancer</article-title><source>IEEE J Biomed Health Inform</source><year>2023</year><month>03</month><volume>27</volume><issue>3</issue><fpage>1216</fpage><lpage>1224</lpage><pub-id pub-id-type="doi">10.1109/JBHI.2022.3233387</pub-id><pub-id pub-id-type="medline">37018304</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hu</surname><given-names>D</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>B</given-names> </name><name name-style="western"><surname>Cheng</surname><given-names>L</given-names> </name><etal/></person-group><article-title>A deep multi-task network to learn tumor pathological representations for lymph node metastasis prediction</article-title><source>Stud Health Technol Inform</source><year>2024</year><month>01</month><day>25</day><volume>310</volume><fpage>906</fpage><lpage>910</lpage><pub-id pub-id-type="doi">10.3233/SHTI231096</pub-id><pub-id pub-id-type="medline">38269940</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="web"><article-title>Introducing ChatGPT</article-title><source>OpenAI</source><access-date>2026-06-09</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://openai.com/blog/chatgpt">https://openai.com/blog/chatgpt</ext-link></comment></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Achiam</surname><given-names>J</given-names> </name><name name-style="western"><surname>Adler</surname><given-names>S</given-names> </name><name name-style="western"><surname>Agarwal</surname><given-names>S</given-names> </name><etal/></person-group><article-title>GPT-4 technical report</article-title><source>arXiv</source><comment>Preprint posted online on Mar 15, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2303.08774</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Brown</surname><given-names>TB</given-names> </name><name name-style="western"><surname>Mann</surname><given-names>B</given-names> </name><name name-style="western"><surname>Ryder</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Language models are few-shot learners</article-title><access-date>2026-06-09</access-date><conf-name>NIPS&#x2019;20: the 34th International Conference on Neural Information Processing Systems</conf-name><conf-date>Dec 6-12, 2020</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://dl.acm.org/doi/abs/10.5555/3495724.3495883">https://dl.acm.org/doi/abs/10.5555/3495724.3495883</ext-link></comment></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Ouyang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Jiang</surname><given-names>X</given-names> </name><etal/></person-group><article-title>Training 
language models to follow instructions with human feedback</article-title><access-date>2026-06-09</access-date><conf-name>36th Conference on Neural Information Processing Systems (NeurIPS 2022)</conf-name><conf-date>Nov 28 to Dec 9, 2022</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://proceedings.neurips.cc/paper_files/paper/2022/file/b1efde53be364a73914f58805a001731-Paper-Conference.pdf">https://proceedings.neurips.cc/paper_files/paper/2022/file/b1efde53be364a73914f58805a001731-Paper-Conference.pdf</ext-link></comment></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Idnay</surname><given-names>B</given-names> </name><etal/></person-group><article-title>Evaluating large language models on medical evidence summarization</article-title><source>NPJ Digit Med</source><year>2023</year><month>08</month><day>24</day><volume>6</volume><issue>1</issue><fpage>158</fpage><pub-id pub-id-type="doi">10.1038/s41746-023-00896-7</pub-id><pub-id pub-id-type="medline">37620423</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Du</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Improving large language models for clinical named entity recognition via prompt engineering</article-title><source>J Am Med Inform Assoc</source><year>2024</year><month>09</month><day>1</day><volume>31</volume><issue>9</issue><fpage>1812</fpage><lpage>1820</lpage><pub-id 
pub-id-type="doi">10.1093/jamia/ocad259</pub-id><pub-id pub-id-type="medline">38281112</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Doshi</surname><given-names>R</given-names> </name><name name-style="western"><surname>Amin</surname><given-names>KS</given-names> </name><name name-style="western"><surname>Khosla</surname><given-names>P</given-names> </name><name name-style="western"><surname>Bajaj</surname><given-names>SS</given-names> </name><name name-style="western"><surname>Chheang</surname><given-names>S</given-names> </name><name name-style="western"><surname>Forman</surname><given-names>HP</given-names> </name></person-group><article-title>Quantitative evaluation of large language models to streamline radiology report impressions: a multimodal retrospective analysis</article-title><source>Radiology</source><year>2024</year><month>03</month><volume>310</volume><issue>3</issue><fpage>e231593</fpage><pub-id pub-id-type="doi">10.1148/radiol.231593</pub-id><pub-id pub-id-type="medline">38530171</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Hu</surname><given-names>D</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>S</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Zhu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>B</given-names> </name></person-group><article-title>The current status of large language models in summarizing radiology report impressions</article-title><source>arXiv</source><comment>Preprint posted online on Jun 4, 2024</comment><pub-id 
pub-id-type="doi">10.2196/preprints.65547</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hu</surname><given-names>D</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>B</given-names> </name><name name-style="western"><surname>Zhu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Lu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>N</given-names> </name></person-group><article-title>Zero-shot information extraction from radiological reports using ChatGPT</article-title><source>Int J Med Inform</source><year>2024</year><month>03</month><volume>183</volume><fpage>105321</fpage><pub-id pub-id-type="doi">10.1016/j.ijmedinf.2023.105321</pub-id><pub-id pub-id-type="medline">38157785</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chung</surname><given-names>P</given-names> </name><name name-style="western"><surname>Fong</surname><given-names>CT</given-names> </name><name name-style="western"><surname>Walters</surname><given-names>AM</given-names> </name><name name-style="western"><surname>Aghaeepour</surname><given-names>N</given-names> </name><name name-style="western"><surname>Yetisgen</surname><given-names>M</given-names> </name><name name-style="western"><surname>O&#x2019;Reilly-Shah</surname><given-names>VN</given-names> </name></person-group><article-title>Large language model capabilities in perioperative risk prediction and prognostication</article-title><source>JAMA Surg</source><year>2024</year><month>08</month><day>1</day><volume>159</volume><issue>8</issue><fpage>928</fpage><lpage>937</lpage><pub-id pub-id-type="doi">10.1001/jamasurg.2024.1621</pub-id><pub-id 
pub-id-type="medline">38837145</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Glicksberg</surname><given-names>BS</given-names> </name><name name-style="western"><surname>Timsina</surname><given-names>P</given-names> </name><name name-style="western"><surname>Patel</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Evaluating the accuracy of a state-of-the-art large language model for prediction of admissions from the emergency room</article-title><source>J Am Med Inform Assoc</source><year>2024</year><month>09</month><day>1</day><volume>31</volume><issue>9</issue><fpage>1921</fpage><lpage>1928</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocae103</pub-id><pub-id pub-id-type="medline">38771093</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Han</surname><given-names>C</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>DW</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Evaluation of GPT-4 for 10-year cardiovascular risk prediction: insights from the UK Biobank and KoGES data</article-title><source>iScience</source><year>2024</year><month>02</month><day>16</day><volume>27</volume><issue>2</issue><fpage>109022</fpage><pub-id pub-id-type="doi">10.1016/j.isci.2024.109022</pub-id><pub-id pub-id-type="medline">38357664</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Zhu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Z</given-names> </name><name 
name-style="western"><surname>Gao</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Prompting large language models for zero-shot clinical prediction with structured longitudinal electronic health record data</article-title><source>arXiv</source><comment>Preprint posted online on Jan 25, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2402.01713</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hu</surname><given-names>D</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Li</surname><given-names>S</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>N</given-names> </name><name name-style="western"><surname>Lu</surname><given-names>X</given-names> </name></person-group><article-title>Automatic extraction of lung cancer staging information from computed tomography reports: deep learning approach</article-title><source>JMIR Med Inform</source><year>2021</year><month>07</month><day>21</day><volume>9</volume><issue>7</issue><fpage>e27955</fpage><pub-id pub-id-type="doi">10.2196/27955</pub-id><pub-id pub-id-type="medline">34287213</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Yan</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>K</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>R</given-names> </name><name name-style="western"><surname>He</surname><given-names>L</given-names> </name><name name-style="western"><surname>Li</surname><given-names>X</given-names> </name><name 
name-style="western"><surname>Sun</surname><given-names>L</given-names> </name></person-group><article-title>Multimodal ChatGPT for medical applications: an experimental study of GPT-4V</article-title><source>arXiv</source><comment>Preprint posted online on Oct 29, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2310.19061</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nakao</surname><given-names>T</given-names> </name><name name-style="western"><surname>Miki</surname><given-names>S</given-names> </name><name name-style="western"><surname>Nakamura</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Capability of GPT-4V(ision) in the Japanese National Medical Licensing Examination: evaluation study</article-title><source>JMIR Med Educ</source><year>2024</year><month>03</month><day>12</day><volume>10</volume><fpage>e54393</fpage><pub-id pub-id-type="doi">10.2196/54393</pub-id><pub-id pub-id-type="medline">38470459</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhou</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Ong</surname><given-names>H</given-names> </name><name name-style="western"><surname>Kennedy</surname><given-names>P</given-names> </name><etal/></person-group><article-title>Evaluating GPT-V4 (GPT-4 with Vision) on detection of radiologic findings on chest radiographs</article-title><source>Radiology</source><year>2024</year><month>05</month><volume>311</volume><issue>2</issue><fpage>e233270</fpage><pub-id pub-id-type="doi">10.1148/radiol.233270</pub-id><pub-id pub-id-type="medline">38713028</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Brin</surname><given-names>D</given-names> </name><name name-style="western"><surname>Sorin</surname><given-names>V</given-names> </name><name name-style="western"><surname>Barash</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Assessing GPT-4 multimodal performance in radiological image analysis</article-title><source>Eur Radiol</source><year>2025</year><month>04</month><volume>35</volume><issue>4</issue><fpage>1959</fpage><lpage>1965</lpage><pub-id pub-id-type="doi">10.1007/s00330-024-11035-5</pub-id><pub-id pub-id-type="medline">39214893</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Prompt and response examples; area under the curve (AUC) and average precision (AP) values for each iteration of the proposed models; sensitivity, specificity, positive predictive value, and negative predictive value; and receiver operating characteristic (ROC), precision-recall (PR), calibration, and decision curve analysis curves of the baseline and proposed models.</p><media xlink:href="medinform_v14i1e86700_app1.docx" xlink:title="DOCX File, 5167 KB"/></supplementary-material></app-group></back></article>