<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMI</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id>
      <journal-title>JMIR Medical Informatics</journal-title>
      <issn pub-type="epub">2291-9694</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v13i1e63731</article-id>
      <article-id pub-id-type="pmid">39793017</article-id>
      <article-id pub-id-type="doi">10.2196/63731</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Qwen-2.5 Outperforms Other Large Language Models in the Chinese National Nursing Licensing Examination: Retrospective Cross-Sectional Comparative Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Castonguay</surname>
            <given-names>Alexandre</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Hirano</surname>
            <given-names>Yuichiro</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Besler</surname>
            <given-names>Muhammed</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author">
          <name name-style="western">
            <surname>Zhu</surname>
            <given-names>Shiben</given-names>
          </name>
          <degrees>MSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-0846-0453</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Hu</surname>
            <given-names>Wanqin</given-names>
          </name>
          <degrees>MNS</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-2548-5801</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Yang</surname>
            <given-names>Zhi</given-names>
          </name>
          <degrees>MNS</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0007-7354-5879</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Yan</surname>
            <given-names>Jiani</given-names>
          </name>
          <degrees>MIM</degrees>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0007-5373-0305</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Zhang</surname>
            <given-names>Fang</given-names>
          </name>
          <degrees>BBA</degrees>
          <xref rid="aff4" ref-type="aff">4</xref>
          <address>
            <institution>Department of Science and Education</institution>
            <institution>Shenzhen Baoan Women's and Children's Hospital</institution>
            <addr-line>56 Yulu Road</addr-line>
            <addr-line>Xin'an Street, Bao'an District</addr-line>
            <addr-line>Shenzhen, 518001</addr-line>
            <country>China</country>
            <phone>86 13686891225</phone>
            <email>zhangfangf11@163.com</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0005-8263-0755</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Department of Infectious Diseases, Nanfang Hospital</institution>
        <institution>Southern Medical University</institution>
        <addr-line>Guangzhou</addr-line>
        <country>China</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>State Key Laboratory of Organ Failure Research</institution>
        <institution>Key Laboratory of Infectious Diseases Research in South China, Ministry of Education, Guangdong Provincial Key Laboratory of Viral Hepatitis Research</institution>
        <institution>Guangdong Provincial Clinical Research Center for Viral Hepatitis, Guangdong Institute of Hepatology</institution>
        <addr-line>Guangzhou</addr-line>
        <country>China</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>School of Nursing and Health Studies</institution>
        <institution>Hong Kong Metropolitan University</institution>
        <institution>Kowloon</institution>
        <addr-line>Hong Kong</addr-line>
        <country>China (Hong Kong)</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>Department of Science and Education</institution>
        <institution>Shenzhen Baoan Women's and Children's Hospital</institution>
        <addr-line>Shenzhen</addr-line>
        <country>China</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Fang Zhang <email>zhangfangf11@163.com</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2025</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>10</day>
        <month>1</month>
        <year>2025</year>
      </pub-date>
      <volume>13</volume>
      <elocation-id>e63731</elocation-id>
      <history>
        <date date-type="received">
          <day>27</day>
          <month>6</month>
          <year>2024</year>
        </date>
        <date date-type="rev-request">
          <day>26</day>
          <month>7</month>
          <year>2024</year>
        </date>
        <date date-type="rev-recd">
          <day>6</day>
          <month>8</month>
          <year>2024</year>
        </date>
        <date date-type="accepted">
          <day>20</day>
          <month>12</month>
          <year>2024</year>
        </date>
      </history>
      <copyright-statement>©Shiben Zhu, Wanqin Hu, Zhi Yang, Jiani Yan, Fang Zhang. Originally published in JMIR Medical Informatics (https://medinform.jmir.org), 10.01.2025.</copyright-statement>
      <copyright-year>2025</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on https://medinform.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://medinform.jmir.org/2025/1/e63731" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Large language models (LLMs) have been proposed as valuable tools in medical education and practice. The Chinese National Nursing Licensing Examination (CNNLE) presents unique challenges for LLMs due to its requirement for both deep domain–specific nursing knowledge and the ability to make complex clinical decisions, which differentiates it from more general medical examinations. However, their potential application in the CNNLE remains unexplored.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study aims to evaluate the accuracy of 7 LLMs, including GPT-3.5, GPT-4.0, GPT-4o, Copilot, ERNIE Bot-3.5, SPARK, and Qwen-2.5, on the CNNLE, focusing on their ability to handle domain-specific nursing knowledge and clinical decision-making. We also explore whether combining their outputs using machine learning techniques can improve their overall accuracy.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>This retrospective cross-sectional study analyzed all 1200 multiple-choice questions from the CNNLE conducted between 2019 and 2023. Seven LLMs were evaluated on these multiple-choice questions, and 9 machine learning models, including Logistic Regression, Support Vector Machine, Multilayer Perceptron, k-nearest neighbors, Random Forest, LightGBM, AdaBoost, XGBoost, and CatBoost, were used to optimize overall performance through ensemble techniques.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>Qwen-2.5 achieved the highest overall accuracy of 88.9%, followed by GPT-4o (80.7%), ERNIE Bot-3.5 (78.1%), GPT-4.0 (70.3%), SPARK (65.0%), and GPT-3.5 (49.5%). Qwen-2.5 demonstrated superior accuracy in the Practical Skills section compared with the Professional Practice section across most years. It also performed well in brief clinical case summaries and questions involving shared clinical scenarios. When the outputs of the 7 LLMs were combined using 9 machine learning models, XGBoost yielded the best performance, increasing accuracy to 90.8%. XGBoost also achieved an area under the curve of 0.961, sensitivity of 0.905, specificity of 0.978, <italic>F</italic><sub>1</sub>-score of 0.901, positive predictive value of 0.901, and negative predictive value of 0.977.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>This study is the first to evaluate the performance of 7 LLMs on the CNNLE, and it shows that the integration of models via machine learning significantly boosted accuracy, reaching 90.8%. These findings demonstrate the transformative potential of LLMs in revolutionizing health care education and call for further research to refine their capabilities and expand their impact on examination preparation and professional training.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>large language models</kwd>
        <kwd>LLMs</kwd>
        <kwd>Chinese National Nursing Licensing Examination</kwd>
        <kwd>ChatGPT</kwd>
        <kwd>Qwen-2.5</kwd>
        <kwd>multiple-choice questions</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>Nursing licensure examinations are essential for maintaining professional standards, ensuring that health care systems are staffed with qualified professionals, and safeguarding patient safety [<xref ref-type="bibr" rid="ref1">1</xref>]. These examinations assess nurses’ clinical judgment, decision-making, and practical skills, ensuring high-quality care and fostering public trust in the profession [<xref ref-type="bibr" rid="ref2">2</xref>]. Upholding rigorous standards is critical, as competent health care professionals are crucial for addressing the diverse and complex needs of patients worldwide. The Chinese National Nursing Licensing Examination (CNNLE) plays an important role in maintaining high standards of nursing care in China, ensuring that graduates are well prepared for professional practice [<xref ref-type="bibr" rid="ref3">3</xref>]. Serving as a benchmark for nursing competence, the CNNLE confirms that nurses possess the necessary skills and knowledge to provide safe and effective care [<xref ref-type="bibr" rid="ref4">4</xref>]. Beyond its impact on health care quality, the CNNLE also influences educational policies, guiding nursing curricula to meet evolving health care demands. As health care becomes more complex, innovative tools are needed to support the development of skilled professionals capable of providing effective patient care.</p>
      <p>The integration of artificial intelligence (AI) in education is transforming learning and assessment, particularly in fields such as nursing [<xref ref-type="bibr" rid="ref5">5</xref>-<xref ref-type="bibr" rid="ref7">7</xref>]. ChatGPT, an AI tool that generates content by identifying patterns in its training data, simulates human-like conversations and answers questions across a wide range of topics [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>]. Its ability to provide correct answers and offer immediate, detailed feedback makes it a valuable resource for students in simulated test environments and question banks [<xref ref-type="bibr" rid="ref10">10</xref>]. This success in examination settings has sparked interest in using ChatGPT as a self-learning tool, suggesting its potential for enhancing examination preparation and knowledge development [<xref ref-type="bibr" rid="ref11">11</xref>]. Large language models (LLMs) hold promise for clinical education [<xref ref-type="bibr" rid="ref12">12</xref>], where these models integrate natural language processing with user-friendly interfaces [<xref ref-type="bibr" rid="ref13">13</xref>]. In clinics, LLMs are increasingly valuable, particularly in diagnosis [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref15">15</xref>] and clinical licensing examinations [<xref ref-type="bibr" rid="ref16">16</xref>], where accuracy is crucial. Tools such as ChatGPT are being recognized for their potential to enhance clinical documentation [<xref ref-type="bibr" rid="ref17">17</xref>], improve diagnostic accuracy [<xref ref-type="bibr" rid="ref18">18</xref>], and streamline patient care workflows [<xref ref-type="bibr" rid="ref19">19</xref>]. However, the rapid development of LLMs presents significant challenges in assessing their reliability in the CNNLE.</p>
      <p>Passing the CNNLE demands not only theoretical knowledge but also clinical decision-making, critical thinking, and practical skills, areas where LLMs often underperform [<xref ref-type="bibr" rid="ref9">9</xref>]. While tools such as ChatGPT have demonstrated an overall accuracy of 80.75% in nursing education [<xref ref-type="bibr" rid="ref20">20</xref>], their effectiveness diminishes with complex, context-specific questions requiring nuanced medical knowledge [<xref ref-type="bibr" rid="ref21">21</xref>-<xref ref-type="bibr" rid="ref23">23</xref>]. Moreover, concerns regarding patient privacy [<xref ref-type="bibr" rid="ref24">24</xref>] and biases [<xref ref-type="bibr" rid="ref25">25</xref>] in LLM outputs raise questions about their suitability for high-stakes assessments such as the CNNLE, which emphasize fairness and accuracy [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref27">27</xref>]. Despite the growing interest in LLMs for medical education, their potential in the CNNLE remains unexplored. Limited understanding exists regarding their ability to handle clinical reasoning, contextual interpretation, and multistep problem-solving in this specific setting. Addressing this gap is crucial to assess their reliability, limitations, and transformative potential in clinical education. Here, this study examines the distribution of question types in the CNNLE from 2019 to 2023 and evaluates the accuracy of 7 LLMs—including GPT-3.5, GPT-4.0, GPT-4o, Copilot, ERNIE Bot-3.5, SPARK, and Qwen-2.5—in addressing domain-specific nursing knowledge and clinical decision-making. Furthermore, the study explores whether combining their outputs through machine learning techniques can enhance overall accuracy in this context.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Study Design</title>
        <p>This retrospective cross-sectional study evaluated the performance of 7 LLMs on 1200 multiple-choice questions (MCQs) from the CNNLE administered between 2019 and 2023. The study design was chosen for its suitability in systematically analyzing preexisting datasets and assessing the capabilities of LLMs across various question types and levels of complexity. A head-to-head evaluation approach was adopted to compare the LLMs. Each MCQ was independently input into each model under identical conditions, ensuring consistency and fairness in the assessment. This parallel evaluation minimized variability caused by external factors, such as differences in question formats or content, allowing for a direct comparison of performance across all models. By using historical data and a head-to-head evaluation, this study provides an analysis of LLM performance in nursing licensure examinations, offering insights into their potential applications in nursing education and assessment.</p>
      </sec>
      <sec>
        <title>Data Collection</title>
        <p>This study analyzed all 1200 MCQs from the CNNLE administered between 2019 and 2023. Each year, 240 MCQs were included, encompassing the 4 question types (A1, A2, A3, and A4) although their proportions varied annually. This comprehensive approach ensured that the evaluation covered diverse question formats and varying levels of complexity, reflecting the full scope of the CNNLE. To ensure the integrity of the evaluation process, 2 researchers (SZ and WH) independently entered each question into 7 LLMs on separate computers. Each question was input into a new chat session to prevent any influence from prior interactions. The LLMs generated answers and explanations solely based on the input questions without pretraining instructions or additional prompts.</p>
        <p>If inconsistencies were detected in the responses, a third computer was used to reenter the question in a fresh chat session after clearing the LLMs’ memory. In such cases, the models were instructed to provide more detailed explanations. The researchers then collectively reviewed the answers and explanations to determine the most accurate and contextually appropriate response. When LLMs exhibited confusion, failed to provide explanations, produced multiple answers including the correct one, or encountered specialized queries (eg, questions on local policies), additional instructions were provided. These instructions included prompts such as, “This is a single-choice question. Please select the most suitable or probable answer from options 1 to 5,” “Please choose the incorrect option,” “Tell me the reason why,” “In Chinese local policy,” “In Chinese local law,” and “In Chinese society.” All data generated or analyzed during this study are provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, and the iPython Jupyter notebook code is available in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>.</p>
      </sec>
      <sec>
        <title>Ethical Considerations</title>
        <p>The evaluations were conducted between May 15, 2024, and July 17, 2024. All responses were cross-verified against the official CNNLE answer keys. These measures enhanced the reliability and validity of the evaluation process. As this study was purely analytical and did not involve human participants, institutional review board approval and informed consent were not required. All collected data were fully anonymized by removing names, contact details, and other direct identifiers, ensuring no means to reidentify participants.</p>
      </sec>
      <sec>
        <title>Measurements</title>
        <sec>
          <title>The CNNLE</title>
          <p>The CNNLE [<xref ref-type="bibr" rid="ref28">28</xref>] comprises 2 sections: Professional Practice and Practical Skills, each with 120 questions per unit. The Professional Practice section evaluates a candidate’s ability to implement nursing-related knowledge in clinical settings in a safe and effective manner. It covers medical knowledge related to health and disease, basic nursing skills, and the application of social and humanistic knowledge in nursing practice. The Practical Skills section assesses candidates’ capability to apply nursing knowledge and skills in performing nursing tasks. Topics include clinical manifestations of diseases, treatment principles, health assessment, nursing procedures, professional nursing techniques, and health education. The examination format involves objective questions presented in a computer-based format.</p>
          <p>The examination includes 4 question types: A1, A2, A3, and A4, all of which are MCQs. A1 and A2 questions are relatively straightforward, focusing on single knowledge points and brief clinical case summaries, respectively. A3 and A4 questions involve shared clinical scenarios, requiring candidates to analyze and synthesize information comprehensively. A3 questions present 2-3 distinct, patient-centered clinical situations, while A4 questions depict more complex scenarios involving a single patient or family, with 4-6 independent questions that may introduce new information sequentially to test clinical integration skills.</p>
        </sec>
        <sec>
          <title>LLM Selection</title>
          <p>We selected 7 LLMs including GPT-3.5, GPT-4.0, GPT-4o, Copilot, ERNIE Bot-3.5, SPARK, and Qwen-2.5. This diverse selection enabled a comprehensive examination of LLM performance under standardized conditions. GPT-3.5, developed by OpenAI and released in March 2022, is known for generating coherent and contextually relevant text. GPT-4.0, released by OpenAI in March 2023, offers significant improvements in accuracy and understanding. GPT-4o, introduced in May 2024, is an optimized version of GPT-4.0, designed for enhanced performance. ERNIE Bot-3.5, created by Baidu and released in June 2023, is tailored for understanding and generating text in Chinese. SPARK, developed by iFLYTEK and launched in May 2023, enhances performance tools by providing intelligent assistance. Qwen-2.5, created by Alibaba and launched in May 2024, is optimized for complex language understanding, particularly in shopping and customer support contexts. To ensure effectiveness and reliability, each inquiry was conducted only once in a new chat session with each LLM, using 2 different computers. This approach aimed to evaluate the models’ performance in real-world situations without the influence of prior responses.</p>
        </sec>
        <sec>
          <title>Machine Learning Models</title>
          <p>We selected 9 machine learning models, each with recognized performance in classification tasks. Logistic Regression (LR) [<xref ref-type="bibr" rid="ref29">29</xref>] is a fundamental linear model used for binary classification tasks because of its simplicity and interpretability. Support Vector Machine (SVM) [<xref ref-type="bibr" rid="ref30">30</xref>] excels in high-dimensional and complex settings, providing robust classification performance. Multilayer Perceptron (MLP) [<xref ref-type="bibr" rid="ref31">31</xref>] is a neural network model that effectively identifies complex patterns through its layered structure. The k-nearest neighbors (KNN) [<xref ref-type="bibr" rid="ref32">32</xref>] algorithm is a straightforward, nonparametric supervised learning method that classifies or predicts data points based on their proximity to neighboring points, widely acknowledged for its simplicity and effectiveness in both classification and regression tasks.</p>
          <p>Ensemble models improve prediction performance by combining multiple models to mitigate overfitting and improve generalization. Random Forest (RF) [<xref ref-type="bibr" rid="ref33">33</xref>] is esteemed for its high accuracy and ability to mitigate overfitting through ensemble learning, aggregating the predictions of decision trees using a majority vote to enhance predictive robustness. Light Gradient-Boosting Machine (LightGBM) [<xref ref-type="bibr" rid="ref34">34</xref>] is a highly efficient gradient boosting framework that uses a histogram-based technique to bin continuous features, accelerating training speed, improving memory usage, and excelling at processing massive datasets with impressive speed and efficiency. Adaptive Boosting (AdaBoost) [<xref ref-type="bibr" rid="ref35">35</xref>] prioritizes difficult cases, enhancing classification accuracy by iteratively adjusting weights to improve the model. Extreme Gradient Boosting (XGBoost) [<xref ref-type="bibr" rid="ref36">36</xref>], a sophisticated gradient-boosting system developed by Chen, iteratively refines models by splitting tree nodes and fitting residuals, demonstrating exceptional scalability and superior performance across diverse applications. CatBoost [<xref ref-type="bibr" rid="ref37">37</xref>], introduced in 2018, is a sophisticated gradient boosting algorithm known for its outstanding handling of categorical features, reduced training times, and the use of a greedy technique to identify optimal tree splits, thereby improving prediction accuracy.</p>
        </sec>
      </sec>
      <sec>
        <title>Statistical Analysis</title>
        <p>The statistical analysis was conducted using Python 3.11.5 (Python Software Foundation) within the Microsoft Visual Studio Code environment. In preparing the dataset, responses where the LLMs failed to provide any answer were categorized as missing values and coded as –1. For valid responses labeled (A, B, C, D, and E), a numerical encoding scheme was applied, converting them to (1, 2, 3, 4, and 5), respectively. To prepare the data for machine learning algorithms, the dataset underwent normalization, scaling all features to a range between 0 and 1 using the MinMaxScaler from the Scikit-learn library. Descriptive statistics were used to analyze the distribution of question types within the CNNLE dataset from 2019 to 2023. Furthermore, accuracy percentages for the LLMs were computed across 2 distinct subjects and 4 different question types. Various machine learning models were then used with the objective of enhancing predictive performance.</p>
        <p>Nine machine learning models, including LR, SVM, MLP, KNN, RF, LightGBM, AdaBoost, XGBoost, and CatBoost, were trained specifically for this task using the processed CNNLE dataset. None of the models were pretrained; instead, they were trained and optimized using hyperparameter tuning tailored to the dataset. For instance, parameters such as the number of trees and maximum depth were adjusted for RF, while learning rates and boosting parameters were optimized for LightGBM and XGBoost. The leave-one-out cross-validation method was used to ensure robustness and reliability. The dataset was split into training (90%) and testing (10%) sets, with the training set further divided into 9 subsets for hyperparameter tuning. This iterative process was repeated until each subset served as a validation set, minimizing overfitting and ensuring robust performance metrics for the models.</p>
        <p>Model performance was assessed using correlation heatmaps, area under the curves (AUCs), and 7 evaluation metrics: AUC, sensitivity, specificity, <italic>F</italic><sub>1</sub>-score, accuracy, positive predictive value (PPV), and negative predictive value (NPV). Feature importance was analyzed using Shapley Additive Explanations (SHAP), providing the contributions of individual features. SHAP analysis focused on understanding the relative contributions of the 7 LLMs, highlighting how each LLM’s accuracy influenced overall predictions. The analysis used Python packages, including pandas 2.1.4, numpy 1.24.3, scikit-learn 1.3.0, scipy 1.11.4, catboost 1.2, LightGBM 4.1.0, seaborn 0.12.2, SHAP 0.42.1, and matplotlib 3.8.0.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Distribution of Question Types in the CNNLE Over the Years</title>
        <p><xref rid="figure1" ref-type="fig">Figure 1</xref> illustrates the distribution of question types over the years in both sections of the CNNLE. <xref rid="figure1" ref-type="fig">Figure 1</xref>A depicts the distribution of question types over the years in the Practical Skills section. In the Practical Skills section, A1-type questions decreased from 86 in 2019 to 59 in 2023, while A2-type questions increased from 18 to 43. A3-type and A4-type questions showed smaller fluctuations. <xref rid="figure1" ref-type="fig">Figure 1</xref>B shows the distribution of question types over the years in the Professional Practice section. In the Professional Practice section, A1-type questions fell from 67 in 2019 to 55 in 2023, while A2-type questions increased from 33 to 45. A3-type questions remained relatively stable, with minor variations.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Distribution of question types in CNNLE Professional Practice and Practical Skills sections (2019-2023). (A) Distribution of question types in Practical Skills from 2019 to 2023. (B) Distribution of question types in Professional Practice from 2019 to 2023.</p>
          </caption>
          <graphic xlink:href="medinform_v13i1e63731_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Accuracy of LLMs in Professional Practice</title>
        <p><xref ref-type="table" rid="table1">Table 1</xref> presents the accuracy of LLMs in the Professional Practice section from 2019 to 2023. In 2023, Qwen-2.5 achieved the highest accuracy (0.850), followed by ERNIE Bot-3.5 (0.808) and GPT-4o (0.783). GPT-4.0 consistently outperformed GPT-3.5 in all years, with scores of 0.725 and 0.492, respectively, in 2023. Copilot and SPARK also showed moderate performance improvements over time, reaching 0.775 and 0.692 in 2023. Across the 5 years, Qwen-2.5 demonstrated the best overall accuracy (0.875), followed by GPT-4o (0.803) and ERNIE Bot-3.5 (0.785).</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Accuracy of large language models in Professional Practice.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="140"/>
            <col width="120"/>
            <col width="120"/>
            <col width="120"/>
            <col width="120"/>
            <col width="140"/>
            <col width="120"/>
            <col width="120"/>
            <thead>
              <tr valign="top">
                <td>Year</td>
                <td>GPT-3.5</td>
                <td>GPT-4.0</td>
                <td>GPT-4o</td>
                <td>Copilot</td>
                <td>ERNIE Bot-3.5</td>
                <td>SPARK</td>
                <td>Qwen-2.5</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>2023</td>
                <td>0.492</td>
                <td>0.725</td>
                <td>0.783</td>
                <td>0.775</td>
                <td>0.808</td>
                <td>0.692</td>
                <td>0.850</td>
              </tr>
              <tr valign="top">
                <td>2022</td>
                <td>0.450</td>
                <td>0.675</td>
                <td>0.833</td>
                <td>0.767</td>
                <td>0.758</td>
                <td>0.667</td>
                <td>0.917</td>
              </tr>
              <tr valign="top">
                <td>2021</td>
                <td>0.517</td>
                <td>0.683</td>
                <td>0.817</td>
                <td>0.725</td>
                <td>0.783</td>
                <td>0.650</td>
                <td>0.900</td>
              </tr>
              <tr valign="top">
                <td>2020</td>
                <td>0.500</td>
                <td>0.708</td>
                <td>0.725</td>
                <td>0.733</td>
                <td>0.767</td>
                <td>0.600</td>
                <td>0.850</td>
              </tr>
              <tr valign="top">
                <td>2019</td>
                <td>0.550</td>
                <td>0.758</td>
                <td>0.858</td>
                <td>0.583</td>
                <td>0.808</td>
                <td>0.600</td>
                <td>0.858</td>
              </tr>
              <tr valign="top">
                <td>Overall</td>
                <td>0.502</td>
                <td>0.710</td>
                <td>0.803</td>
                <td>0.717</td>
                <td>0.785</td>
                <td>0.642</td>
                <td>0.875</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      </sec>
      <sec>
        <title>Accuracy of LLMs in Practical Skills</title>
        <p><xref ref-type="table" rid="table2">Table 2</xref> presents the accuracy of LLMs in the Practical Skills section from 2019 to 2023. In 2023, Qwen-2.5 achieved the highest accuracy (0.908), followed by GPT-4o (0.833) and Copilot (0.792). GPT-4.0 and ERNIE Bot-3.5 both scored 0.775, showing steady improvement compared with earlier years. SPARK and GPT-3.5 performed moderately, with scores of 0.758 and 0.550, respectively. Over the 5 years, Qwen-2.5 consistently outperformed other models, achieving the highest overall accuracy (0.903). GPT-4o followed with 0.810, while ERNIE Bot-3.5 ranked third with 0.777.</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Accuracy of large language models in Practical Skills.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="150"/>
            <col width="120"/>
            <col width="120"/>
            <col width="120"/>
            <col width="110"/>
            <col width="140"/>
            <col width="120"/>
            <col width="120"/>
            <thead>
              <tr valign="top">
                <td>Year</td>
                <td>GPT-3.5</td>
                <td>GPT-4.0</td>
                <td>GPT-4o</td>
                <td>Copilot</td>
                <td>ERNIE Bot-3.5</td>
                <td>SPARK</td>
                <td>Qwen-2.5</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>2023</td>
                <td>0.550</td>
                <td>0.775</td>
                <td>0.833</td>
                <td>0.792</td>
                <td>0.775</td>
                <td>0.758</td>
                <td>0.908</td>
              </tr>
              <tr valign="top">
                <td>2022</td>
                <td>0.467</td>
                <td>0.692</td>
                <td>0.800</td>
                <td>0.792</td>
                <td>0.792</td>
                <td>0.675</td>
                <td>0.850</td>
              </tr>
              <tr valign="top">
                <td>2021</td>
                <td>0.467</td>
                <td>0.708</td>
                <td>0.850</td>
                <td>0.667</td>
                <td>0.750</td>
                <td>0.567</td>
                <td>0.942</td>
              </tr>
              <tr valign="top">
                <td>2020</td>
                <td>0.475</td>
                <td>0.642</td>
                <td>0.783</td>
                <td>0.592</td>
                <td>0.800</td>
                <td>0.700</td>
                <td>0.933</td>
              </tr>
              <tr valign="top">
                <td>2019</td>
                <td>0.483</td>
                <td>0.658</td>
                <td>0.783</td>
                <td>0.458</td>
                <td>0.767</td>
                <td>0.592</td>
                <td>0.883</td>
              </tr>
              <tr valign="top">
                <td>Overall</td>
                <td>0.488</td>
                <td>0.695</td>
                <td>0.810</td>
                <td>0.660</td>
                <td>0.777</td>
                <td>0.658</td>
                <td>0.903</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      </sec>
      <sec>
        <title>Accuracy of LLMs for Question Types</title>
        <p><xref ref-type="table" rid="table3">Table 3</xref> indicates the accuracy of LLMs across 4 question types (A1, A2, A3, and A4) from 2019 to 2023. In 2023, Qwen-2.5 achieved the highest accuracy for A1, A2, and A3 questions (0.860, 0.909, and 0.853, respectively), while all models reached perfect accuracy (1.000) for A4 questions. GPT-4o consistently performed well across all question types, ranking second or third in accuracy. In 2022, Qwen-2.5 maintained high performance across A1, A2, and A3 questions (0.913, 0.810, and 0.963, respectively). From 2019 to 2021, Qwen-2.5 demonstrated steady improvements across all question types. Overall, Qwen-2.5 achieved the highest average accuracy (0.889), followed by GPT-4o (0.807) and ERNIE Bot-3.5 (0.781).</p>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Accuracy of large language models for 4 question types.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="170"/>
            <col width="0"/>
            <col width="110"/>
            <col width="110"/>
            <col width="110"/>
            <col width="110"/>
            <col width="140"/>
            <col width="110"/>
            <col width="110"/>
            <thead>
              <tr valign="top">
                <td colspan="2">Question type</td>
                <td colspan="2">GPT-3.5</td>
                <td>GPT-4.0</td>
                <td>GPT-4o</td>
                <td>Copilot</td>
                <td>ERNIE Bot-3.5</td>
                <td>SPARK</td>
                <td>Qwen-2.5</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="10">
                  <bold>2023</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">A1</td>
                <td>0.526</td>
                <td>0.684</td>
                <td>0.789</td>
                <td>0.763</td>
                <td>0.763</td>
                <td>0.719</td>
                <td>0.860</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">A2</td>
                <td>0.443</td>
                <td>0.807</td>
                <td>0.830</td>
                <td>0.784</td>
                <td>0.807</td>
                <td>0.705</td>
                <td>0.909</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">A3</td>
                <td>0.647</td>
                <td>0.794</td>
                <td>0.794</td>
                <td>0.824</td>
                <td>0.824</td>
                <td>0.765</td>
                <td>0.853</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">A4</td>
                <td>1.000</td>
                <td>1.000</td>
                <td>1.000</td>
                <td>1.000</td>
                <td>1.000</td>
                <td>1.000</td>
                <td>1.000</td>
              </tr>
              <tr valign="top">
                <td colspan="10">
                  <bold>2022</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">A1</td>
                <td>0.476</td>
                <td>0.746</td>
                <td>0.881</td>
                <td>0.825</td>
                <td>0.817</td>
                <td>0.698</td>
                <td>0.913</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">A2</td>
                <td>0.405</td>
                <td>0.582</td>
                <td>0.671</td>
                <td>0.696</td>
                <td>0.671</td>
                <td>0.620</td>
                <td>0.810</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">A3</td>
                <td>0.519</td>
                <td>0.667</td>
                <td>0.889</td>
                <td>0.778</td>
                <td>0.852</td>
                <td>0.630</td>
                <td>0.963</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">A4</td>
                <td>0.500</td>
                <td>0.750</td>
                <td>1.000</td>
                <td>0.875</td>
                <td>0.875</td>
                <td>0.875</td>
                <td>0.875</td>
              </tr>
              <tr valign="top">
                <td colspan="10">
                  <bold>2021</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">A1</td>
                <td>0.467</td>
                <td>0.660</td>
                <td>0.820</td>
                <td>0.667</td>
                <td>0.727</td>
                <td>0.600</td>
                <td>0.927</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">A2</td>
                <td>0.538</td>
                <td>0.738</td>
                <td>0.846</td>
                <td>0.738</td>
                <td>0.831</td>
                <td>0.646</td>
                <td>0.938</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">A3</td>
                <td>0.524</td>
                <td>0.762</td>
                <td>0.857</td>
                <td>0.714</td>
                <td>0.857</td>
                <td>0.571</td>
                <td>0.810</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">A4</td>
                <td>0.500</td>
                <td>1.000</td>
                <td>1.000</td>
                <td>1.000</td>
                <td>0.750</td>
                <td>0.500</td>
                <td>1.000</td>
              </tr>
              <tr valign="top">
                <td colspan="10">
                  <bold>2020</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">A1</td>
                <td>0.478</td>
                <td>0.675</td>
                <td>0.771</td>
                <td>0.650</td>
                <td>0.771</td>
                <td>0.656</td>
                <td>0.879</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">A2</td>
                <td>0.492</td>
                <td>0.695</td>
                <td>0.763</td>
                <td>0.695</td>
                <td>0.814</td>
                <td>0.678</td>
                <td>0.915</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">A3</td>
                <td>0.550</td>
                <td>0.600</td>
                <td>0.550</td>
                <td>0.650</td>
                <td>0.750</td>
                <td>0.600</td>
                <td>0.900</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">A4</td>
                <td>0.500</td>
                <td>0.750</td>
                <td>1.000</td>
                <td>0.750</td>
                <td>1.000</td>
                <td>0.250</td>
                <td>1.000</td>
              </tr>
              <tr valign="top">
                <td colspan="10">
                  <bold>2019</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">A1</td>
                <td>0.490</td>
                <td>0.680</td>
                <td>0.804</td>
                <td>0.503</td>
                <td>0.791</td>
                <td>0.601</td>
                <td>0.869</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">A2</td>
                <td>0.569</td>
                <td>0.745</td>
                <td>0.843</td>
                <td>0.588</td>
                <td>0.784</td>
                <td>0.588</td>
                <td>0.843</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">A3</td>
                <td>0.556</td>
                <td>0.778</td>
                <td>0.861</td>
                <td>0.500</td>
                <td>0.778</td>
                <td>0.583</td>
                <td>0.917</td>
              </tr>
              <tr valign="top">
                <td colspan="2">Overall</td>
                <td colspan="2">0.495</td>
                <td>0.703</td>
                <td>0.807</td>
                <td>0.688</td>
                <td>0.781</td>
                <td>0.650</td>
                <td>0.889</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      </sec>
      <sec>
        <title>Correlation Heatmap and AUC Curves Using Machine Learning</title>
        <p><xref rid="figure2" ref-type="fig">Figure 2</xref> provides an analysis of the correlation heatmap and AUC curves for machine learning models. <xref rid="figure2" ref-type="fig">Figure 2</xref>A presents the correlation heatmap, where Qwen-2.5 shows the highest correlation with correct answers (<italic>r</italic>=0.859), while GPT-3.5 shows the lowest correlation (<italic>r</italic>=0.402). <xref rid="figure2" ref-type="fig">Figure 2</xref>B illustrates the AUC scores for each machine learning model in the multiclass classification task. The models achieved the following AUC scores: LR (0.946), SVM (0.980), RF (0.976), KNN (0.930), MLP (0.973), LightGBM (0.963), AdaBoost (0.962), XGBoost (0.961), and CatBoost (0.970).</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>Correlation heatmap and AUC curves of machine learning models in CNNLE. (A) Correlation heatmap: The heatmap illustrates the relationships between different LLMs. The lower left displays numerical correlation values, while the upper right represents correlation magnitude through circle size. Color gradients range from blue (low correlation) to red (high correlation), providing a visual summary of metric interdependencies. (B) AUC curves: The AUC curves compare the performance of various machine learning and ensemble models, highlighting their classification accuracy across the data set. AdaBoost: Adaptive Boosting; AUC: area under the curve; CatBoost: Categorical Boosting; KNN: k-nearest neighbor; LightGBM: Light Gradient-Boosting Machine; LR: Logistic Regression; MLP: Multilayer Perceptron; RF: Random Forest; SVM: Support Vector Machine; XGBoost: Extreme Gradient Boosting.</p>
          </caption>
          <graphic xlink:href="medinform_v13i1e63731_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Metrics for 5-Class Classification Using Machine Learning</title>
        <p><xref ref-type="table" rid="table4">Table 4</xref> presents a comparative analysis of 9 machine learning models for multiclass classification, evaluated by average metrics including AUC, accuracy, sensitivity, specificity, precision, PPV, <italic>F</italic><sub>1</sub>-score, and NPV. Among these, the SVM and XGBoost models achieve AUC values of 0.980 and 0.961, along with accuracy scores of 0.858 and 0.908, respectively. In contrast, LR and KNN exhibit lower accuracy scores of 0.817 and 0.767.</p>
        <table-wrap position="float" id="table4">
          <label>Table 4</label>
          <caption>
            <p>Metrics of machine learning.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="140"/>
            <col width="100"/>
            <col width="110"/>
            <col width="110"/>
            <col width="110"/>
            <col width="120"/>
            <col width="110"/>
            <col width="110"/>
            <col width="90"/>
            <thead>
              <tr valign="bottom">
                <td>Classifier</td>
                <td>AUC<sup>a</sup></td>
                <td>Accuracy</td>
                <td>Sensitivity</td>
                <td>Specificity</td>
                <td>Precision</td>
                <td>PPV<sup>b</sup></td>
                <td><italic>F</italic><sub>1</sub>-score</td>
                <td>NPV<sup>c</sup></td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>LR<sup>d</sup></td>
                <td>0.946</td>
                <td>0.817</td>
                <td>0.808</td>
                <td>0.953</td>
                <td>0.818</td>
                <td>0.818</td>
                <td>0.809</td>
                <td>0.954</td>
              </tr>
              <tr valign="top">
                <td>SVM<sup>e</sup></td>
                <td>0.980</td>
                <td>0.858</td>
                <td>0.857</td>
                <td>0.965</td>
                <td>0.861</td>
                <td>0.861</td>
                <td>0.854</td>
                <td>0.964</td>
              </tr>
              <tr valign="top">
                <td>RF<sup>f</sup></td>
                <td>0.976</td>
                <td>0.858</td>
                <td>0.860</td>
                <td>0.965</td>
                <td>0.856</td>
                <td>0.856</td>
                <td>0.854</td>
                <td>0.964</td>
              </tr>
              <tr valign="top">
                <td>KNN<sup>g</sup></td>
                <td>0.930</td>
                <td>0.767</td>
                <td>0.772</td>
                <td>0.942</td>
                <td>0.787</td>
                <td>0.787</td>
                <td>0.768</td>
                <td>0.941</td>
              </tr>
              <tr valign="top">
                <td>MLP<sup>h</sup></td>
                <td>0.973</td>
                <td>0.825</td>
                <td>0.823</td>
                <td>0.957</td>
                <td>0.830</td>
                <td>0.830</td>
                <td>0.819</td>
                <td>0.956</td>
              </tr>
              <tr valign="top">
                <td>LightGBM<sup>i</sup></td>
                <td>0.963</td>
                <td>0.900</td>
                <td>0.908</td>
                <td>0.975</td>
                <td>0.895</td>
                <td>0.895</td>
                <td>0.899</td>
                <td>0.974</td>
              </tr>
              <tr valign="top">
                <td>AdaBoost<sup>j</sup></td>
                <td>0.962</td>
                <td>0.858</td>
                <td>0.859</td>
                <td>0.964</td>
                <td>0.855</td>
                <td>0.855</td>
                <td>0.856</td>
                <td>0.964</td>
              </tr>
              <tr valign="top">
                <td>XGBoost<sup>k</sup></td>
                <td>0.961</td>
                <td>0.908</td>
                <td>0.905</td>
                <td>0.978</td>
                <td>0.901</td>
                <td>0.901</td>
                <td>0.901</td>
                <td>0.977</td>
              </tr>
              <tr valign="top">
                <td>CatBoost<sup>l</sup></td>
                <td>0.970</td>
                <td>0.892</td>
                <td>0.892</td>
                <td>0.974</td>
                <td>0.885</td>
                <td>0.885</td>
                <td>0.885</td>
                <td>0.973</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table4fn1">
              <p><sup>a</sup>AUC: area under the curve.</p>
            </fn>
            <fn id="table4fn2">
              <p><sup>b</sup>PPV: positive predictive value.</p>
            </fn>
            <fn id="table4fn3">
              <p><sup>c</sup>NPV: negative predictive value.</p>
            </fn>
            <fn id="table4fn4">
              <p><sup>d</sup>LR: Logistic Regression.</p>
            </fn>
            <fn id="table4fn5">
              <p><sup>e</sup>SVM: Support Vector Machine.</p>
            </fn>
            <fn id="table4fn6">
              <p><sup>f</sup>RF: Random Forest.</p>
            </fn>
            <fn id="table4fn7">
              <p><sup>g</sup>KNN: k-nearest neighbor.</p>
            </fn>
            <fn id="table4fn8">
              <p><sup>h</sup>MLP: Multilayer Perceptron.</p>
            </fn>
            <fn id="table4fn9">
              <p><sup>i</sup>LightGBM: Light Gradient-Boosting Machine.</p>
            </fn>
            <fn id="table4fn10">
              <p><sup>j</sup>AdaBoost: Adaptive Boosting.</p>
            </fn>
            <fn id="table4fn11">
              <p><sup>k</sup>XGBoost: Extreme Gradient Boosting.</p>
            </fn>
            <fn id="table4fn12">
              <p><sup>l</sup>CatBoost: Categorical Boosting.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Importance Ranking of SVM and XGBoost Models</title>
        <p><xref rid="figure3" ref-type="fig">Figure 3</xref> presents the SHAP summary bar plot for both the SVM and XGBoost models. In <xref rid="figure3" ref-type="fig">Figure 3</xref>A, the SVM model ranks the features as follows: Qwen-2.5, ERNIE Bot-3.5, GPT-4o, Copilot, GPT-4.0, SPARK, and GPT-3.5. Meanwhile, <xref rid="figure3" ref-type="fig">Figure 3</xref>B shows the importance ranking of the XGBoost model, with a slightly different order: Qwen-2.5, GPT-4o, ERNIE Bot-3.5, Copilot, GPT-3.5, SPARK, and GPT-4.0. Qwen-2.5 stands out as the most influential feature in both models. Furthermore, including other LLMs enhances overall model performance, as evidenced by improvements in AUC and accuracy.</p>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>SHAP summary bar plot in Support Vector Machine (SVM) and Extreme Gradient Boosting (XGBoost) models. (A) Importance ranking of SVM model. (B) Importance ranking of XGBoost model. SHAP: Shapley Additive Explanations.</p>
          </caption>
          <graphic xlink:href="medinform_v13i1e63731_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>This study is the first to evaluate the performance of 7 LLMs on the CNNLE dataset (2019-2023), highlighting significant advancements in Chinese LLM development and their applications in nursing education. Among the models tested, Qwen-2.5 demonstrated the highest accuracy (88.92%), significantly surpassing the performance of the other LLMs. This superior accuracy can be attributed to its training on an extensive Chinese dataset and optimized parameters, enabling it to handle domain-specific nursing knowledge and complex clinical decision-making tasks with exceptional precision. These results underline the growing feasibility of deploying advanced LLMs such as Qwen-2.5 to support standardized nursing examinations in China, offering consistent, scalable, and efficient assessments.</p>
        <p>Our findings offer a clear pathway for the practical application of Qwen-2.5 in nursing curricula and professional training. For instance, Qwen-2.5 could serve as a virtual tutor, providing personalized feedback and explanations to nursing students in real time. Its ability to respond promptly and accurately to a wide range of questions makes it particularly valuable for addressing individual knowledge gaps and reinforcing complex concepts. Educators could incorporate Qwen-2.5 into classroom activities, using it to simulate clinical scenarios or evaluate students’ decision-making skills. Furthermore, mobile apps powered by Qwen-2.5 could allow nursing students to access high-quality, interactive learning resources anytime and anywhere, thereby enhancing accessibility and flexibility in education. Beyond supporting student learning, Qwen-2.5 and other LLMs can enhance professional development for practicing nurses. For instance, these models could be integrated into continuing education programs, where they act as interactive resources to update practitioners on the latest evidence-based practices. By serving as a knowledge repository, LLMs can enable nurses to quickly access relevant guidelines, ensuring timely and informed clinical decisions.</p>
        <p>The results also extend prior research by demonstrating how ensemble machine learning methods can enhance LLM performance in specialized tasks. By integrating the outputs of 7 LLMs using the XGBoost algorithm, we achieved an improved accuracy of 90.83%, surpassing the best-performing single model. This novel application of ensemble methods highlights a promising direction for developing personalized LLMs tailored to specific domains, such as health care education. Previous studies, including those by Li et al [<xref ref-type="bibr" rid="ref38">38</xref>] and Brin et al [<xref ref-type="bibr" rid="ref39">39</xref>], have emphasized the value of context-specific tuning, but our research provides concrete evidence of the effectiveness of combining multiple models to enhance accuracy in domain-specific applications.</p>
        <p>Furthermore, our study situates Chinese LLMs within the broader global landscape of AI. While skepticism has persisted regarding whether Chinese LLMs can rival models developed by OpenAI or Google, our results demonstrate that Qwen-2.5 not only excels on the CNNLE assessment but also outperforms other LLMs. For example, Qwen-2.5 achieved a higher accuracy than GPT-4 (72.5%) on similar standardized tests, as reported in previous studies [<xref ref-type="bibr" rid="ref40">40</xref>]. This performance underscores the competitive edge of Chinese-developed models, particularly in addressing language-specific and cultural nuances in health care education.</p>
        <p>Our findings also reveal recent trends in nursing education, such as the increasing complexity of standardized examinations such as the CNNLE, which has transitioned from straightforward A1-type questions to more analytical A2-type clinical case scenarios. This shift reflects the growing need for nursing professionals to develop higher-order reasoning skills and apply clinical knowledge to real-world situations. Qwen-2.5’s ability to process nuanced clinical scenarios positions it as an effective tool for addressing these demands. By integrating models such as Qwen-2.5 into nursing curricula, educators can better prepare students for the complexities of modern health care through scenario-based learning and real-time feedback.</p>
        <p>Our study also demonstrates the broader impact of China’s advancements in AI, particularly through open access LLMs such as Qwen-2.5 and ERNIE Bot-3.5, which provide practical solutions for addressing regional disparities in nursing education. These models are especially valuable in regions where access to global LLMs, such as OpenAI’s GPT models, is restricted, as they deliver high-quality, localized educational content. By promoting the standardization of nursing education across institutions, these tools help bridge resource gaps and improve the overall quality of health care training in China. Furthermore, their ability to enable personalized and flexible learning through mobile apps empowers students to access educational resources anytime and anywhere. This adaptability positions Chinese LLMs as critical tools for advancing specialized education while addressing both regional inequalities and broader challenges in health care training.</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>First, our evaluation relied on MCQs to assess the knowledge of LLMs, which may not fully capture their ability to handle open-ended or complex clinical tasks. Future studies could incorporate open-ended questions, clinical simulations, or case-based assessments to evaluate LLMs’ reasoning and decision-making capabilities more comprehensively. These methods would better reflect the unstructured and nuanced scenarios encountered in real-world clinical practice, providing a deeper understanding of how LLMs process complex clinical information. Second, the performance of LLMs can vary based on factors such as prompt design, the number of questions asked, and the context of those questions, introducing variability into results. To address this, standardized evaluation protocols should be developed to ensure consistency across benchmarking studies. Furthermore, future research could focus on refining prompt engineering techniques and optimizing model fine-tuning to improve accuracy and reliability in diverse clinical applications. These refinements could support the development of LLMs that are better suited to handling complex scenarios, such as differential diagnoses or multistep decision-making. Third, while Qwen-2.5 demonstrated highest accuracy on the CNNLE dataset, its optimization for the Chinese language and MCQ format may limit its generalizability to other medical domains and contexts. Future studies should evaluate its applicability in multilingual and open-ended settings to assess its effectiveness in tasks beyond standardized testing formats and within various health care contexts. To enhance the suitability of LLMs for specialized health care tasks, such as diagnostic reasoning and treatment planning, future research could prioritize the development of domain-specific models. This could involve fine-tuning LLMs on datasets that include detailed case histories, diagnostic pathways, and clinical protocols. 
Such datasets would allow the models to learn context-specific patterns and reasoning processes, equipping them to provide more accurate and relevant recommendations in clinical settings. Furthermore, fine-tuned models could be used to assist in treatment planning by integrating data from clinical guidelines, patient histories, and risk assessment tools to offer tailored suggestions for patient care. Addressing biases in LLM training is also essential for ensuring equitable decision-making across diverse patient populations. Researchers should consider incorporating fairness-aware algorithms and curated datasets that reflect demographic diversity to mitigate potential biases. Such efforts could ensure that domain-specific LLMs provide consistent and unbiased recommendations, particularly in high-stakes environments such as emergency department triage workflows. Finally, this study focused on general-purpose LLMs, excluding models explicitly trained for medical tasks, such as Gemini or Claude. Preliminary findings suggest that fine-tuned medical models may achieve superior accuracy for specific applications. Future research should conduct comparative evaluations of general-purpose and domain-specific LLMs to identify the optimal approach for different health care needs. These studies could also assess whether fine-tuned models are more effective in real-time clinical workflows, such as triage systems, or in supporting complex decision-making across various medical specialties.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>This study is the first to evaluate the performance of 7 LLMs on the CNNLE and demonstrates that the integration of models via machine learning significantly boosted accuracy, reaching 90.8%. These findings demonstrate the transformative potential of LLMs in revolutionizing health care education and call for further research to refine their capabilities and expand their impact on examination preparation and professional training.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Original data set.</p>
        <media xlink:href="medinform_v13i1e63731_app1.xlsx" xlink:title="XLSX File  (Microsoft Excel File), 68 KB"/>
      </supplementary-material>
      <supplementary-material id="app2">
        <label>Multimedia Appendix 2</label>
        <p>Ipynb notebook codes.</p>
        <media xlink:href="medinform_v13i1e63731_app2.pdf" xlink:title="PDF File  (Adobe PDF File), 249 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AdaBoost</term>
          <def>
            <p>Adaptive Boosting</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">AI</term>
          <def>
            <p>artificial intelligence</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">AUC</term>
          <def>
            <p>area under the curve</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">CNNLE</term>
          <def>
            <p>Chinese National Nursing Licensing Examination</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">KNN</term>
          <def>
            <p>k-nearest neighbors</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">LightGBM</term>
          <def>
            <p>Light Gradient-Boosting Machine</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">LLM</term>
          <def>
            <p>large language model</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">LR</term>
          <def>
            <p>Logistic Regression</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">MCQ</term>
          <def>
            <p>multiple-choice question</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">MLP</term>
          <def>
            <p>Multilayer Perceptron</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb11">NPV</term>
          <def>
            <p>negative predictive value</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb12">PPV</term>
          <def>
            <p>positive predictive value</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb13">RF</term>
          <def>
            <p>Random Forest</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb14">SHAP</term>
          <def>
            <p>Shapley Additive Explanations</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb15">SVM</term>
          <def>
            <p>Support Vector Machine</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb16">XGBoost</term>
          <def>
            <p>Extreme Gradient Boosting</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <fn-group>
      <fn fn-type="con">
        <p>SZ was responsible for the conceptualization, methodology, software, validation, formal analysis, investigation, resources, data curation, writing of the original draft, reviewing, editing the manuscript, and visualization. WH and ZY managed data curation and visualization. JY also handled data curation and contributed to reviewing and editing the manuscript. FZ supervised the project, contributed to reviewing and editing the manuscript, and managed project administration. All authors have read and approved the final version of the manuscript and agreed to be accountable for all aspects of the work, ensuring that any questions related to the accuracy or integrity of any part of the work are appropriately investigated and resolved.</p>
      </fn>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <collab>American Association of Critical-Care Nurses</collab>
          </person-group>
          <article-title>Safeguarding the patient and the profession: the value of critical care nurse certification</article-title>
          <source>Am J Crit Care</source>
          <year>2003</year>
          <volume>12</volume>
          <issue>2</issue>
          <fpage>154</fpage>
          <lpage>164</lpage>
          <pub-id pub-id-type="medline">12625174</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gambrill</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <source>Critical Thinking in Clinical Practice: Improving the Quality of Judgments and Decisions</source>
          <year>2006</year>
          <publisher-loc>Hoboken, NJ</publisher-loc>
          <publisher-name>John Wiley &#38; Sons</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zong</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Performance of ChatGPT on Chinese national medical licensing examinations: a five-year examination evaluation study for physicians, pharmacists and nurses</article-title>
          <source>BMC Med Educ</source>
          <year>2024</year>
          <volume>24</volume>
          <issue>1</issue>
          <fpage>143</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmededuc.biomedcentral.com/articles/10.1186/s12909-024-05125-7"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12909-024-05125-7</pub-id>
          <pub-id pub-id-type="medline">38355517</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12909-024-05125-7</pub-id>
          <pub-id pub-id-type="pmcid">PMC10868058</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Miao</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Luo</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Performance of GPT-4 on Chinese nursing examination: potentials for AI-assisted nursing education using large language models</article-title>
          <source>Nurse Educ</source>
          <year>2024</year>
          <volume>49</volume>
          <issue>6</issue>
          <fpage>E338</fpage>
          <lpage>E343</lpage>
          <pub-id pub-id-type="doi">10.1097/NNE.0000000000001679</pub-id>
          <pub-id pub-id-type="medline">38981035</pub-id>
          <pub-id pub-id-type="pii">00006223-990000000-00488</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Allen</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Woodnutt</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Can ChatGPT pass a nursing exam?</article-title>
          <source>Int J Nurs Stud</source>
          <year>2023</year>
          <volume>145</volume>
          <fpage>104522</fpage>
          <pub-id pub-id-type="doi">10.1016/j.ijnurstu.2023.104522</pub-id>
          <pub-id pub-id-type="medline">37354792</pub-id>
          <pub-id pub-id-type="pii">S0020-7489(23)00087-1</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Castonguay</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Farthing</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Davies</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Vogelsang</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Kleib</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Risling</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Green</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Revolutionizing nursing education through AI integration: a reflection on the disruptive impact of ChatGPT</article-title>
          <source>Nurse Educ Today</source>
          <year>2023</year>
          <volume>129</volume>
          <fpage>105916</fpage>
          <pub-id pub-id-type="doi">10.1016/j.nedt.2023.105916</pub-id>
          <pub-id pub-id-type="medline">37515957</pub-id>
          <pub-id pub-id-type="pii">S0260-6917(23)00210-1</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>BS</given-names>
            </name>
          </person-group>
          <article-title>Transformation of undergraduate medical education in 2023</article-title>
          <source>JAMA</source>
          <year>2023</year>
          <volume>330</volume>
          <issue>16</issue>
          <fpage>1521</fpage>
          <lpage>1522</lpage>
          <pub-id pub-id-type="doi">10.1001/jama.2023.16943</pub-id>
          <pub-id pub-id-type="medline">37698855</pub-id>
          <pub-id pub-id-type="pii">2809659</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Branum</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Schiavenato</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Can ChatGPT accurately answer a PICOT question? Assessing AI response to a clinical question</article-title>
          <source>Nurse Educ</source>
          <year>2023</year>
          <volume>48</volume>
          <issue>5</issue>
          <fpage>231</fpage>
          <lpage>233</lpage>
          <pub-id pub-id-type="doi">10.1097/NNE.0000000000001436</pub-id>
          <pub-id pub-id-type="medline">37130197</pub-id>
          <pub-id pub-id-type="pii">00006223-990000000-00234</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Harris</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Large language models answer medical questions accurately, but can't match clinicians' knowledge</article-title>
          <source>JAMA</source>
          <year>2023</year>
          <volume>330</volume>
          <issue>9</issue>
          <fpage>792</fpage>
          <lpage>794</lpage>
          <pub-id pub-id-type="doi">10.1001/jama.2023.14311</pub-id>
          <pub-id pub-id-type="medline">37548971</pub-id>
          <pub-id pub-id-type="pii">2808297</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Alam</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Lim</surname>
              <given-names>MA</given-names>
            </name>
            <name name-style="western">
              <surname>Zulkipli</surname>
              <given-names>IN</given-names>
            </name>
          </person-group>
          <article-title>Integrating AI in medical education: embracing ethical usage and critical understanding</article-title>
          <source>Front Med (Lausanne)</source>
          <year>2023</year>
          <volume>10</volume>
          <fpage>1279707</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37901398"/>
          </comment>
          <pub-id pub-id-type="doi">10.3389/fmed.2023.1279707</pub-id>
          <pub-id pub-id-type="medline">37901398</pub-id>
          <pub-id pub-id-type="pmcid">PMC10611520</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>How I use ChatGPT responsibly in my teaching</article-title>
          <source>Nature</source>
          <year>2023</year>
          <issue>476-4687</issue>
          <fpage>1</fpage>
          <lpage>3</lpage>
          <comment>cited 2024 Dec 31<ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.nature.com/articles/d41586-023-01026-9"/></comment>
          <pub-id pub-id-type="doi">10.1038/d41586-023-01026-9</pub-id>
          <pub-id pub-id-type="medline">37045954</pub-id>
          <pub-id pub-id-type="pii">10.1038/d41586-023-01026-9</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Yi</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Ye</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>PS</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Xie</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>A survey on evaluation of large language models</article-title>
          <source>ACM Trans Intell Syst Technol</source>
          <year>2024</year>
          <volume>15</volume>
          <issue>3</issue>
          <fpage>1</fpage>
          <lpage>45</lpage>
          <pub-id pub-id-type="doi">10.1145/3641289</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Abbasian</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Khatibi</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Azimi</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Oniani</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Shakeri Hossein Abad</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Thieme</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sriram</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Gevaert</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Jain</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Rahmani</surname>
              <given-names>AM</given-names>
            </name>
          </person-group>
          <article-title>Foundation metrics for evaluating effectiveness of healthcare conversations powered by generative AI</article-title>
          <source>NPJ Digit Med</source>
          <year>2024</year>
          <volume>7</volume>
          <issue>1</issue>
          <fpage>82</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41746-024-01074-z"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41746-024-01074-z</pub-id>
          <pub-id pub-id-type="medline">38553625</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41746-024-01074-z</pub-id>
          <pub-id pub-id-type="pmcid">PMC10980701</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Singhal</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Azizi</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Tu</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Mahdavi</surname>
              <given-names>SS</given-names>
            </name>
            <name name-style="western">
              <surname>Wei</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chung</surname>
              <given-names>HW</given-names>
            </name>
            <name name-style="western">
              <surname>Scales</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Tanwani</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Cole-Lewis</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Pfohl</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Payne</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Seneviratne</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Gamble</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Kelly</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Babiker</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Schärli</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Chowdhery</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Mansfield</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Demner-Fushman</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Agüera Y Arcas</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Webster</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Corrado</surname>
              <given-names>GS</given-names>
            </name>
            <name name-style="western">
              <surname>Matias</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Chou</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Gottweis</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Tomasev</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Rajkomar</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Barral</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Semturs</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Karthikesalingam</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Natarajan</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Large language models encode clinical knowledge</article-title>
          <source>Nature</source>
          <year>2023</year>
          <volume>620</volume>
          <issue>7972</issue>
          <fpage>172</fpage>
          <lpage>180</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37438534"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41586-023-06291-2</pub-id>
          <pub-id pub-id-type="medline">37438534</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41586-023-06291-2</pub-id>
          <pub-id pub-id-type="pmcid">PMC10396962</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Thirunavukarasu</surname>
              <given-names>AJ</given-names>
            </name>
            <name name-style="western">
              <surname>Ting</surname>
              <given-names>DSJ</given-names>
            </name>
            <name name-style="western">
              <surname>Elangovan</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Gutierrez</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Tan</surname>
              <given-names>TF</given-names>
            </name>
            <name name-style="western">
              <surname>Ting</surname>
              <given-names>DSW</given-names>
            </name>
          </person-group>
          <article-title>Large language models in medicine</article-title>
          <source>Nat Med</source>
          <year>2023</year>
          <volume>29</volume>
          <issue>8</issue>
          <fpage>1930</fpage>
          <lpage>1940</lpage>
          <pub-id pub-id-type="doi">10.1038/s41591-023-02448-8</pub-id>
          <pub-id pub-id-type="medline">37460753</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41591-023-02448-8</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Moura</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Jones</surname>
              <given-names>DT</given-names>
            </name>
            <name name-style="western">
              <surname>Sheikh</surname>
              <given-names>IS</given-names>
            </name>
            <name name-style="western">
              <surname>Murphy</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kalfin</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Kummer</surname>
              <given-names>BR</given-names>
            </name>
            <name name-style="western">
              <surname>Weathers</surname>
              <given-names>AL</given-names>
            </name>
            <name name-style="western">
              <surname>Grinspan</surname>
              <given-names>ZM</given-names>
            </name>
            <name name-style="western">
              <surname>Silsbee</surname>
              <given-names>HM</given-names>
            </name>
            <name name-style="western">
              <surname>Jones</surname>
              <given-names>LK</given-names>
            </name>
            <name name-style="western">
              <surname>Patel</surname>
              <given-names>AD</given-names>
            </name>
          </person-group>
          <article-title>Implications of large language models for quality and efficiency of neurologic care: emerging issues in neurology</article-title>
          <source>Neurology</source>
          <year>2024</year>
          <volume>102</volume>
          <issue>11</issue>
          <fpage>e209497</fpage>
          <pub-id pub-id-type="doi">10.1212/WNL.0000000000209497</pub-id>
          <pub-id pub-id-type="medline">38759131</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kernberg</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Gold</surname>
              <given-names>JA</given-names>
            </name>
            <name name-style="western">
              <surname>Mohan</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Using ChatGPT-4 to create structured medical notes from audio recordings of physician-patient encounters: comparative study</article-title>
          <source>J Med Internet Res</source>
          <year>2024</year>
          <volume>26</volume>
          <fpage>e54419</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2024/1/e54419/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/54419</pub-id>
          <pub-id pub-id-type="medline">38648636</pub-id>
          <pub-id pub-id-type="pii">v26i1e54419</pub-id>
          <pub-id pub-id-type="pmcid">PMC11074889</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Caruccio</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Cirillo</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Polese</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Solimando</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Sundaramurthy</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Tortora</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Can ChatGPT provide intelligent diagnoses? A comparative study between predictive models and ChatGPT to define a new medical diagnostic bot</article-title>
          <source>Expert Syst Appl</source>
          <year>2024</year>
          <volume>235</volume>
          <fpage>121186</fpage>
          <pub-id pub-id-type="doi">10.1016/j.eswa.2023.121186</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ferdush</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Begum</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Hossain</surname>
              <given-names>ST</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT and clinical decision support: scope, application, and limitations</article-title>
          <source>Ann Biomed Eng</source>
          <year>2024</year>
          <volume>52</volume>
          <issue>5</issue>
          <fpage>1119</fpage>
          <lpage>1124</lpage>
          <pub-id pub-id-type="doi">10.1007/s10439-023-03329-4</pub-id>
          <pub-id pub-id-type="medline">37516680</pub-id>
          <pub-id pub-id-type="pii">10.1007/s10439-023-03329-4</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Su</surname>
              <given-names>MC</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>LE</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>LH</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>YC</given-names>
            </name>
          </person-group>
          <article-title>Assessing question characteristic influences on ChatGPT's performance and response-explanation consistency: insights from Taiwan's nursing licensing exam</article-title>
          <source>Int J Nurs Stud</source>
          <year>2024</year>
          <volume>153</volume>
          <fpage>104717</fpage>
          <pub-id pub-id-type="doi">10.1016/j.ijnurstu.2024.104717</pub-id>
          <pub-id pub-id-type="medline">38401366</pub-id>
          <pub-id pub-id-type="pii">S0020-7489(24)00029-4</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pham</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Govender</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Tehami</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Chavez</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Adepoju</surname>
              <given-names>OE</given-names>
            </name>
            <name name-style="western">
              <surname>Liaw</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT's performance in cardiac arrest and bradycardia simulations using the American Heart Association's advanced cardiovascular life support guidelines: exploratory study</article-title>
          <source>J Med Internet Res</source>
          <year>2024</year>
          <volume>26</volume>
          <fpage>e55037</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2024/1/e55037/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/55037</pub-id>
          <pub-id pub-id-type="medline">38648098</pub-id>
          <pub-id pub-id-type="pii">v26i1e55037</pub-id>
          <pub-id pub-id-type="pmcid">PMC11074885</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Burk-Rafel</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Reinstein</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Feng</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>MB</given-names>
            </name>
            <name name-style="western">
              <surname>Miller</surname>
              <given-names>LH</given-names>
            </name>
            <name name-style="western">
              <surname>Cocks</surname>
              <given-names>PM</given-names>
            </name>
            <name name-style="western">
              <surname>Marin</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Aphinyanaphongs</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Development and validation of a machine learning-based decision support tool for residency applicant screening and review</article-title>
          <source>Acad Med</source>
          <year>2021</year>
          <volume>96</volume>
          <issue>11S</issue>
          <fpage>S54</fpage>
          <lpage>S61</lpage>
          <pub-id pub-id-type="doi">10.1097/ACM.0000000000004317</pub-id>
          <pub-id pub-id-type="medline">34348383</pub-id>
          <pub-id pub-id-type="pii">00001888-202111001-00013</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Liang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ye</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Hu</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Pan</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Shi</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Shi</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Qu</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Enhancement of the performance of large language models in diabetes education through retrieval-augmented generation: comparative study</article-title>
          <source>J Med Internet Res</source>
          <year>2024</year>
          <volume>26</volume>
          <fpage>e58041</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2024/1/e58041/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/58041</pub-id>
          <pub-id pub-id-type="medline">39046096</pub-id>
          <pub-id pub-id-type="pii">v26i1e58041</pub-id>
          <pub-id pub-id-type="pmcid">PMC11584532</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Li</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Exploring the clinical capabilities and limitations of ChatGPT: a cautionary tale for medical applications</article-title>
          <source>Int J Surg</source>
          <year>2023</year>
          <volume>109</volume>
          <issue>9</issue>
          <fpage>2865</fpage>
          <lpage>2867</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37222684"/>
          </comment>
          <pub-id pub-id-type="doi">10.1097/JS9.0000000000000523</pub-id>
          <pub-id pub-id-type="medline">37222684</pub-id>
          <pub-id pub-id-type="pii">01279778-990000000-00387</pub-id>
          <pub-id pub-id-type="pmcid">PMC10498888</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jiang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Hao</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Fauss</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Detecting ChatGPT-generated essays in a large-scale writing assessment: is there a bias against non-native English speakers?</article-title>
          <source>Comput Educ</source>
          <year>2024</year>
          <volume>217</volume>
          <fpage>105070</fpage>
          <pub-id pub-id-type="doi">10.1016/j.compedu.2024.105070</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zack</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Lehman</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Suzgun</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Rodriguez</surname>
              <given-names>JA</given-names>
            </name>
            <name name-style="western">
              <surname>Celi</surname>
              <given-names>LA</given-names>
            </name>
            <name name-style="western">
              <surname>Gichoya</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Jurafsky</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Szolovits</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Bates</surname>
              <given-names>DW</given-names>
            </name>
            <name name-style="western">
              <surname>Abdulnour</surname>
              <given-names>RE</given-names>
            </name>
            <name name-style="western">
              <surname>Butte</surname>
              <given-names>AJ</given-names>
            </name>
            <name name-style="western">
              <surname>Alsentzer</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Assessing the potential of GPT-4 to perpetuate racial and gender biases in health care: a model evaluation study</article-title>
          <source>Lancet Digit Health</source>
          <year>2024</year>
          <volume>6</volume>
          <issue>1</issue>
          <fpage>e12</fpage>
          <lpage>e22</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S2589-7500(23)00225-X"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/S2589-7500(23)00225-X</pub-id>
          <pub-id pub-id-type="medline">38123252</pub-id>
          <pub-id pub-id-type="pii">S2589-7500(23)00225-X</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sandmann</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Riepenhausen</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Plagwitz</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Varghese</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Systematic analysis of ChatGPT, Google search and Llama 2 for clinical decision support tasks</article-title>
          <source>Nat Commun</source>
          <year>2024</year>
          <volume>15</volume>
          <issue>1</issue>
          <fpage>2050</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41467-024-46411-8"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41467-024-46411-8</pub-id>
          <pub-id pub-id-type="medline">38448475</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41467-024-46411-8</pub-id>
          <pub-id pub-id-type="pmcid">PMC10917796</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hou</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Sabharwal</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Fan</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Yan</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>Comparison of RN licensure examination: China and the United States</article-title>
          <source>Int J Nurs Sci</source>
          <year>2019</year>
          <volume>6</volume>
          <issue>1</issue>
          <fpage>111</fpage>
          <lpage>116</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S2352-0132(18)30022-X"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.ijnss.2018.11.002</pub-id>
          <pub-id pub-id-type="medline">31406876</pub-id>
          <pub-id pub-id-type="pii">S2352-0132(18)30022-X</pub-id>
          <pub-id pub-id-type="pmcid">PMC6608795</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nusinovici</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Tham</surname>
              <given-names>YC</given-names>
            </name>
            <name name-style="western">
              <surname>Chak Yan</surname>
              <given-names>MY</given-names>
            </name>
            <name name-style="western">
              <surname>Wei Ting</surname>
              <given-names>DS</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Sabanayagam</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Wong</surname>
              <given-names>TY</given-names>
            </name>
            <name name-style="western">
              <surname>Cheng</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Logistic regression was as good as machine learning for predicting major chronic diseases</article-title>
          <source>J Clin Epidemiol</source>
          <year>2020</year>
          <volume>122</volume>
          <fpage>56</fpage>
          <lpage>69</lpage>
          <pub-id pub-id-type="doi">10.1016/j.jclinepi.2020.03.002</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Suthaharan</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Support vector machine</article-title>
          <source>Machine Learning Models and Algorithms for Big Data Classification: Thinking With Examples for Effective Learning</source>
          <year>2016</year>
          <publisher-loc>New York, NY</publisher-loc>
          <publisher-name>Springer</publisher-name>
          <fpage>207</fpage>
          <lpage>235</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Deng</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>GB</given-names>
            </name>
          </person-group>
          <article-title>Extreme learning machine for multilayer perceptron</article-title>
          <source>IEEE Trans Neural Netw Learn Syst</source>
          <year>2016</year>
          <volume>27</volume>
          <issue>4</issue>
          <fpage>809</fpage>
          <lpage>821</lpage>
          <pub-id pub-id-type="doi">10.1109/TNNLS.2015.2424995</pub-id>
          <pub-id pub-id-type="medline">25966483</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Samet</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>K-nearest neighbor finding using MaxNearestDist</article-title>
          <source>IEEE Trans Pattern Anal Mach Intell</source>
          <year>2008</year>
          <volume>30</volume>
          <issue>2</issue>
          <fpage>243</fpage>
          <lpage>252</lpage>
          <pub-id pub-id-type="doi">10.1109/TPAMI.2007.1182</pub-id>
          <pub-id pub-id-type="medline">18084056</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Segal</surname>
              <given-names>MR</given-names>
            </name>
          </person-group>
          <source>Machine learning benchmarks and random forest regression</source>
          <year>2004</year>
          <access-date>2004-04-14</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://escholarship.org/uc/item/35x3v9t4">https://escholarship.org/uc/item/35x3v9t4</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shehadeh</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Alshboul</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Al Mamlook</surname>
              <given-names>RE</given-names>
            </name>
            <name name-style="western">
              <surname>Hamedat</surname>
              <given-names>O</given-names>
            </name>
          </person-group>
          <article-title>Machine learning models for predicting the residual value of heavy construction equipment: an evaluation of modified decision tree, LightGBM, and XGBoost regression</article-title>
          <source>Automation Constr</source>
          <year>2021</year>
          <volume>129</volume>
          <fpage>103827</fpage>
          <pub-id pub-id-type="doi">10.1016/j.autcon.2021.103827</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>PB</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>ZX</given-names>
            </name>
          </person-group>
          <article-title>A novel AdaBoost framework with robust threshold and structural optimization</article-title>
          <source>IEEE Trans Cybern</source>
          <year>2018</year>
          <volume>48</volume>
          <issue>1</issue>
          <fpage>64</fpage>
          <lpage>76</lpage>
          <pub-id pub-id-type="doi">10.1109/TCYB.2016.2623900</pub-id>
          <pub-id pub-id-type="medline">27898387</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Guestrin</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>XGBoost: a scalable tree boosting system</article-title>
          <year>2016</year>
          <conf-name>Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining</conf-name>
          <conf-date>2016 August 17</conf-date>
          <conf-loc>San Francisco, CA</conf-loc>
          <pub-id pub-id-type="doi">10.1145/2939672.2939785</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hancock</surname>
              <given-names>JT</given-names>
            </name>
            <name name-style="western">
              <surname>Khoshgoftaar</surname>
              <given-names>TM</given-names>
            </name>
          </person-group>
          <article-title>CatBoost for big data: an interdisciplinary review</article-title>
          <source>J Big Data</source>
          <year>2020</year>
          <volume>7</volume>
          <issue>1</issue>
          <fpage>94</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/33169094"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s40537-020-00369-8</pub-id>
          <pub-id pub-id-type="medline">33169094</pub-id>
          <pub-id pub-id-type="pii">369</pub-id>
          <pub-id pub-id-type="pmcid">PMC7610170</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Li</surname>
              <given-names>DJ</given-names>
            </name>
            <name name-style="western">
              <surname>Kao</surname>
              <given-names>YC</given-names>
            </name>
            <name name-style="western">
              <surname>Tsai</surname>
              <given-names>SJ</given-names>
            </name>
            <name name-style="western">
              <surname>Bai</surname>
              <given-names>YM</given-names>
            </name>
            <name name-style="western">
              <surname>Yeh</surname>
              <given-names>TC</given-names>
            </name>
            <name name-style="western">
              <surname>Chu</surname>
              <given-names>CS</given-names>
            </name>
            <name name-style="western">
              <surname>Hsu</surname>
              <given-names>CW</given-names>
            </name>
            <name name-style="western">
              <surname>Cheng</surname>
              <given-names>SW</given-names>
            </name>
            <name name-style="western">
              <surname>Hsu</surname>
              <given-names>TW</given-names>
            </name>
            <name name-style="western">
              <surname>Liang</surname>
              <given-names>CS</given-names>
            </name>
            <name name-style="western">
              <surname>Su</surname>
              <given-names>KP</given-names>
            </name>
          </person-group>
          <article-title>Comparing the performance of ChatGPT GPT-4, Bard, and Llama-2 in the Taiwan Psychiatric Licensing Examination and in differential diagnosis with multi-center psychiatrists</article-title>
          <source>Psychiatry Clin Neurosci</source>
          <year>2024</year>
          <volume>78</volume>
          <issue>6</issue>
          <fpage>347</fpage>
          <lpage>352</lpage>
          <pub-id pub-id-type="doi">10.1111/pcn.13656</pub-id>
          <pub-id pub-id-type="medline">38404249</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Brin</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Sorin</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Vaid</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Soroush</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Glicksberg</surname>
              <given-names>BS</given-names>
            </name>
            <name name-style="western">
              <surname>Charney</surname>
              <given-names>AW</given-names>
            </name>
            <name name-style="western">
              <surname>Nadkarni</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Klang</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Comparing ChatGPT and GPT-4 performance in USMLE soft skill assessments</article-title>
          <source>Sci Rep</source>
          <year>2023</year>
          <volume>13</volume>
          <issue>1</issue>
          <fpage>16492</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41598-023-43436-9"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41598-023-43436-9</pub-id>
          <pub-id pub-id-type="medline">37779171</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41598-023-43436-9</pub-id>
          <pub-id pub-id-type="pmcid">PMC10543445</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sato</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Ogasawara</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT (GPT-4) passed the Japanese national license examination for pharmacists in 2022, answering all items including those with diagrams: a descriptive study</article-title>
          <source>J Educ Eval Health Prof</source>
          <year>2024</year>
          <volume>21</volume>
          <fpage>4</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/38413129"/>
          </comment>
          <pub-id pub-id-type="doi">10.3352/jeehp.2024.21.4</pub-id>
          <pub-id pub-id-type="medline">38413129</pub-id>
          <pub-id pub-id-type="pii">jeehp.2024.21.4</pub-id>
          <pub-id pub-id-type="pmcid">PMC10948916</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
