<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id><journal-id journal-id-type="publisher-id">medinform</journal-id><journal-id journal-id-type="index">7</journal-id><journal-title>JMIR Medical Informatics</journal-title><abbrev-journal-title>JMIR Med Inform</abbrev-journal-title><issn pub-type="epub">2291-9694</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v13i1e73941</article-id><article-id pub-id-type="doi">10.2196/73941</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Application of Large Language Models in Complex Clinical Cases: Cross-Sectional Evaluation Study</article-title></title-group><contrib-group><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Huang</surname><given-names>Yuanheng</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Yang</surname><given-names>Guozhen</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Shen</surname><given-names>Yahui</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Chen</surname><given-names>Huiguo</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Wu</surname><given-names>Weibin</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Li</surname><given-names>Xiaojun</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Wu</surname><given-names>Yonghui</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Zhang</surname><given-names>Kai</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Xu</surname><given-names>Jiannan</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Zhang</surname><given-names>Jian</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Cardiothoracic Surgery, Third Affiliated Hospital of Sun Yat-sen University</institution><addr-line>2693 Kaichuang Avenue, Huangpu District</addr-line><addr-line>Guangzhou</addr-line><country>China</country></aff><aff id="aff2"><institution>Department of Gynecologic Oncology, Shaanxi Provincial Cancer Hospital</institution><addr-line>Xi'an</addr-line><country>China</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Benis</surname><given-names>Arriel</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Adeoye</surname><given-names>Adekunle</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Jin</surname><given-names>Li Yuan</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to  Jian Zhang, MD, Department of Cardiothoracic Surgery, Third Affiliated Hospital of Sun Yat-sen University, 2693 Kaichuang Avenue, Huangpu District, Guangzhou, 510000, China, 86 13922192727, 86 82179042; <email>sumszhangjian@163.com</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>these authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>14</day><month>8</month><year>2025</year></pub-date><volume>13</volume><elocation-id>e73941</elocation-id><history><date date-type="received"><day>14</day><month>03</month><year>2025</year></date><date date-type="rev-recd"><day>30</day><month>05</month><year>2025</year></date><date date-type="accepted"><day>10</day><month>06</month><year>2025</year></date></history><copyright-statement>&#x00A9; Yuanheng Huang, Guozhen Yang, Yahui Shen, Huiguo Chen, Weibin Wu, Xiaojun Li, Yonghui Wu, Kai Zhang, Jiannan Xu, Jian Zhang. Originally published in JMIR Medical Informatics (<ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org">https://medinform.jmir.org</ext-link>), 14.8.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org/">https://medinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://medinform.jmir.org/2025/1/e73941"/><abstract><sec><title>Background</title><p>Large language models (LLMs) have made significant advancements in natural language processing (NLP) and are gradually showing potential for application in the medical field. However, LLMs still face challenges in medicine.</p></sec><sec><title>Objective</title><p>This study aims to evaluate the efficiency, accuracy, and cost of LLMs in handling complex medical cases and to assess their potential and applicability as tools for clinical decision support.</p></sec><sec sec-type="methods"><title>Methods</title><p>We selected cases from the database of the Department of Cardiothoracic Surgery, the Third Affiliated Hospital of Sun Yat-sen University (2021&#x2010;2024), and conducted a multidimensional preliminary evaluation of the latest LLMs in clinical decision-making for complex cases. The evaluation included measuring the time taken for the LLMs to generate decision recommendations, Likert scores, and calculating decision costs to assess the execution efficiency, accuracy, and cost-effectiveness of the models.</p></sec><sec sec-type="results"><title>Results</title><p>A total of 80 complex cases were included in this study, and the performance of multiple LLMs in clinical decision-making was evaluated. Experts required 33.60 minutes on average (95% CI 32.57&#x2010;34.63), far longer than any LLM. GPTo1 (0.71, 95% CI 0.67&#x2010;0.74), GPT4o (0.88, 95% CI 0.83&#x2010;0.92), and Deepseek (0.94, 95% CI 0.90&#x2010;0.96) all finished under a minute without statistical differences. Although Kimi, Gemini, LLaMa3-8B, and LLaMa3-70B took 1.02&#x2010;3.20 minutes, they were still faster than experts. In terms of decision accuracy, Deepseek-R1 had the highest accuracy (mean Likert score=4.19), with no significant difference compared to GPTo1 (<italic>P</italic>=.699), and both performed significantly better than GPT4o, Kimi, Gemini, LLaMa3-70B, and LLaMa3-8B (<italic>P</italic>&#x003C;.001). Deepseek-R1 and GPTo1 demonstrated the lowest hallucination rates&#x2014;6/80 (8%) and 5/80 (6%), respectively&#x2014;significantly outperforming GPT-4o (7/80, 9%), Kimi (10/80, 12%), and the Gemini and LLaMa3 models, which exhibited substantially higher rates ranging from 13/80 (16%) to 25/80 (31%). Regarding decision costs, all LLMs showed significantly lower costs than the Multidisciplinary Team<bold>,</bold> with open-source models such as Deepseek-R1 offering a zero direct cost advantage.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>GPTo1 and Deepseek-R1 show strong clinical potential, boosting efficiency, maintaining accuracy, and reducing costs. GPT4o and Kimi performed moderately, indicating suitability for broader clinical tasks. Further research is needed to validate LLaMa3 series and Gemini in clinical decision.</p></sec></abstract><kwd-group><kwd>large language models</kwd><kwd>artificial intelligence</kwd><kwd>clinical decision support</kwd><kwd>complex medical cases</kwd><kwd>cross-sectional studies</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><sec id="s1-1"><title>Background</title><p>With the development of artificial intelligence (AI) and deep learning technologies, large language models (LLMs) have demonstrated remarkable potential across various fields, particularly in natural language processing (NLP) tasks such as summarization, paraphrasing, generating new text content, and writing program code [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. Additionally, LLMs can act as personal assistants, helping users answer a wide range of questions.</p><p>The primary goal of LLMs was not initially to serve the medical field, but some studies have already shown that LLMs have significant potential in medicine [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>]. For example, research has found that ChatGPT can pass the United States Medical Licensing Examination (USMLE) [<xref ref-type="bibr" rid="ref5">5</xref>] and the Advanced Cardiovascular Life Support (ACLS) exam [<xref ref-type="bibr" rid="ref6">6</xref>]. In addition to exam simulations, ChatGPT&#x2019;s potential advantages in daily medical applications have also been confirmed, such as extracting information from electronic health records and assisting with literature searches [<xref ref-type="bibr" rid="ref7">7</xref>]. However, the training knowledge base of LLMs may have issues such as incompleteness, information bias, or generation of misleading information, and the application of general-purpose LLMs in medicine has not yet undergone large-scale clinical studies. Therefore, it is essential to explore the application of LLMs in medicine in greater depth.</p><p>Previous studies have verified the diagnostic capabilities of GPT-3.5, GPT-4, and other LLMs [<xref ref-type="bibr" rid="ref8">8</xref>-<xref ref-type="bibr" rid="ref12">12</xref>], such as its ability to generate detailed differential diagnosis lists for common clinical cases. Additionally, research teams tested the GPT-4 ability to make accurate diagnoses from medical records [<xref ref-type="bibr" rid="ref9">9</xref>], and the results showed that the generative AI GPT-4 chose the correct diagnosis as the primary diagnosis in nearly 40% of cases and provided the correct potential diagnosis in 64% of challenging cases. However, complex cases are common in hospitals, especially in cardiothoracic surgery. Due to population aging and insufficient medical resources, providing timely and high-quality clinical decision support has become a major challenge for health care systems worldwide. Delivering accurate and effective clinical decisions for complex cases typically requires substantial human, time, and money. Although some studies are currently exploring the application of LLMs in medicine, a recent review [<xref ref-type="bibr" rid="ref13">13</xref>] published in a <italic>Journal of the American Medical Association</italic> subjournal of 519 studies assessing LLMs in the medical field found that only 5% of studies used real-world data. This exposes the limitations of current research and the potential disconnect from clinical reality. In this context, whether LLMs can provide effective clinical decision support for doctors in complex cases still requires further investigation.</p></sec><sec id="s1-2"><title>Objectives</title><p>Therefore, this study aimed to preliminarily evaluate the utility of LLMs in complex medical decision-making by assessing their efficiency, accuracy, and cost in real-world clinical cases, and exploring their practical potential in clinical decision support.</p></sec></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Study Design and Data Source</title><p>To ensure that the study is based on representative and unbiased real-world cases, we screened a total of 140 cases from the complex case database of the Department of Cardiothoracic Surgery of the Third Affiliated Hospital of Sun Yat-sen University (2021&#x2010;2024). These cases are sourced from the complex case database of a tertiary hospital, which is not publicly available, ensuring the accuracy of the study to the greatest extent and preventing any generic LLMs from being exposed to relevant data during their training, thus ensuring the objectivity and fairness of the evaluation. Most of the cases have a consistent structure, including medical history, current symptoms, examination methods and findings, diagnosis, and clinical decision. All cases were reviewed by an experienced physician, and only those that met the following criteria were included in the further analysis: the clinical case data records the entire diagnostic and treatment process, complex clinical case, and the cases are not duplicates. In this study, a complex case was defined as a patient scenario that disease involved at least 2 organ systems, required multidisciplinary input for diagnosis or treatment, and presented with conflicting or uncertain therapeutic pathways. These criteria were established by 2 senior cardiothoracic specialists.</p></sec><sec id="s2-2"><title>LLMs Evaluation</title><p>To evaluate the recently launched LLMs, we selected the following models for benchmarking: Deepseek-R1, GPTo1, GPT-4o, Kimi, Gemini, LLaMA3-70B, and LLaMA3-8B. Model setup and prompting strategy details are provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. The reviewers included 2 board-certified cardiothoracic surgeons, each with more than 10 years of clinical experience and active participation in weekly Multidisciplinary Team (MDT) discussions. Reviewers were blinded to the identity of the LLM models generating each response to minimize bias.</p><sec id="s2-2-1"><title>Clinical Decision Efficiency Evaluation</title><p>We evaluate the execution efficiency of Deepseek-R1, GPTo1, GPT4o, Kimi, Gemini, LLaMA3-70B, and LLaMA3-8B in clinical decision-making tasks. This is done by recording the time taken by each model from receiving the instructions to generating a complete decision recommendation and comparing it with the time taken by several experts to complete the same task.</p></sec><sec id="s2-2-2"><title>Clinical Decision Accuracy Evaluation</title><p>Two independent clinical experts were invited to evaluate the clinical decision outputs generated by Deepseek-R1, GPTo1, GPT-4o, Kimi, Gemini, LLaMA3-70B, and LLaMA3-8B. The evaluations were based on the consistency with expert decisions, using a 5-point modified Likert scale. The scoring standards and the scale are presented in <xref ref-type="table" rid="table1">Table 1</xref> [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref15">15</xref>].</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Scoring standards for the 5-point modified Likert scale.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Likert score</td><td align="left" valign="bottom">Relevance of decisions</td><td align="left" valign="bottom">Redundancy of decisions</td></tr></thead><tbody><tr><td align="char" char="." valign="top">1</td><td align="left" valign="top">Most of all relevant decisions were not mentioned.</td><td align="left" valign="top">All or most decisions were redundant or unjustified.</td></tr><tr><td align="char" char="." valign="top">2</td><td align="left" valign="top">Some or many relevant decisions were not mentioned.</td><td align="left" valign="top">Some decisions were redundant or unjustified.</td></tr><tr><td align="char" char="." valign="top">3</td><td align="left" valign="top">Most relevant decisions were mentioned.</td><td align="left" valign="top">Some decisions were redundant or unjustified.</td></tr><tr><td align="char" char="." valign="top">4</td><td align="left" valign="top">Most relevant decisions were mentioned.</td><td align="left" valign="top">Few decisions were redundant or unjustified.</td></tr><tr><td align="char" char="." valign="top">5</td><td align="left" valign="top">All relevant decisions were mentioned.</td><td align="left" valign="top">No redundant or unjustified decisions.</td></tr></tbody></table></table-wrap></sec></sec><sec id="s2-3"><title>Hallucination Evaluation</title><p>Each LLM output was reviewed by two independent clinical experts to identify statements that were factually incorrect, clinically implausible, or not supported by the case information. A hallucination was defined as a recommendation or rationale inconsistent with current clinical guidelines or contradicting the provided case data. The hallucination rate was calculated as the proportion of cases where at least one hallucinated item was present in the model&#x2019;s response.</p><p>Each expert independently scored the output of each model, and the final score was the average of the two experts&#x2019; ratings. To ensure scoring consistency, we calculated the inter-rater reliability (Cohen kappa coefficient&#x2014;a statistic that measures inter-rater agreement for categorical items while accounting for chance agreement) [<xref ref-type="bibr" rid="ref16">16</xref>] to validate the reliability of the evaluation.</p></sec><sec id="s2-4"><title>Clinical Decision Cost Evaluation</title><p>The usage costs of each LLM were calculated and compared with the costs of the MDT. The MDT costs were calculated based on the hospital&#x2019;s MDT fee.</p></sec><sec id="s2-5"><title>Statistical Analysis</title><p>Patient baseline characteristics were described using frequencies and percentages for categorical variables, and medians or means for continuous variables. For continuous variables with approximate normal distribution, Student <italic>t</italic> test (2-tailed) was used for comparisons; for nonnormally distributed continuous variables, the Mann-Whitney <italic>U</italic> test was used for comparisons between two groups, and the Kruskal-Wallis test was used for comparisons between three or more independent samples. For categorical data, Fisher exact test, chi-square test, or Wilcoxon signed-rank test was used. All statistical tests were 2-tailed, and a <italic>P</italic> value&#x003C;.05 was considered statistically significant. Statistical analysis and plotting were performed using Python 3.13.1, with libraries including scipy.stats, pandas, seaborn, and matplotlib.</p></sec><sec id="s2-6"><title>Ethical Considerations</title><p>This study was approved by the ethics committee of the Third Affiliated Hospital of Sun Yat-sen University (approval ID II2025-257-03; approval date March 2025). Written informed consent was obtained from all participants. To protect privacy, personal identifiers were removed or coded before analysis, and all data were stored on an encrypted, password-protected server accessible only to the research team. Participants did not receive any financial or material compensation. All procedures involving human participants were conducted in accordance with the Declaration of Helsinki and relevant institutional guidelines.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Baseline Characteristics</title><p>A total of 140 patients from the Cardiothoracic Surgery Department of the Third Affiliated Hospital of Sun Yat-sen University were initially included in the study. After excluding 60 patients who did not meet the inclusion criteria, 80 patients remained. The median age of the study population was 60 (IQR 14&#x2010;79) years, with 58 (72%) males and 22 (28%) females. The most common complex conditions were thoracic tumors combined with respiratory system disorders (25/80 cases, 31%), followed by thoracic tumors combined with circulatory system disorders (19/80 cases, 24%). As shown in <xref ref-type="table" rid="table2">Table 2</xref>, other conditions accounted for the remaining 45% (36/80 cases), and the median number of experts involved in the MDT was 7 (range 5&#x2010;9). Table S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> illustrates 2 representative complex clinical cases, including the final expert consensus from MDT discussions and the corresponding responses generated by LLMs.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Baseline characteristics of the study cohort.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Characteristics</td><td align="left" valign="bottom">Values (N=80)</td></tr></thead><tbody><tr><td align="left" valign="top">Age (years), median (range)</td><td align="left" valign="top">60 (14&#x2010;79)</td></tr><tr><td align="left" valign="top">Sex, n (%)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Male</td><td align="left" valign="top">58 (72)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Female</td><td align="left" valign="top">22 (28)</td></tr><tr><td align="left" valign="top">Complex conditions, n (%)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Thoracic tumors+ respiratory disorders</td><td align="left" valign="top">25 (31)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Thoracic tumors+ circulatory disorders</td><td align="left" valign="top">19 (24)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Other conditions</td><td align="left" valign="top">36 (45)</td></tr><tr><td align="left" valign="top">Number of experts in MDT<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup>, median (range)</td><td align="left" valign="top">7 (5-9)</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>MDT: Multidisciplinary Team.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-2"><title>Clinical Decision Efficiency Evaluation</title><p>The data analysis revealed that the average decision-making time for the expert group was significantly longer than for all LLMs (<italic>P</italic>&#x003C;.001), with a mean of 33.6 (95% CI 32.57&#x2010;34.63) minutes. In contrast, the decision-making time for LLMs was notably shorter. GPTo1 demonstrated the best decision-making efficiency, with an average time of only 0.71 (95% CI 0.67&#x2010;0.74) minutes, followed by GPT4o (0.88, 95% CI 0.83&#x2010;0.92 minutes) and Deepseek (0.94, 95% CI 0.90&#x2010;0.96 minutes), all of which completed decisions in under 1 minute. Kimi, Gemini, and LLaMa3-8B also exhibited relatively fast decision-making abilities (1.02&#x2010;1.18 minutes), although slightly slower than GPTo1 and GPT4o. LLaMa3-70B had the longest decision time at 3.20 (95% CI 3.04&#x2010;3.37) minutes, but it was still significantly better than the expert group&#x2019;s 33.6 minutes. These results indicate that AI-driven LLMs significantly enhance the efficiency of clinical decision-making, as illustrated in <xref ref-type="fig" rid="figure1">Figure 1</xref>.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Clinical decision efficiency comparison between experts and large language models. LLM: large language model.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e73941_fig01.png"/></fig></sec><sec id="s3-3"><title>Clinical Decision Accuracy Evaluation</title><p>According to the classification standards of Landis et al [<xref ref-type="bibr" rid="ref16">16</xref>], which are based on Cohen kappa coefficient, the results indicate strong agreement between two senior independent clinical experts, with kappa values ranging from 0.66 to 0.85. This suggests a high level of consistency in model evaluation. Detailed results are presented in <xref ref-type="fig" rid="figure2">Figure 2</xref>.</p><p>As shown in <xref ref-type="fig" rid="figure3">Figure 3</xref>, pairwise comparisons in clinical decision accuracy assessment revealed that Deepseek-R1 achieved the highest accuracy, with a mean Likert score of 4.19 (95% CI 4.02&#x2010;4.35), significantly outperforming GPT4o, Kimi, Gemini, LLaMA3-70B, and LLaMA3-8B (all <italic>P</italic>&#x003C;.001). However, no significant difference was found between Deepseek-R1 and GPTo1 (mean 4.15, 95% CI 3.99&#x2010;4.31; <italic>P</italic>=.70). GPTo1 performed comparably to Deepseek-R1 and significantly better than GPT4o (<italic>P</italic>=.003), Kimi, Gemini, LLaMa3-70B, and LLaMa3-8B (all <italic>P</italic>&#x003C;.001). Both Deepseek-R1 and GPTo1 exhibited the highest Likert scores, approaching expert-level performance, indicating suitability for clinical decision tasks.</p><p>GPT-4o achieved moderate performance with a mean Likert score of 3.76 (95% CI 3.57&#x2010;3.95) but still significantly outperformed Kimi (<italic>P</italic>=.02), Gemini, LLaMa3-70B, and LLaMa3-8B (all <italic>P</italic>&#x003C;.001), suggesting adequate feasibility for clinical decision-making. Kimi (mean 3.48, 95% CI 3.31&#x2010;3.64) had higher scores compared to Gemini, LLaMa3-70B, and LLaMa3-8B (all <italic>P</italic>&#x003C;.001), although it demonstrated significantly lower decision-making ability than Deepseek-R1, GPTo1, and GPT4o.</p><p>Gemini (mean 2.96, 95% CI 2.78&#x2010;3.14) showed marginally better performance compared to LLaMa3-70B (mean 2.86, 95% CI 2.71&#x2010;3.02; <italic>P</italic>=.047), but no significant difference was observed between Gemini and LLaMa3-8B (mean 2.7, 95% CI 2.54&#x2010;2.86; <italic>P</italic>=.52). The LLaMa3 series consistently showed the lowest clinical decision accuracy, significantly below all other models (all <italic>P</italic>&#x003C;.001).</p><p>We further assessed the hallucination rates of LLMs and found that Deepseek-R1 and GPTo1 had the fewest hallucinations, with 6/80 (8%) and 5/80 (6%) cases, respectively. GPT-4o and Kimi followed, with 7/80 (9%) and 10/80 (12%) cases, respectively. In contrast, Gemini and both LLaMa3 variants demonstrated significantly higher hallucination rates, ranging from 13/80 (16%) to 25/80 (31%) (<xref ref-type="fig" rid="figure4">Figure 4</xref>.</p><p>These findings demonstrate that Deepseek-R1 and GPTo1 not only achieved the highest Likert scores but also exhibited the lowest hallucination rates, underscoring their strong potential as AI-assisted tools for clinical decision-making. GPT-4o exhibited moderate yet consistent performance, supporting its applicability to more complex clinical scenarios. Kimi demonstrated intermediate performance, potentially useful in selected clinical situations but requiring further optimization. Conversely, Gemini, LLaMa3-70B, and LLaMa3-8B showed relatively poor performance, limiting their immediate clinical utility and highlighting the need for substantial improvement.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Cohen Kappa agreement between 2 senior clinical experts. LLMs: large language models.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e73941_fig02.png"/></fig><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Clinical decision accuracy of large language models based on a 5-point Likert Scale.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e73941_fig03.png"/></fig><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Comparison of hallucination rates among different large language models.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e73941_fig04.png"/></fig></sec><sec id="s3-4"><title>Clinical Decision Cost Evaluation</title><p>In this study, we compared clinical decision-making costs between MDT discussions and various LLMs (<xref ref-type="fig" rid="figure5">Figure 5</xref>). Our analysis revealed that the mean cost per MDT discussion conducted by experts was approximately 1000 Chinese Yuan Renminbi (about US $140), significantly higher than the decision-making costs associated with all evaluated LLMs (<italic>P</italic>&#x003C;.001).</p><p>Within the LLM group, GPT4o and GPTo1 incurred direct costs of approximately 150 Chinese Yuan Renminbi (about US $20 for the Plus version), representing an 85% cost reduction compared to expert-led MDT discussions, thus demonstrating significant economic advantages. Additionally, Kimi and Gemini are proprietary models currently available without charge, whereas Deepseek-R1, LLaMa3-8B, and LLaMa3-70B are open-source models suitable for clinical decision support. These open-source LLMs potentially incur zero direct costs.</p><fig position="float" id="figure5"><label>Figure 5.</label><caption><p>Cost comparison of clinical decision-making: experts versus large language models. CNY: Chinese Yuan Renminbi; LLM: large language model.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e73941_fig05.png"/></fig></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>This study provides a preliminary assessment demonstrating that LLMs can significantly enhance the efficiency of clinical decision-making in complex clinical cases, while maintaining high decision accuracy and substantially reducing costs compared to traditional MDT discussions.</p></sec><sec id="s4-2"><title>Performance and Potential Clinical Utility of LLMs</title><p>Our study results demonstrate that, compared to human experts, all LLMs significantly shortened decision-making time. The expert group had an average decision time of 33.6 (95% CI 32.57&#x2010;34.63) minutes per case and involved a median of 7 clinical experts in the MDT discussion, while the most efficient model, GPTo1, took only 0.71 minutes, followed by GPT4o (0.88 minutes) and Deepseek (0.94 minutes). Even the longest decision time, observed in LLaMa3-70B, was only 3.20 minutes (which may be related to the insufficient GPU memory of the workstation in this study), still much faster than the expert group. Previous studies [<xref ref-type="bibr" rid="ref17">17</xref>] also suggest that AI, through automating parts of the decision process, maintains accuracy in 88% of cases and provides quick, real-time feedback, thus improving efficiency and saving significant time and resources. This result highlights the potential application value of AI-assisted decision tools in high-demand clinical environments, especially in scenarios requiring rapid decision-making. Furthermore, the time differences among LLMs indicate that optimizing model architecture and inference time is critical to enhancing the real-time application capabilities of clinical decision-making.</p><p>In addition to improving decision efficiency, the reliability and accuracy of LLMs in clinical applications are equally crucial. This study found that Deepseek-R1 and GPTo1 performed the best in clinical decision accuracy, with decision levels comparable to that of human experts. However, there were still significant performance differences among the models. GPT-4o and Kimi showed moderate performance, suggesting that they may be suitable for general clinical tasks. For example, the accuracy of Gemini, LLaMa3-70B, and LLaMa3-8B was significantly lower, indicating limited applicability in critical clinical decision-making. Notably, Deepseek-R1 and GPTo1 demonstrated the lowest hallucination rates (8% and 6%, respectively), showing significantly better performance compared to GPT-4o (9%), Kimi (12%), and the Gemini and LLaMa3 models, whose rates ranged from 16% to 31%. These results underscore significant variability in content reliability among LLMs, aligning with prior studies [<xref ref-type="bibr" rid="ref18">18</xref>] that reported hallucination rates ranging from 29% to 91% across different models. Interestingly, reasoning-optimized models such as Deepseek-R1 and GPTo1 showed a marked reduction in hallucinations, suggesting they may be more suitable for high-stakes medical applications where factual accuracy is critical. Beyond quantitative metrics, our study found qualitative differences in the reasoning styles of different LLMs. For example, Deepseek-R1 and GPTo1 tended to follow structured, guideline-concordant approaches, often closely mirroring the logic used by human MDT experts. GPT-4o and Kimi occasionally generated broader differential diagnoses. In contrast, Gemini, LLaMa3-70B, and LLaMa3-8B responses included more redundant or loosely justified recommendations. These observations are preliminary and illustrative. A more comprehensive and systematic comparison of LLM reasoning will be conducted in future studies. These findings reaffirm that LLMs require rigorous validation before being applied to clinical decision-making. Our findings are consistent with recent studies by other researchers [<xref ref-type="bibr" rid="ref11">11</xref>], which suggest that LLMs can serve as auxiliary tools to provide reference advice to clinical medical experts, rather than completely replacing human experts.</p><p>Currently, MDT discussions are still in their early stages in China [<xref ref-type="bibr" rid="ref19">19</xref>]. High costs remain one of the barriers to their widespread application. Studies show that the average cost of each MDT discussion by experts is approximately 1000 Chinese Yuan Renminbi (about US $140) in a tier 3 grade A hospital in China, while the decision cost for GPT4o and GPTo1 is only around 150 Chinese yuan renminbi (Plus subscription version US $19.99), representing an 85% cost reduction. Moreover, Kimi and Gemini, as closed-source models, are currently available for free, while Deepseek-R1, LLaMa3-8B, and LLaMa3-70B are open-source models with the potential advantage of zero direct costs. This makes open-source LLMs more promising in environments with limited health care resources. Although LLMs show significant advantages in terms of economic efficiency, their clinical application still needs to consider indirect costs, such as equipment costs, model fine-tuning, and ethical oversight.</p></sec><sec id="s4-3"><title>Risks and Deployment Strategies</title><p>It is also important to address privacy and security concerns when deploying LLMs in health care settings. In this study, some open-source models were deployed locally in a closed-loop environment without internet access or external application programming interface calls, effectively minimizing the risk of patient data leakage. In contrast, proprietary models such as GPT-4o, Kimi, and Gemini required data to be submitted via HTTPS after anonymization, which, despite encryption, still introduces potential risks of data exposure during transmission. For future clinical deployment, it is essential to implement privacy-preserving strategies such as on-premise inference, secure application programming interface gateways, and strict data de-identification.</p><p>While LLMs can significantly improve decision efficiency, maintain high diagnostic accuracy, and drastically reduce costs, some experts remain concerned about the potential misuse of LLMs and the lack of supervision [<xref ref-type="bibr" rid="ref20">20</xref>]. At present, LLMs still have some imperfections, such as biases and hallucinations [<xref ref-type="bibr" rid="ref21">21</xref>], which may be related to the Transformer architecture itself [<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref23">23</xref>]. To address these challenges, several mitigation strategies should be considered, including deploying LLMs under expert supervision, conducting fine-tuning based on medical-specific datasets, and incorporating human-in-the-loop mechanisms to ensure clinical relevance and safety. Establishing robust ethical and governance frameworks is also imperative to support responsible and transparent deployment. In the short term, continued involvement of human experts remains critical, and the most pragmatic approach is a collaborative model in which LLMs assist clinicians rather than operate independently or autonomously.</p></sec><sec id="s4-4"><title>Limitations</title><p>This study also has some limitations. First, our evaluation focused on LLMs&#x2019; execution efficiency, decision accuracy, and costs, without in-depth exploration of the models&#x2019; limitations, interpretability, and ethical implications. Future studies will adopt more comprehensive evaluation frameworks, such as Transparent Reporting of a multivariable prediction model for Individual Prognosis or Diagnosis-LLM extension [<xref ref-type="bibr" rid="ref24">24</xref>], to incorporate additional dimensions including reproducibility, bias, and potential harm. Second, the data used in this study were obtained from a single institution, necessitating further external validation to assess the generalizability of LLMs across diverse health care settings. Third, we did not explore the integration of LLMs into hospital information systems in this study. While LLMs demonstrate significant transformative potential in clinical decision-making, further clinical trials and real-world validation are needed before their formal adoption in practice.</p></sec><sec id="s4-5"><title>Conclusions</title><p>The study indicates that LLMs, particularly GPTo1 and Deepseek-R1, have immense potential in clinical decision-making, significantly improving efficiency, maintaining high diagnostic accuracy, and reducing costs. These models can serve as powerful auxiliary decision-making tools. GPT-4o and Kimi demonstrated moderate performance, suggesting that they may be suitable for general clinical tasks. However, the application of the LLaMa3 series and Gemini in clinical decision-making requires further investigation.</p></sec></sec></body><back><ack><p>This study was supported by the National Natural Science Foundation of China (grant number 82373186). We would like to express our deepest gratitude to all those who have provided their support and assistance in the completion of this project.</p></ack><notes><sec><title>Data Availability</title><p>The datasets generated or analyzed during this study are available from the corresponding author on reasonable request.</p></sec></notes><fn-group><fn fn-type="con"><p>YHH conceived the study. YHH, GZY, and YHS collected clinical data and analyzed clinical data, and wrote articles. XJL, YHW, KZ, JNX, HGC, and JZ participated in clinical data collection and large language model clinical decision evaluation.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">ACLS</term><def><p>Advanced Cardiovascular Life Support</p></def></def-item><def-item><term id="abb2">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb3">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb4">MDT</term><def><p>Multidisciplinary Team</p></def></def-item><def-item><term id="abb5">USMLE</term><def><p>United States Medical Licensing Examination</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Deng</surname><given-names>J</given-names> </name><name name-style="western"><surname>Lin</surname><given-names>Y</given-names> </name></person-group><article-title>The benefits and challenges of ChatGPT: an overview</article-title><source>FCIS</source><year>2023</year><volume>2</volume><issue>2</issue><fpage>81</fpage><lpage>83</lpage><pub-id pub-id-type="doi">10.54097/fcis.v2i2.4465</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Surameery</surname><given-names>NMS</given-names> </name><name name-style="western"><surname>Shakor</surname><given-names>MY</given-names> </name></person-group><article-title>Use Chat GPT to solve programming bugs</article-title><source>IJITC</source><year>2023</year><issue>31</issue><fpage>17</fpage><lpage>22</lpage><comment><ext-link ext-link-type="uri" xlink:href="http://journal.hmjournals.com/index.php/IJITC/issue/view/216">http://journal.hmjournals.com/index.php/IJITC/issue/view/216</ext-link></comment><pub-id pub-id-type="doi">10.55529/ijitc.31.17.22</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Castonguay</surname><given-names>A</given-names> </name><name name-style="western"><surname>Lovis</surname><given-names>C</given-names> </name></person-group><article-title>Introducing the &#x201C;AI Language Models in Health Care&#x201D; section: actionable strategies for targeted and wide-scale deployment</article-title><source>JMIR Med Inform</source><year>2023</year><month>12</month><day>21</day><volume>11</volume><fpage>e53785</fpage><pub-id pub-id-type="doi">10.2196/53785</pub-id><pub-id pub-id-type="medline">38127431</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cho</surname><given-names>HN</given-names> </name><name name-style="western"><surname>Jun</surname><given-names>TJ</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>YH</given-names> </name><etal/></person-group><article-title>Task-specific transformer-based language models in health care: scoping review</article-title><source>JMIR Med Inform</source><year>2024</year><month>11</month><day>18</day><volume>12</volume><fpage>e49724</fpage><pub-id pub-id-type="doi">10.2196/49724</pub-id><pub-id pub-id-type="medline">39556827</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kung</surname><given-names>TH</given-names> </name><name name-style="western"><surname>Cheatham</surname><given-names>M</given-names> </name><name name-style="western"><surname>Medenilla</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Performance of ChatGPT on USMLE: potential for AI-assisted medical education using large language models</article-title><source>PLOS Digit Health</source><year>2023</year><month>02</month><volume>2</volume><issue>2</issue><fpage>e0000198</fpage><pub-id pub-id-type="doi">10.1371/journal.pdig.0000198</pub-id><pub-id pub-id-type="medline">36812645</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhu</surname><given-names>L</given-names> </name><name name-style="western"><surname>Mou</surname><given-names>W</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>K</given-names> </name><etal/></person-group><article-title>Multimodal ChatGPT-4V for electrocardiogram interpretation: promise and limitations</article-title><source>J Med Internet Res</source><year>2024</year><month>06</month><day>26</day><volume>26</volume><fpage>e54607</fpage><pub-id pub-id-type="doi">10.2196/54607</pub-id><pub-id pub-id-type="medline">38764297</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>C</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>S</given-names> </name></person-group><article-title>Utility of ChatGPT in clinical practice</article-title><source>J Med Internet Res</source><year>2023</year><month>06</month><day>28</day><volume>25</volume><fpage>e48568</fpage><pub-id pub-id-type="doi">10.2196/48568</pub-id><pub-id pub-id-type="medline">37379067</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lo</surname><given-names>F</given-names> </name><name name-style="western"><surname>Au</surname><given-names>K</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>W</given-names> </name></person-group><article-title>Correspondence of &#x201C;Evaluation of large language models in breast cancer clinical scenarios: a comparative analysis based on ChatGPT-3.5, ChatGPT-4.0, and Claude2&#x201D;</article-title><source>Int J Surg</source><year>2024</year><month>09</month><day>1</day><volume>110</volume><issue>9</issue><fpage>5865</fpage><lpage>5866</lpage><pub-id pub-id-type="doi">10.1097/JS9.0000000000001616</pub-id><pub-id pub-id-type="medline">38752481</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Johnson</surname><given-names>D</given-names> </name><name name-style="western"><surname>Goodman</surname><given-names>R</given-names> </name><name name-style="western"><surname>Patrinely</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Assessing the accuracy and reliability of AI-generated medical responses: an evaluation of the Chat-GPT model</article-title><source>Res Sq</source><year>2023</year><month>02</month><day>28</day><fpage>rs.3.rs-2566942</fpage><pub-id pub-id-type="doi">10.21203/rs.3.rs-2566942/v1</pub-id><pub-id pub-id-type="medline">36909565</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Eriksen</surname><given-names>AV</given-names> </name><name name-style="western"><surname>M&#x00F6;ller</surname><given-names>S</given-names> </name><name name-style="western"><surname>Ryg</surname><given-names>J</given-names> </name></person-group><article-title>Use of GPT-4 to diagnose complex clinical cases</article-title><source>NEJM AI</source><year>2024</year><month>01</month><volume>1</volume><issue>1</issue><fpage>AIp2300031</fpage><pub-id pub-id-type="doi">10.1056/AIp2300031</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kanjee</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Crowe</surname><given-names>B</given-names> </name><name name-style="western"><surname>Rodman</surname><given-names>A</given-names> </name></person-group><article-title>Accuracy of a generative artificial intelligence model in a complex diagnostic challenge</article-title><source>JAMA</source><year>2023</year><month>07</month><day>3</day><volume>330</volume><issue>1</issue><fpage>78</fpage><lpage>80</lpage><pub-id pub-id-type="doi">10.1001/jama.2023.8288</pub-id><pub-id pub-id-type="medline">37318797</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Miller</surname><given-names>L</given-names> </name><name name-style="western"><surname>Kamel</surname><given-names>P</given-names> </name><name name-style="western"><surname>Patel</surname><given-names>J</given-names> </name><etal/></person-group><article-title>A comparative evaluation of large language model utility in neuroimaging clinical decision support</article-title><source>J Imaging Inform Med</source><year>2024</year><month>11</month><day>7</day><pub-id pub-id-type="doi">10.1007/s10278-024-01161-3</pub-id><pub-id pub-id-type="medline">39508992</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bedi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Orr-Ewing</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Testing and evaluation of health care applications of large language models: a systematic review</article-title><source>JAMA</source><year>2025</year><month>01</month><day>28</day><volume>333</volume><issue>4</issue><fpage>319</fpage><lpage>328</lpage><pub-id pub-id-type="doi">10.1001/jama.2024.21700</pub-id><pub-id pub-id-type="medline">39405325</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sandmann</surname><given-names>S</given-names> </name><name name-style="western"><surname>Riepenhausen</surname><given-names>S</given-names> </name><name name-style="western"><surname>Plagwitz</surname><given-names>L</given-names> </name><name name-style="western"><surname>Varghese</surname><given-names>J</given-names> </name></person-group><article-title>Systematic analysis of ChatGPT, Google search and Llama 2 for clinical decision support tasks</article-title><source>Nat Commun</source><year>2024</year><month>03</month><day>6</day><volume>15</volume><issue>1</issue><fpage>2050</fpage><pub-id pub-id-type="doi">10.1038/s41467-024-46411-8</pub-id><pub-id pub-id-type="medline">38448475</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bond</surname><given-names>WF</given-names> </name><name name-style="western"><surname>Schwartz</surname><given-names>LM</given-names> </name><name name-style="western"><surname>Weaver</surname><given-names>KR</given-names> </name><name name-style="western"><surname>Levick</surname><given-names>D</given-names> </name><name name-style="western"><surname>Giuliano</surname><given-names>M</given-names> </name><name name-style="western"><surname>Graber</surname><given-names>ML</given-names> </name></person-group><article-title>Differential diagnosis generators: an evaluation of currently available computer programs</article-title><source>J Gen Intern Med</source><year>2012</year><month>02</month><volume>27</volume><issue>2</issue><fpage>213</fpage><lpage>219</lpage><pub-id pub-id-type="doi">10.1007/s11606-011-1804-8</pub-id><pub-id pub-id-type="medline">21789717</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Landis</surname><given-names>JR</given-names> </name><name name-style="western"><surname>Koch</surname><given-names>GG</given-names> </name></person-group><article-title>An application of hierarchical kappa-type statistics in the assessment of majority agreement among multiple observers</article-title><source>Biometrics</source><year>1977</year><month>06</month><volume>33</volume><issue>2</issue><fpage>363</fpage><lpage>374</lpage><pub-id pub-id-type="medline">884196</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>McKinney</surname><given-names>SM</given-names> </name><name name-style="western"><surname>Sieniek</surname><given-names>M</given-names> </name><name name-style="western"><surname>Godbole</surname><given-names>V</given-names> </name><etal/></person-group><article-title>International evaluation of an AI system for breast cancer screening</article-title><source>Nature New Biol</source><year>2020</year><month>01</month><volume>577</volume><issue>7788</issue><fpage>89</fpage><lpage>94</lpage><pub-id pub-id-type="doi">10.1038/s41586-019-1799-6</pub-id><pub-id pub-id-type="medline">31894144</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chelli</surname><given-names>M</given-names> </name><name name-style="western"><surname>Descamps</surname><given-names>J</given-names> </name><name name-style="western"><surname>Lavou&#x00E9;</surname><given-names>V</given-names> </name><etal/></person-group><article-title>Hallucination rates and reference accuracy of ChatGPT and Bard for systematic reviews: comparative analysis</article-title><source>J Med Internet Res</source><year>2024</year><month>05</month><day>22</day><volume>26</volume><fpage>e53164</fpage><pub-id pub-id-type="doi">10.2196/53164</pub-id><pub-id pub-id-type="medline">38776130</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Song</surname><given-names>P</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>Y</given-names> </name></person-group><article-title>Multidisciplinary team and team oncology medicine research and development in China</article-title><source>Biosci Trends</source><year>2010</year><month>08</month><volume>4</volume><issue>4</issue><fpage>151</fpage><lpage>160</lpage><pub-id pub-id-type="medline">20811133</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Webster</surname><given-names>P</given-names> </name></person-group><article-title>Medical AI chatbots: are they safe to talk to patients?</article-title><source>Nat Med</source><year>2023</year><month>11</month><volume>29</volume><issue>11</issue><fpage>2677</fpage><lpage>2679</lpage><pub-id pub-id-type="doi">10.1038/s41591-023-02535-w</pub-id><pub-id pub-id-type="medline">37684542</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Omar</surname><given-names>M</given-names> </name><name name-style="western"><surname>Agbareia</surname><given-names>R</given-names> </name><name name-style="western"><surname>Glicksberg</surname><given-names>BS</given-names> </name><name name-style="western"><surname>Nadkarni</surname><given-names>GN</given-names> </name><name name-style="western"><surname>Klang</surname><given-names>E</given-names> </name></person-group><article-title>Benchmarking the confidence of large language models in answering clinical questions: cross-sectional evaluation study</article-title><source>JMIR Med Inform</source><year>2025</year><month>05</month><day>16</day><volume>13</volume><fpage>e66917</fpage><pub-id pub-id-type="doi">10.2196/66917</pub-id><pub-id pub-id-type="medline">40378406</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Trinh</surname><given-names>TH</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Le</surname><given-names>QV</given-names> </name><name name-style="western"><surname>He</surname><given-names>H</given-names> </name><name name-style="western"><surname>Luong</surname><given-names>T</given-names> </name></person-group><article-title>Solving olympiad geometry without human demonstrations</article-title><source>Nature New Biol</source><year>2024</year><month>01</month><volume>625</volume><issue>7995</issue><fpage>476</fpage><lpage>482</lpage><pub-id pub-id-type="doi">10.1038/s41586-023-06747-5</pub-id><pub-id pub-id-type="medline">38233616</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Vaswani</surname><given-names>A</given-names> </name><name name-style="western"><surname>Shazeer</surname><given-names>N</given-names> </name><name name-style="western"><surname>Parmar</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Attention is all you need</article-title><source>arXiv</source><comment>Preprint posted online on 2017</comment><pub-id pub-id-type="doi">10.48550/arXiv.1706.03762</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gallifant</surname><given-names>J</given-names> </name><name name-style="western"><surname>Afshar</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ameen</surname><given-names>S</given-names> </name><etal/></person-group><article-title>The TRIPOD-LLM reporting guideline for studies using large language models</article-title><source>Nat Med</source><year>2025</year><month>01</month><volume>31</volume><issue>1</issue><fpage>60</fpage><lpage>69</lpage><pub-id pub-id-type="doi">10.1038/s41591-024-03425-5</pub-id><pub-id pub-id-type="medline">39779929</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Evaluation setup and output comparison of various large language models in complex clinical cases.</p><media xlink:href="medinform_v13i1e73941_app1.docx" xlink:title="DOCX File, 24 KB"/></supplementary-material></app-group></back></article>