<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="review-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id><journal-id journal-id-type="publisher-id">medinform</journal-id><journal-id journal-id-type="index">7</journal-id><journal-title>JMIR Medical Informatics</journal-title><abbrev-journal-title>JMIR Med Inform</abbrev-journal-title><issn pub-type="epub">2291-9694</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v13i1e64963</article-id><article-id pub-id-type="doi">10.2196/64963</article-id><article-categories><subj-group subj-group-type="heading"><subject>Review</subject></subj-group></article-categories><title-group><article-title>Comparing Diagnostic Accuracy of Clinical Professionals and Large Language Models: Systematic Review and Meta-Analysis</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Shan</surname><given-names>Guxue</given-names></name><degrees>BSc</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Chen</surname><given-names>Xiaonan</given-names></name><degrees>BSc</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Wang</surname><given-names>Chen</given-names></name><degrees>BSc</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Liu</surname><given-names>Li</given-names></name><degrees>BMed</degrees><xref ref-type="aff" 
rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Gu</surname><given-names>Yuanjing</given-names></name><degrees>MNS</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Jiang</surname><given-names>Huiping</given-names></name><degrees>BSc</degrees><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Shi</surname><given-names>Tingqi</given-names></name><degrees>MPH</degrees><xref ref-type="aff" rid="aff5">5</xref></contrib></contrib-group><aff id="aff1"><institution>Nanjing Drum Tower Hospital Clinical College of Nanjing University of Chinese Medicine</institution><addr-line>Nanjing</addr-line><country>China</country></aff><aff id="aff2"><institution>Jiangsu Province Hospital of Chinese Medicine, Affiliated Hospital of Nanjing University of Chinese Medicine</institution><addr-line>Nanjing</addr-line><country>China</country></aff><aff id="aff3"><institution>Department of Emergency, Nanjing Drum Tower Hospital</institution><addr-line>Nanjing</addr-line><country>China</country></aff><aff id="aff4"><institution>Department of Nursing, Nanjing Drum Tower Hospital</institution><addr-line>Nanjing</addr-line><country>China</country></aff><aff id="aff5"><institution>Department of Quality Management, Nanjing Drum Tower Hospital, Affiliated Hospital of Medical School, Nanjing University</institution><addr-line>321 Zhongshan Road, Gulou District</addr-line><addr-line>Nanjing</addr-line><country>China</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Castonguay</surname><given-names>Alexandre</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Jafarizadeh</surname><given-names>Ali</given-names></name></contrib><contrib contrib-type="reviewer"><name 
name-style="western"><surname>Laplante</surname><given-names>Simon</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Arasteh</surname><given-names>Soroosh Tayebi</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Tingqi Shi, MPH, Department of Quality Management, Nanjing Drum Tower Hospital, Affiliated Hospital of Medical School, Nanjing University, 321 Zhongshan Road, Gulou District, Nanjing, 210008, China, 86 1-391-299-6998; <email>13912996998@163.com</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>25</day><month>4</month><year>2025</year></pub-date><volume>13</volume><elocation-id>e64963</elocation-id><history><date date-type="received"><day>31</day><month>07</month><year>2024</year></date><date date-type="rev-recd"><day>19</day><month>03</month><year>2025</year></date><date date-type="accepted"><day>25</day><month>03</month><year>2025</year></date></history><copyright-statement>&#x00A9; Guxue Shan, Xiaonan Chen, Chen Wang, Li Liu, Yuanjing Gu, Huiping Jiang, Tingqi Shi. Originally published in JMIR Medical Informatics (<ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org">https://medinform.jmir.org</ext-link>), 25.4.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org/">https://medinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://medinform.jmir.org/2025/1/e64963"/><abstract><sec><title>Background</title><p>With the rapid development of artificial intelligence (AI) technology, especially generative AI, large language models (LLMs) have shown great potential in the medical field. Through training on massive amounts of medical data, they can understand complex medical texts, quickly analyze medical records, and directly provide health counseling and diagnostic advice, especially for rare diseases. However, no study has yet compared and extensively discussed the diagnostic performance of LLMs with that of physicians.</p></sec><sec><title>Objective</title><p>This study systematically reviewed the accuracy of LLMs in clinical diagnosis and provided reference for further clinical application.</p></sec><sec sec-type="methods"><title>Methods</title><p>We conducted searches in CNKI (China National Knowledge Infrastructure), VIP Database, SinoMed, PubMed, Web of Science, Embase, and CINAHL (Cumulative Index to Nursing and Allied Health Literature) from January 1, 2017, to the present. A total of 2 reviewers independently screened the literature and extracted relevant information. The risk of bias was assessed using the Prediction Model Risk of Bias Assessment Tool (PROBAST), which evaluates both the risk of bias and the applicability of included studies.</p></sec><sec sec-type="results"><title>Results</title><p>A total of 30 studies involving 19 LLMs and a total of 4762 cases were included. The quality assessment indicated a high risk of bias in the majority of studies, with the primary cause being the use of known case diagnoses. 
For the optimal model, the accuracy of the primary diagnosis ranged from 25% to 97.8%, while the triage accuracy ranged from 66.5% to 98%.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>LLMs have demonstrated considerable diagnostic capabilities and significant potential for application across various clinical cases. Although their accuracy still falls short of that of clinical professionals, if used cautiously, they have the potential to become one of the best intelligent assistants in the field of human health care.</p></sec></abstract><kwd-group><kwd>machine learning</kwd><kwd>ML</kwd><kwd>artificial intelligence</kwd><kwd>AI</kwd><kwd>large language model</kwd><kwd>LLM</kwd><kwd>natural language processing</kwd><kwd>algorithm</kwd><kwd>model</kwd><kwd>analytics</kwd><kwd>NLP</kwd><kwd>deep learning</kwd><kwd>clinical diagnosis</kwd><kwd>diagnosis</kwd><kwd>diagnostic accuracy</kwd><kwd>accuracy</kwd><kwd>systematic review</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>The Google Brain research team has consistently aimed to push the boundaries of recurrent language models and encoder-decoder architectures. In 2017, Vaswani et al [<xref ref-type="bibr" rid="ref1">1</xref>] introduced a novel and simple network architecture known as the Transformer. This architecture uses a new mechanism called &#x201C;self-attention,&#x201D; leading to significant advancements in the development and training of large language models (LLMs). These models possess advanced capabilities beyond extraction or summarization tasks and include natural language generation. 
Although there is no official definition of LLM, based on the literature [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref3">3</xref>], we define LLM as a model with over a billion parameters, designed for typical artificial intelligence (AI) applications.</p><p>Accurate clinical diagnosis is essential for patient treatment outcomes and survival rates. However, even when health care professionals gather extensive information and conduct numerous observations and tests, absolute diagnostic accuracy cannot be guaranteed. Minimizing diagnostic uncertainty and making the most appropriate treatment decisions remain persistent clinical challenges [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>]. As of May 2024, the US Food and Drug Administration has approved 882 medical devices that use AI or machine learning assistance. By June 2024, the National Medical Products Administration of China has approved 17 AI-assisted diagnostic devices. In the era of big data in health care, the integration of AI with clinical decision support is a developing trend [<xref ref-type="bibr" rid="ref6">6</xref>]. Numerous experts and scholars have explored the application of specialized AI and software tools in clinical diagnosis, yet there is limited knowledge about the performance of LLMs in this context. Therefore, this study aims to comprehensively evaluate the performance and accuracy of LLMs in clinical diagnosis, providing references for their clinical application.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Overview</title><p>This systematic review was conducted following the Preferred Reporting Items for Systematic Reviews and Meta-Analysis of Diagnostic Test Accuracy Studies (PRISMA-DTA) statement [<xref ref-type="bibr" rid="ref7">7</xref>]. 
Specific details can be found in <xref ref-type="supplementary-material" rid="app4">Checklist 1</xref>.</p></sec><sec id="s2-2"><title>Data Sources</title><p>A computer-assisted literature search of PubMed, Web of Science, Embase, CINAHL (Cumulative Index to Nursing and Allied Health Literature), CNKI (China National Knowledge Infrastructure), VIP, and SinoMed databases was performed from January 1, 2017, to the present. Search terms included controlled terms (MeSH [Medical Subject Heading] in PubMed and Emtree in Embase) and free-text terms. The following terms were used (including synonyms and closely related words) as index terms or free-text words: &#x201C;large language model,&#x201D; &#x201C;medicine,&#x201D; &#x201C;diagnosis,&#x201D; and &#x201C;accuracy.&#x201D; A search filter was applied to limit the results to humans. Only peer-reviewed cross-sectional studies and cohort studies were included. <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> provides more details of the search strategy and study selection.</p></sec><sec id="s2-3"><title>Selection Criteria</title><p>This review included studies meeting the following criteria: (1) investigated the application of LLMs in the initial diagnosis of human cases, (2) published between January 1, 2017, and the date of the search, (3) study type was either cross-sectional or cohort, (4) a primary source, and (5) written in English or Chinese.</p><p>An article was excluded if it (1) was a nonprimary source such as theses, conference papers, etc, (2) did not compare the diagnostic accuracy of clinical professionals in relevant departments with that of LLMs, (3) did not specify the type or scale of the LLM used for diagnosis, (4) did not have LLM independently conduct clinical case diagnoses, (5) was a duplicate publication, and (6) did not provide complete data or the full text could not be obtained.</p></sec><sec id="s2-4"><title>Data Selection and Extraction</title><p>A total of 2 
reviewers (GS and XC) independently reviewed the full texts of the eligible articles and extracted data. Any disagreements between the reviewers were discussed until a consensus was reached. The detailed characteristics extracted from each included study were: the first author and publication year, the country where the research was conducted, the study type, the study population, the source of cases, sample size, the LLMs used, control groups, and outcome measures.</p></sec><sec id="s2-5"><title>Quality of Evidence and Risk of Bias</title><p>Due to the significant heterogeneity often present in the design and implementation of diagnostic accuracy studies, it is crucial to carefully assess the quality of the included studies. The Prediction Model Risk of Bias Assessment Tool (PROBAST) was used to evaluate the risk of bias and applicability of all included studies [<xref ref-type="bibr" rid="ref8">8</xref>]. PROBAST assesses risk of bias across 4 domains: study participants, predictors, outcomes, and statistical analysis, while applicability is evaluated through the first 3 domains.</p><p>Given the complex structure and vast number of parameters in LLMs, they can be considered a &#x201C;black box&#x201D; to some extent, meaning that their internal workings and decision-making processes may not be entirely transparent or easily understood by humans [<xref ref-type="bibr" rid="ref6">6</xref>]. Consequently, during the quality assessment, certain signal issues were excluded as they were unrelated to generative AI models [<xref ref-type="bibr" rid="ref9">9</xref>].</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Selection of Studies</title><p>A total of 2491 studies were found in the databases by 2 researchers independently following the predefined search strategies and data collection methods. An additional 12 articles were identified through reference tracing, bringing the total number of studies screened to 2503. 
Among these, 169 studies were read in full, resulting in 30 studies that met the inclusion criteria for synthesis. Reasons for exclusion at this stage were recorded and can be found in the flow diagram (see <xref ref-type="fig" rid="figure1">Figure 1</xref>).</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Flow diagram. Papers identified in databases, title or abstract screened, read full text, and included in the synthesis. Reasons for exclusion are listed. CINAHL: Cumulative Index to Nursing and Allied Health Literature; CNKI: China National Knowledge Infrastructure.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e64963_fig01.png"/></fig></sec><sec id="s3-2"><title>Study Characteristics</title><p>The 30 included studies [<xref ref-type="bibr" rid="ref10">10</xref>-<xref ref-type="bibr" rid="ref39">39</xref>] were concentrated within the past 3 years, with 12 published in 2023, 16 in 2024, and 2 in 2025. These studies cover a wide range of countries, primarily from Japan, the United States, and China. A total of 4762 cases were analyzed, involving 19 LLMs. The studies predominantly focused on GPT-3.5 (n=14) and GPT-4 (n=20) versions (OpenAI), extensively applied in assessing clinical diagnostic accuracy. In contrast, fewer studies addressed Google Bard (n=3), Bing (n=3), GPT-4o (n=2), and GPT-4V (n=2). The case diagnoses encompassed various fields, including ophthalmology (n=9), internal medicine (n=6), emergency medicine (n=3), and general medicine (n=3), among others. The control groups included at least 193 clinical professionals, ranging from resident doctors to medical experts with over 30 years of clinical experience, to compare their diagnostic capabilities with those of the LLMs. All included studies used LLMs for data testing purposes only; the models were not used for real-time diagnosis of clinical patients. 
<xref ref-type="table" rid="table1">Table 1</xref> shows the basic characteristics of the included studies.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Characteristics and results of the eligible studies.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Study</td><td align="left" valign="bottom">Year</td><td align="left" valign="bottom">Country</td><td align="left" valign="bottom">Study type</td><td align="left" valign="bottom">Subjects</td><td align="left" valign="bottom">Case source</td><td align="left" valign="bottom">Sample size</td><td align="left" valign="bottom">LLM<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td><td align="left" valign="bottom">Comparison group</td><td align="left" valign="bottom">Outcome measures</td></tr></thead><tbody><tr><td align="left" valign="top">Zhang et al [<xref ref-type="bibr" rid="ref10">10</xref>]</td><td align="left" valign="top">2024</td><td align="left" valign="top">China</td><td align="left" valign="top">Prospective study</td><td align="left" valign="top">Ophthalmology cases</td><td align="left" valign="top">Patient visit records</td><td align="left" valign="top">26</td><td align="left" valign="top">GPT-4o</td><td align="left" valign="top">Ophthalmologists</td><td align="left" valign="top">c<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup>, g<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup></td></tr><tr><td align="left" valign="top">Makhoul et al [<xref ref-type="bibr" rid="ref11">11</xref>]</td><td align="left" valign="top">2024</td><td align="left" valign="top">Lebanon</td><td align="left" valign="top">Cross-sectional study</td><td align="left" valign="top">Otolaryngology cases</td><td align="left" valign="top">Published case reports</td><td align="left" valign="top">32</td><td align="left" valign="top">GPT-3.5</td><td align="left" valign="top">ENT<sup><xref ref-type="table-fn" 
rid="table1fn4">d</xref></sup> physicians, FM<sup><xref ref-type="table-fn" rid="table1fn5">e</xref></sup> specialists</td><td align="left" valign="top">a<sup><xref ref-type="table-fn" rid="table1fn6">f</xref></sup>, b<sup><xref ref-type="table-fn" rid="table1fn7">g</xref></sup></td></tr><tr><td align="left" valign="top">Pillai et al [<xref ref-type="bibr" rid="ref12">12</xref>]</td><td align="left" valign="top">2023</td><td align="left" valign="top">The United States</td><td align="left" valign="top">Cross-sectional study</td><td align="left" valign="top">Autoimmune diseases cases</td><td align="left" valign="top">Published case reports</td><td align="left" valign="top">40</td><td align="left" valign="top">GPT-3.5<break/>GPT-4<break/>LLaMa 2</td><td align="left" valign="top">A certified internist</td><td align="left" valign="top">a<sup><xref ref-type="table-fn" rid="table1fn6">f</xref></sup>, b<sup><xref ref-type="table-fn" rid="table1fn7">g</xref></sup></td></tr><tr><td align="left" valign="top">Levin et al [<xref ref-type="bibr" rid="ref13">13</xref>]</td><td align="left" valign="top">2024</td><td align="left" valign="top">Israel</td><td align="left" valign="top">Cross-sectional study</td><td align="left" valign="top">Neonatal cases</td><td align="left" valign="top">Developed by researchers</td><td align="left" valign="top">6</td><td align="left" valign="top">GPT-4<break/>Claude-2.0</td><td align="left" valign="top">Certi&#xFB01;ed neonatal nurse practitioners</td><td align="left" valign="top">c<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup>, g<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup></td></tr><tr><td align="left" valign="top">Lyons et al [<xref ref-type="bibr" rid="ref14">14</xref>]</td><td align="left" valign="top">2023</td><td align="left" valign="top">The United States</td><td align="left" valign="top">Cross-sectional study</td><td align="left" valign="top">Ophthalmology cases</td><td align="left" valign="top">Developed by 
researchers</td><td align="left" valign="top">44</td><td align="left" valign="top">GPT-4<break/>Bing</td><td align="left" valign="top">Ophthalmology physicians</td><td align="left" valign="top">b<sup><xref ref-type="table-fn" rid="table1fn7">g</xref></sup>, d<sup><xref ref-type="table-fn" rid="table1fn8">h</xref></sup></td></tr><tr><td align="left" valign="top">Sarangi et al [<xref ref-type="bibr" rid="ref15">15</xref>]</td><td align="left" valign="top">2023</td><td align="left" valign="top">India</td><td align="left" valign="top">Cross-sectional study</td><td align="left" valign="top">General cases</td><td align="left" valign="top">Developed by researchers</td><td align="left" valign="top">120</td><td align="left" valign="top">GPT-3.5<break/>Bard<break/>Bing</td><td align="left" valign="top">Radiology residents</td><td align="left" valign="top">a<sup><xref ref-type="table-fn" rid="table1fn6">f</xref></sup></td></tr><tr><td align="left" valign="top">Pasl&#x0131; et al [<xref ref-type="bibr" rid="ref16">16</xref>]</td><td align="left" valign="top">2024</td><td align="left" valign="top">Turkey</td><td align="left" valign="top">Prospective study</td><td align="left" valign="top">Emergency cases</td><td align="left" valign="top">Patient visit records</td><td align="left" valign="top">758</td><td align="left" valign="top">GPT-4</td><td align="left" valign="top">The ED<sup><xref ref-type="table-fn" rid="table1fn9">i</xref></sup> triage team</td><td align="left" valign="top">d<sup><xref ref-type="table-fn" rid="table1fn8">h</xref></sup></td></tr><tr><td align="left" valign="top">Wang et al [<xref ref-type="bibr" rid="ref17">17</xref>]</td><td align="left" valign="top">2024</td><td align="left" valign="top">China</td><td align="left" valign="top">Retrospective cohort study</td><td align="left" valign="top">Thyroid cases</td><td align="left" valign="top">Patient visit records</td><td align="left" valign="top">109</td><td align="left" valign="top">GPT-4</td><td align="left" 
valign="top">Thyroid doctors</td><td align="left" valign="top">c<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup></td></tr><tr><td align="left" valign="top">Huang et al [<xref ref-type="bibr" rid="ref18">18</xref>]</td><td align="left" valign="top">2024</td><td align="left" valign="top">The United States</td><td align="left" valign="top">Cross-sectional study</td><td align="left" valign="top">Ophthalmology cases</td><td align="left" valign="top">Patient visit records</td><td align="left" valign="top">20</td><td align="left" valign="top">GPT-4</td><td align="left" valign="top">Subspecialists (in glaucoma or retina)</td><td align="left" valign="top">c<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup>, g<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup></td></tr><tr><td align="left" valign="top">Stoneham et al [<xref ref-type="bibr" rid="ref19">19</xref>]</td><td align="left" valign="top">2023</td><td align="left" valign="top">UK</td><td align="left" valign="top">Retrospective study</td><td align="left" valign="top">Dermatology cases</td><td align="left" valign="top">Patient visit records</td><td align="left" valign="top">36</td><td align="left" valign="top">GPT-4</td><td align="left" valign="top">A dermatologist</td><td align="left" valign="top">a<sup><xref ref-type="table-fn" rid="table1fn6">f</xref></sup></td></tr><tr><td align="left" valign="top">Hirosawa et al [<xref ref-type="bibr" rid="ref20">20</xref>]</td><td align="left" valign="top">2023</td><td align="left" valign="top">Japan</td><td align="left" valign="top">Cross-sectional study</td><td align="left" valign="top">Internal medicine cases</td><td align="left" valign="top">Published case reports</td><td align="left" valign="top">52</td><td align="left" valign="top">GPT-3.5<break/>GPT-4</td><td align="left" valign="top">GIM<sup><xref ref-type="table-fn" rid="table1fn10">j</xref></sup> physicians</td><td align="left" valign="top">a<sup><xref ref-type="table-fn" 
rid="table1fn6">f</xref></sup>, b<sup><xref ref-type="table-fn" rid="table1fn7">g</xref></sup></td></tr><tr><td align="left" valign="top">Horiuchi et al [<xref ref-type="bibr" rid="ref21">21</xref>]</td><td align="left" valign="top">2025</td><td align="left" valign="top">Japan</td><td align="left" valign="top">Retrospective study</td><td align="left" valign="top">Musculoskeletal cases</td><td align="left" valign="top">Published case reports</td><td align="left" valign="top">106</td><td align="left" valign="top">GPT-4<break/>GPT-4V</td><td align="left" valign="top">Radiologists</td><td align="left" valign="top">a<sup><xref ref-type="table-fn" rid="table1fn6">f</xref></sup>, b<sup><xref ref-type="table-fn" rid="table1fn7">g</xref></sup></td></tr><tr><td align="left" valign="top">Mitsuyama et al [<xref ref-type="bibr" rid="ref22">22</xref>]</td><td align="left" valign="top">2024</td><td align="left" valign="top">Japan</td><td align="left" valign="top">Retrospective study</td><td align="left" valign="top">Brain tumors cases</td><td align="left" valign="top">Patient visit records</td><td align="left" valign="top">150</td><td align="left" valign="top">GPT-4</td><td align="left" valign="top">Radiologists</td><td align="left" valign="top">a<sup><xref ref-type="table-fn" rid="table1fn6">f</xref></sup>, b<sup><xref ref-type="table-fn" rid="table1fn7">g</xref></sup></td></tr><tr><td align="left" valign="top">Hirosawa et al [<xref ref-type="bibr" rid="ref23">23</xref>]</td><td align="left" valign="top">2023</td><td align="left" valign="top">Japan</td><td align="left" valign="top">Retrospective cohort study</td><td align="left" valign="top">Internal medicine cases</td><td align="left" valign="top">&#x2003;Published case reports and developed by researchers</td><td align="left" valign="top">82</td><td align="left" valign="top">Bard</td><td align="left" valign="top">GIM<sup><xref ref-type="table-fn" rid="table1fn10">j</xref></sup> physicians</td><td align="left" 
valign="top">a<sup><xref ref-type="table-fn" rid="table1fn6">f</xref></sup>, b<sup><xref ref-type="table-fn" rid="table1fn7">g</xref></sup></td></tr><tr><td align="left" valign="top">Suh et al [<xref ref-type="bibr" rid="ref24">24</xref>]</td><td align="left" valign="top">2024</td><td align="left" valign="top">Korea</td><td align="left" valign="top">Retrospective study</td><td align="left" valign="top">General cases</td><td align="left" valign="top">&#x2003;Published case reports</td><td align="left" valign="top">190</td><td align="left" valign="top">GPT-4V<break/>Gemini Pro Vision</td><td align="left" valign="top">Radiologists</td><td align="left" valign="top">b<sup><xref ref-type="table-fn" rid="table1fn7">g</xref></sup></td></tr><tr><td align="left" valign="top">Fraser et al [<xref ref-type="bibr" rid="ref25">25</xref>]</td><td align="left" valign="top">2023</td><td align="left" valign="top">The United States</td><td align="left" valign="top">Cross-sectional study</td><td align="left" valign="top">Emergency cases</td><td align="left" valign="top">Patient visit records</td><td align="left" valign="top">40</td><td align="left" valign="top">GPT-3.5<break/>GPT-4</td><td align="left" valign="top">ED physician</td><td align="left" valign="top">a<sup><xref ref-type="table-fn" rid="table1fn6">f</xref></sup>, b<sup><xref ref-type="table-fn" rid="table1fn7">g</xref></sup>, d<sup><xref ref-type="table-fn" rid="table1fn8">h</xref></sup></td></tr><tr><td align="left" valign="top">Hirosawa et al [<xref ref-type="bibr" rid="ref26">26</xref>]</td><td align="left" valign="top">2023</td><td align="left" valign="top">Japan</td><td align="left" valign="top">Prospective study</td><td align="left" valign="top">Internal medicine cases</td><td align="left" valign="top">Developed by researchers</td><td align="left" valign="top">30</td><td align="left" valign="top">GPT-3.5</td><td align="left" valign="top">GIM<sup><xref ref-type="table-fn" rid="table1fn10">j</xref></sup> 
physicians</td><td align="left" valign="top">a<sup><xref ref-type="table-fn" rid="table1fn6">f</xref></sup>, b<sup><xref ref-type="table-fn" rid="table1fn7">g</xref></sup></td></tr><tr><td align="left" valign="top">Shemer et al [<xref ref-type="bibr" rid="ref27">27</xref>]</td><td align="left" valign="top">2024</td><td align="left" valign="top">Israel</td><td align="left" valign="top">Retrospective cohort study</td><td align="left" valign="top">Ophthalmology cases</td><td align="left" valign="top">Patient visit records</td><td align="left" valign="top">63</td><td align="left" valign="top">GPT-3.5</td><td align="left" valign="top">Ophthalmology residents and ophthalmologists</td><td align="left" valign="top">a<sup><xref ref-type="table-fn" rid="table1fn6">f</xref></sup>, g<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup></td></tr><tr><td align="left" valign="top">Mohammadi et al [<xref ref-type="bibr" rid="ref28">28</xref>]</td><td align="left" valign="top">2024</td><td align="left" valign="top">Iran</td><td align="left" valign="top">Retrospective study</td><td align="left" valign="top">Tibial plateau fracture cases</td><td align="left" valign="top">Retrospective study</td><td align="left" valign="top">111</td><td align="left" valign="top">GPT-4<break/>GPT-4o</td><td align="left" valign="top">An ED physician and radiologist</td><td align="left" valign="top">f<sup><xref ref-type="table-fn" rid="table1fn11">k</xref></sup></td></tr><tr><td align="left" valign="top">Arslan et al [<xref ref-type="bibr" rid="ref29">29</xref>]</td><td align="left" valign="top">2025</td><td align="left" valign="top">Turkey</td><td align="left" valign="top">Prospective study</td><td align="left" valign="top">Emergency cases</td><td align="left" valign="top">Patient visit records</td><td align="left" valign="top">468</td><td align="left" valign="top">GPT-4<break/>Copilot Pro</td><td align="left" valign="top">Triage nurses</td><td align="left" valign="top">d<sup><xref 
ref-type="table-fn" rid="table1fn8">h</xref></sup></td></tr><tr><td align="left" valign="top">Rojas-Carabali et al [<xref ref-type="bibr" rid="ref30">30</xref>]</td><td align="left" valign="top">2023</td><td align="left" valign="top">Singapore</td><td align="left" valign="top">Cross-sectional study</td><td align="left" valign="top">Ophthalmology cases</td><td align="left" valign="top">Developed by researchers</td><td align="left" valign="top">25</td><td align="left" valign="top">GPT-3.5<break/>GPT-4</td><td align="left" valign="top">Ophthalmologists</td><td align="left" valign="top">a<sup><xref ref-type="table-fn" rid="table1fn6">f</xref></sup>, b<sup><xref ref-type="table-fn" rid="table1fn7">g</xref></sup></td></tr><tr><td align="left" valign="top">Kaya et al [<xref ref-type="bibr" rid="ref31">31</xref>]</td><td align="left" valign="top">2024</td><td align="left" valign="top">Germany</td><td align="left" valign="top">Retrospective study</td><td align="left" valign="top">Myocarditis cases</td><td align="left" valign="top">Patient visit records</td><td align="left" valign="top">396</td><td align="left" valign="top">GPT-4</td><td align="left" valign="top">Radiologists</td><td align="left" valign="top">a<sup><xref ref-type="table-fn" rid="table1fn6">f</xref></sup>, e<sup><xref ref-type="table-fn" rid="table1fn12">l</xref></sup></td></tr><tr><td align="left" valign="top">Delsoz et al [<xref ref-type="bibr" rid="ref32">32</xref>]</td><td align="left" valign="top">2024</td><td align="left" valign="top">The United States</td><td align="left" valign="top">Cross-sectional study</td><td align="left" valign="top">Ophthalmology cases</td><td align="left" valign="top">Published case reports</td><td align="left" valign="top">20</td><td align="left" valign="top">GPT-3.5<break/>GPT-4</td><td align="left" valign="top">Cornea specialists</td><td align="left" valign="top">a<sup><xref ref-type="table-fn" rid="table1fn6">f</xref></sup></td></tr><tr><td align="left" valign="top">Ming et 
al [<xref ref-type="bibr" rid="ref33">33</xref>]</td><td align="left" valign="top">2024</td><td align="left" valign="top">China</td><td align="left" valign="top">Cross-sectional study</td><td align="left" valign="top">Ophthalmology cases</td><td align="left" valign="top">Published case reports</td><td align="left" valign="top">104</td><td align="left" valign="top">GPT-3.5<break/>GPT-4</td><td align="left" valign="top">Ophthalmic residents</td><td align="left" valign="top">a<sup><xref ref-type="table-fn" rid="table1fn6">f</xref></sup>, b<sup><xref ref-type="table-fn" rid="table1fn7">g</xref></sup></td></tr><tr><td align="left" valign="top">Nakaura et al [<xref ref-type="bibr" rid="ref34">34</xref>]</td><td align="left" valign="top">2024</td><td align="left" valign="top">Japan</td><td align="left" valign="top">Retrospective study</td><td align="left" valign="top">Internal medicine cases</td><td align="left" valign="top">Patient visit records</td><td align="left" valign="top">28</td><td align="left" valign="top">GPT-2<break/>GPT-3.5<break/>GPT-4</td><td align="left" valign="top">Radiologists</td><td align="left" valign="top">a<sup><xref ref-type="table-fn" rid="table1fn6">f</xref></sup>, b<sup><xref ref-type="table-fn" rid="table1fn7">g</xref></sup></td></tr><tr><td align="left" valign="top">Ito et al [<xref ref-type="bibr" rid="ref35">35</xref>]</td><td align="left" valign="top">2023</td><td align="left" valign="top">Japan</td><td align="left" valign="top">Cross-sectional study</td><td align="left" valign="top">General cases</td><td align="left" valign="top">Published case reports</td><td align="left" valign="top">45</td><td align="left" valign="top">GPT-4</td><td align="left" valign="top">Emergency physicians</td><td align="left" valign="top">a<sup><xref ref-type="table-fn" rid="table1fn6">f</xref></sup>, d<sup><xref ref-type="table-fn" rid="table1fn8">h</xref></sup></td></tr><tr><td align="left" valign="top">Gunes et al [<xref ref-type="bibr" 
rid="ref36">36</xref>]</td><td align="left" valign="top">2024</td><td align="left" valign="top">Turkey</td><td align="left" valign="top">Cross-sectional study</td><td align="left" valign="top">thoracic cases</td><td align="left" valign="top">Published case reports</td><td align="left" valign="top">124</td><td align="left" valign="top">10 LLMs including GPT-3.5/4<break/>Claude 3 Opus&#x2026;</td><td align="left" valign="top">Published case reports</td><td align="left" valign="top">a<sup><xref ref-type="table-fn" rid="table1fn6">f</xref></sup></td></tr><tr><td align="left" valign="top">Delsoz et al [<xref ref-type="bibr" rid="ref37">37</xref>]</td><td align="left" valign="top">2023</td><td align="left" valign="top">The United States</td><td align="left" valign="top">Cross-sectional study</td><td align="left" valign="top">Ophthalmology cases</td><td align="left" valign="top">Published case reports</td><td align="left" valign="top">11</td><td align="left" valign="top">GPT-3.5</td><td align="left" valign="top">Ophthalmology residents</td><td align="left" valign="top">a<sup><xref ref-type="table-fn" rid="table1fn6">f</xref></sup></td></tr><tr><td align="left" valign="top">Liu et al [<xref ref-type="bibr" rid="ref38">38</xref>]</td><td align="left" valign="top">2023</td><td align="left" valign="top">China</td><td align="left" valign="top">Prospective study</td><td align="left" valign="top">Ophthalmology cases</td><td align="left" valign="top">Patient visit records</td><td align="left" valign="top">1226</td><td align="left" valign="top">GPT-3.5</td><td align="left" valign="top">Ophthalmologists</td><td align="left" valign="top">e<sup><xref ref-type="table-fn" rid="table1fn12">l</xref></sup></td></tr><tr><td align="left" valign="top">Li et al [<xref ref-type="bibr" rid="ref39">39</xref>]</td><td align="left" valign="top">2024</td><td align="left" valign="top">China</td><td align="left" valign="top">Retrospective study</td><td align="left" valign="top">Abdominal 
cases</td><td align="left" valign="top">Patient visit records</td><td align="left" valign="top">300</td><td align="left" valign="top">ERNIE 4.0<break/>Claude 3.5 Sonnet</td><td align="left" valign="top">Radiologists</td><td align="left" valign="top">c<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup></td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>LLM: large language model.</p></fn><fn id="table1fn2"><p><sup>b</sup>Accuracy score.</p></fn><fn id="table1fn3"><p><sup>c</sup>Other auxiliary indicators (such as diagnostic completeness, diagnostic time, number of answers, etc).</p></fn><fn id="table1fn4"><p><sup>d</sup>ENT: ear, nose, and throat.</p></fn><fn id="table1fn5"><p><sup>e</sup>FM: family medicine.</p></fn><fn id="table1fn6"><p><sup>f</sup>Frequency of correct primary diagnosis (answer).</p></fn><fn id="table1fn7"><p><sup>g</sup>Frequency of correct diagnosis in a differential diagnosis list.</p></fn><fn id="table1fn8"><p><sup>h</sup>Triage accuracy.</p></fn><fn id="table1fn9"><p><sup>i</sup>ED: emergency department.</p></fn><fn id="table1fn10"><p><sup>j</sup>GIM: general internal medicine.</p></fn><fn id="table1fn11"><p><sup>k</sup>AUC: area under the curve.</p></fn><fn id="table1fn12"><p><sup>l</sup><italic>F</italic><sub>1</sub>-score.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-3"><title>Quality of Evidence and Risk of Bias</title><p>The included articles were evaluated using the PROBAST tool, with the results presented in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>. Overall, 10/30 (33.3%) studies had a low risk of bias, while 20/30 (66.7%) exhibited a high risk of bias. Regarding applicability, the majority of studies had low applicability concerns. Due to ethical concerns and patient privacy issues associated with the use of LLMs in clinical settings, most of the studies consist of retrospective studies with deidentified data and are limited to data testing. 
A total of 14 studies evaluated the diagnostic accuracy of models using small test sets. In addition, the &#x201C;black box&#x201D; nature of LLMs, whose training data are often undisclosed, complicates external evaluation and verification.</p></sec><sec id="s3-4"><title>LLM Feature Analysis</title><p>A total of 19 different LLMs were used in the included studies. After extracting the LLM with the best diagnostic performance from studies that tested multiple large models simultaneously, we found that the optimal LLM did not belong to the GPT series in only 6 studies. In 80% (24/30) of the studies, the researchers chose to obtain and use the corresponding LLMs directly on the official website by online access, which somewhat lowered the threshold for the use of the LLMs in the medical field and made them more accessible to the public. In total, 18 of the included studies specified the date of access or version of the LLM used. Retrieval-augmented generation (RAG) is a technique that combines information retrieval and generation to enhance task performance by incorporating relevant information into LLMs [<xref ref-type="bibr" rid="ref40">40</xref>]. RAG was mentioned in 2 of the studies, which further trained pretrained models on task-specific datasets; although RAG has been widely used in large model studies, its application in the medical field needs to be strengthened. 
Specific details can be found in <xref ref-type="table" rid="table2">Table 2</xref>.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Characteristics of the large language models (LLMs) in eligible studies.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Study</td><td align="left" valign="bottom">Optimal LLM<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup> in research</td><td align="left" valign="bottom">Issuing company</td><td align="left" valign="bottom">Access mode</td><td align="left" valign="bottom">Date accessed (version)</td><td align="left" valign="bottom">Parameter settings</td><td align="left" valign="bottom">RAG<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top">Zhang et al [<xref ref-type="bibr" rid="ref10">10</xref>]</td><td align="left" valign="top">GPT-4o</td><td align="left" valign="top">Open AI</td><td align="char" char="." valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="top">&#x2014;</td><td align="char" char="." valign="top">&#x2014;</td><td align="left" valign="top">Unused</td></tr><tr><td align="left" valign="top">Makhoul et al [<xref ref-type="bibr" rid="ref11">11</xref>]</td><td align="left" valign="top">GPT-3.5</td><td align="char" char="." valign="top">&#x2014;</td><td align="left" valign="top">Application-based ChatGPT 3.5</td><td align="left" valign="top">&#x2014;</td><td align="char" char="." valign="top">&#x2014;</td><td align="left" valign="top">Unused</td></tr><tr><td align="left" valign="top">Pillai et al [<xref ref-type="bibr" rid="ref12">12</xref>]</td><td align="left" valign="top">GPT-4</td><td align="left" valign="top">Open AI</td><td align="left" valign="top">Online access</td><td align="left" valign="top">August 12, 2023</td><td align="char" char="." 
valign="top">&#x2014;</td><td align="left" valign="top">Unused</td></tr><tr><td align="left" valign="top">Levin et al [<xref ref-type="bibr" rid="ref13">13</xref>]</td><td align="left" valign="top">Claude-2.0</td><td align="left" valign="top">Anthropic</td><td align="left" valign="top">Platform developed by Anthropic (@Poe)</td><td align="left" valign="top">&#x2014;</td><td align="char" char="." valign="top">&#x2014;</td><td align="left" valign="top">Unused</td></tr><tr><td align="left" valign="top">Lyons et al [<xref ref-type="bibr" rid="ref14">14</xref>]</td><td align="left" valign="top">GPT-4</td><td align="left" valign="top">Open AI</td><td align="left" valign="top">Online access</td><td align="left" valign="top">March 19-24, 2023</td><td align="char" char="." valign="top">&#x2014;</td><td align="left" valign="top">Unused</td></tr><tr><td align="left" valign="top">Sarangi et al [<xref ref-type="bibr" rid="ref15">15</xref>]</td><td align="left" valign="top">Bing</td><td align="left" valign="top">Microsoft</td><td align="left" valign="top">Search engine-based GPT-4</td><td align="left" valign="top">June 2023</td><td align="char" char="." valign="top">&#x2014;</td><td align="left" valign="top">Unused</td></tr><tr><td align="left" valign="top">Pasl&#x0131; et al [<xref ref-type="bibr" rid="ref16">16</xref>]</td><td align="left" valign="top">GPT-4</td><td align="left" valign="top">Open AI</td><td align="left" valign="top">Online access</td><td align="left" valign="top">September 25, 2023</td><td align="char" char="." valign="top">&#x2014;</td><td align="left" valign="top">RAG</td></tr><tr><td align="left" valign="top">Wang et al [<xref ref-type="bibr" rid="ref17">17</xref>]</td><td align="left" valign="top">GPT-4</td><td align="char" char="." valign="top">&#x2014;</td><td align="left" valign="top">Platform-based GPT-4 developed by researchers (ThyroAIGuide)</td><td align="left" valign="top">&#x2014;</td><td align="char" char="." 
valign="top">&#x2014;</td><td align="left" valign="top">Unused</td></tr><tr><td align="left" valign="top">Huang et al [<xref ref-type="bibr" rid="ref18">18</xref>]</td><td align="left" valign="top">GPT-4</td><td align="left" valign="top">Open AI</td><td align="left" valign="top">Online access</td><td align="left" valign="top">May 12, 2023</td><td align="char" char="." valign="top">&#x2014;</td><td align="left" valign="top">Unused</td></tr><tr><td align="left" valign="top">Stoneham et al [<xref ref-type="bibr" rid="ref19">19</xref>]</td><td align="left" valign="top">GPT-4</td><td align="left" valign="top">Open AI</td><td align="left" valign="top">Online access</td><td align="left" valign="top">&#x2014;</td><td align="char" char="." valign="top">&#x2014;</td><td align="left" valign="top">Unused</td></tr><tr><td align="left" valign="top">Hirosawa et al [<xref ref-type="bibr" rid="ref20">20</xref>]</td><td align="left" valign="top">GPT-4</td><td align="left" valign="top">Open AI</td><td align="left" valign="top">Online access</td><td align="left" valign="top">April 10, 2023</td><td align="char" char="." valign="top">&#x2014;</td><td align="left" valign="top">Unused</td></tr><tr><td align="left" valign="top">Horiuchi et al [<xref ref-type="bibr" rid="ref21">21</xref>]</td><td align="left" valign="top">GPT-4</td><td align="left" valign="top">Open AI</td><td align="left" valign="top">Online access</td><td align="left" valign="top">September 25, 2023</td><td align="char" char="." valign="top">&#x2014;</td><td align="left" valign="top">Unused</td></tr><tr><td align="left" valign="top">Mitsuyama et al [<xref ref-type="bibr" rid="ref22">22</xref>]</td><td align="left" valign="top">GPT-4</td><td align="left" valign="top">Open AI</td><td align="left" valign="top">Online access</td><td align="left" valign="top">May 24, 2024</td><td align="char" char="." 
valign="top">&#x2014;</td><td align="left" valign="top">Unused</td></tr><tr><td align="left" valign="top">Hirosawa et al [<xref ref-type="bibr" rid="ref23">23</xref>]</td><td align="left" valign="top">Bard</td><td align="left" valign="top">Google</td><td align="left" valign="top">Online access</td><td align="left" valign="top">June 8, 2023</td><td align="char" char="." valign="top">&#x2014;</td><td align="left" valign="top">Unused</td></tr><tr><td align="left" valign="top">Suh et al [<xref ref-type="bibr" rid="ref24">24</xref>]</td><td align="left" valign="top">GPT-4V</td><td align="left" valign="top">Open AI</td><td align="left" valign="top">Online access</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">Temperature=1</td><td align="left" valign="top">Unused</td></tr><tr><td align="left" valign="top">Fraser et al [<xref ref-type="bibr" rid="ref25">25</xref>]</td><td align="left" valign="top">GPT-3.5</td><td align="left" valign="top">Open AI</td><td align="left" valign="top">Online access</td><td align="left" valign="top">July 2023</td><td align="char" char="." valign="top">&#x2014;</td><td align="left" valign="top">Unused</td></tr><tr><td align="left" valign="top">Hirosawa et al [<xref ref-type="bibr" rid="ref26">26</xref>]</td><td align="left" valign="top">GPT-3.5</td><td align="left" valign="top">Open AI</td><td align="left" valign="top">Online access</td><td align="left" valign="top">January 5, 2023</td><td align="char" char="." valign="top">&#x2014;</td><td align="left" valign="top">Unused</td></tr><tr><td align="left" valign="top">Shemer et al [<xref ref-type="bibr" rid="ref27">27</xref>]</td><td align="left" valign="top">GPT-3.5</td><td align="left" valign="top">Open AI</td><td align="left" valign="top">Online access</td><td align="left" valign="top">March 2023</td><td align="char" char="." 
valign="top">&#x2014;</td><td align="left" valign="top">Unused</td></tr><tr><td align="left" valign="top">Mohammadi et al [<xref ref-type="bibr" rid="ref28">28</xref>]</td><td align="left" valign="top">GPT-4o</td><td align="left" valign="top">Open AI</td><td align="left" valign="top">Online access</td><td align="left" valign="top">December 2023</td><td align="char" char="." valign="top">&#x2014;</td><td align="left" valign="top">Unused</td></tr><tr><td align="left" valign="top">Arslan et al [<xref ref-type="bibr" rid="ref29">29</xref>]</td><td align="left" valign="top">GPT-4</td><td align="left" valign="top">Open AI</td><td align="left" valign="top">Online access</td><td align="left" valign="top">&#x2014;</td><td align="char" char="." valign="top">&#x2014;</td><td align="left" valign="top">Unused</td></tr><tr><td align="left" valign="top">Rojas-Carabali et al [<xref ref-type="bibr" rid="ref30">30</xref>]</td><td align="left" valign="top">GPT-4</td><td align="left" valign="top">Open AI</td><td align="left" valign="top">Online access</td><td align="left" valign="top">&#x2014;</td><td align="char" char="." valign="top">&#x2014;</td><td align="left" valign="top">Unused</td></tr><tr><td align="left" valign="top">Kaya et al [<xref ref-type="bibr" rid="ref31">31</xref>]</td><td align="left" valign="top">GPT-4</td><td align="left" valign="top">Open AI</td><td align="left" valign="top">Online access</td><td align="left" valign="top">March to July 2023</td><td align="char" char="." valign="top">&#x2014;</td><td align="left" valign="top">Unused</td></tr><tr><td align="left" valign="top">Delsoz et al [<xref ref-type="bibr" rid="ref32">32</xref>]</td><td align="left" valign="top">GPT-4</td><td align="left" valign="top">Open AI</td><td align="left" valign="top">Online access</td><td align="left" valign="top">&#x2014;</td><td align="char" char="." 
valign="top">&#x2014;</td><td align="left" valign="top">Unused</td></tr><tr><td align="left" valign="top">Ming et al [<xref ref-type="bibr" rid="ref33">33</xref>]</td><td align="left" valign="top">GPT-4</td><td align="left" valign="top">Open AI</td><td align="left" valign="top">Online access</td><td align="left" valign="top">March 5-18, 2024</td><td align="char" char="." valign="top">&#x2014;</td><td align="left" valign="top">Unused</td></tr><tr><td align="left" valign="top">Nakaura et al [<xref ref-type="bibr" rid="ref34">34</xref>]</td><td align="left" valign="top">Bing</td><td align="left" valign="top">Microsoft</td><td align="left" valign="top">Search engine-based GPT-4</td><td align="left" valign="top">&#x2014;</td><td align="char" char="." valign="top">&#x2014;</td><td align="left" valign="top">Unused</td></tr><tr><td align="left" valign="top">Ito et al [<xref ref-type="bibr" rid="ref35">35</xref>]</td><td align="left" valign="top">GPT-4</td><td align="left" valign="top">Open AI</td><td align="left" valign="top">Online access</td><td align="left" valign="top">March 15, 2023</td><td align="char" char="." valign="top">&#x2014;</td><td align="left" valign="top">Unused</td></tr><tr><td align="left" valign="top">Gunes et al [<xref ref-type="bibr" rid="ref36">36</xref>]</td><td align="left" valign="top">Claude 3 Opus</td><td align="left" valign="top">Anthropic</td><td align="left" valign="top">Online access</td><td align="left" valign="top">May 2024</td><td align="char" char="." valign="top">&#x2014;</td><td align="left" valign="top">Unused</td></tr><tr><td align="left" valign="top">Delsoz et al [<xref ref-type="bibr" rid="ref37">37</xref>]</td><td align="left" valign="top">GPT-3.5</td><td align="left" valign="top">Open AI</td><td align="left" valign="top">Online access</td><td align="left" valign="top">&#x2014;</td><td align="char" char="." 
valign="top">&#x2014;</td><td align="left" valign="top">Unused</td></tr><tr><td align="left" valign="top">Liu et al [<xref ref-type="bibr" rid="ref38">38</xref>]</td><td align="left" valign="top">GPT-3.5</td><td align="left" valign="top">Open AI</td><td align="left" valign="top">Online access</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">Temperature=0</td><td align="left" valign="top">Unused</td></tr><tr><td align="left" valign="top">Li et al [<xref ref-type="bibr" rid="ref39">39</xref>]</td><td align="left" valign="top">Claude 3.5 Sonnet</td><td align="left" valign="top">Anthropic</td><td align="left" valign="top">Online access</td><td align="left" valign="top">June 13 to July 5, 2024</td><td align="left" valign="top">Temperature=1&#x00D7;10<sup>-10</sup></td><td align="left" valign="top">RAG</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>LLM: large language model.</p></fn><fn id="table2fn2"><p><sup>b</sup>RAG: retrieval-augmented generation.</p></fn><fn id="table2fn3"><p><sup>c</sup>Not available.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-5"><title>Results of Diagnosis</title><p>The accuracy of the diagnoses made by the LLMs and the clinical professionals in the studies depends on the &#x201C;standard answer&#x201D; mentioned in the literature. The comparison is based on how their answers align with this standard. The &#x201C;standard answer&#x201D; in the included studies consists of the final diagnoses recorded in patient medical records or case reports, predetermined answers set by case developers, and diagnoses established by experienced clinical experts in the relevant departments.</p></sec><sec id="s3-6"><title>Application of LLMs in Clinical Diagnosis</title><p>The most common model task was the free text task, which appeared in 19 articles, while only 1 article involved a choice task. 
English was used for input and output in all but 2 articles: one used Hebrew for prompting, and the other used Chinese to compare model diagnostic performance. In LLMs, a prompt is an input that guides the model toward specific tasks or specific outputs, typically including elements such as instructions (task descriptions), context (background information), examples, input data, output instructions, and roles [<xref ref-type="bibr" rid="ref41">41</xref>,<xref ref-type="bibr" rid="ref42">42</xref>]. When LLMs are used for case diagnosis, the most frequently used elements are commands and input data, which primarily include patient basic information, complaints, medical history, physical examination, and laboratory tests. The output content mainly consists of diagnostic lists or triage recommendations. The diagnostic accuracy of health care professionals in each study was evaluated by investigators or experts in relevant fields.</p><p>In studies where multiple LLMs were used to diagnose sample cases, only the data for the model with the best diagnostic performance were recorded. Of these studies, 80% (24/30) reported that the ChatGPT series models demonstrated the best diagnostic performance. Several investigators noted that the diagnostic accuracy of GPT models was comparable with that of physicians and did not show significant differences. Specific details can be found in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>.</p></sec><sec id="s3-7"><title>Comparison of Diagnostic Accuracy Between LLMs and Health Care Professionals</title><p>Pooling the data revealed that 70% (21/30) of the studies used the frequency of correct diagnoses in model responses as the primary evaluation indicator of clinical diagnostic accuracy, excluding other auxiliary indicators. All accuracy results were expressed as percentages. 
For the optimal model, the accuracy of the primary diagnosis ranged from 25% to 97.8%, while triage accuracy ranged from 66.5% to 98%. In medical practice, the diagnostic agreement criterion is usually set at over 80%. The GPT series LLMs achieved diagnostic accuracy greater than 80% in clinical tasks across 3 studies in ophthalmology, 2 studies in general medicine, and 1 study each in radiology, emergency medicine, and general practice. Among the 9 studies focused on ophthalmic case diagnosis, the diagnostic performance was generally high, with 77.8% (7/9) of the large models showing diagnostic accuracy comparable with that of health care professionals.</p><p>In these cases, health care professionals received the same prompting words as the LLMs. In 60% (18/30) of the studies, control group participants were blinded to the true nature and goals of the study until it was completed. The diagnostic accuracy of health care professionals was compared with the outcomes of LLMs. The results showed that in 66.7% (20/30) of the studies, professionals had higher diagnostic accuracy than the models. In 33.3% (10/30) of the studies, the LLMs, specifically ChatGPT, had higher diagnostic accuracy than humans. 
The specific diagnostic accuracy comparisons are detailed in <xref ref-type="table" rid="table3">Table 3</xref>.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Comparison of diagnostic accuracy between large language models (LLMs) and clinical professionals.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom" colspan="2">Specialty and study</td><td align="left" valign="bottom">Clinical professionals</td><td align="left" valign="bottom" colspan="6">Evaluation results (LLMs vs clinical professionals), %</td></tr><tr><td align="left" valign="bottom" colspan="2"/><td align="left" valign="bottom"/><td align="left" valign="bottom">a<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td><td align="left" valign="bottom">b<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td><td align="left" valign="bottom">c<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup></td><td align="left" valign="bottom">d<sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup></td><td align="left" valign="bottom">e<sup><xref ref-type="table-fn" rid="table3fn5">e</xref></sup></td><td align="left" valign="bottom">f<sup><xref ref-type="table-fn" rid="table3fn6">f</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top" colspan="9">Ophthalmology</td></tr><tr><td align="left" valign="top" rowspan="9"/><td align="left" valign="top">Zhang et al [<xref ref-type="bibr" rid="ref10">10</xref>]</td><td align="left" valign="top">3</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table3fn7">g</xref></sup></td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">55 vs 74.7</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Lyons et al [<xref ref-type="bibr" rid="ref14">14</xref>]</td><td align="left" 
valign="top">8</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">93 vs 95<sup><xref ref-type="table-fn" rid="table3fn8">h</xref></sup></td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">98.0 vs 86.0</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Huang et al [<xref ref-type="bibr" rid="ref18">18</xref>]</td><td align="left" valign="top">15</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">50.4 vs 50.3</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Shemer et al [<xref ref-type="bibr" rid="ref27">27</xref>]</td><td align="left" valign="top">6</td><td align="left" valign="top">68 vs 90</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Rojas-Carabali et al [<xref ref-type="bibr" rid="ref30">30</xref>]</td><td align="left" valign="top">5</td><td align="left" valign="top">64 vs 85.6</td><td align="left" valign="top">72 vs 89.6<sup><xref ref-type="table-fn" rid="table3fn8">h</xref></sup></td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Delsoz et al [<xref ref-type="bibr" rid="ref32">32</xref>]</td><td align="left" valign="top">3</td><td align="left" valign="top">85 vs 96.7</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" 
valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Ming et al [<xref ref-type="bibr" rid="ref33">33</xref>]</td><td align="left" valign="top">3</td><td align="left" valign="top">59.6 vs 60.6</td><td align="left" valign="top">76 vs 65.4<sup><xref ref-type="table-fn" rid="table3fn8">h</xref></sup></td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Delsoz et al [<xref ref-type="bibr" rid="ref37">37</xref>]</td><td align="left" valign="top">3</td><td align="left" valign="top">72.7 vs 66.6</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Liu et al [<xref ref-type="bibr" rid="ref38">38</xref>]</td><td align="left" valign="top">2</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">80.1 vs 89.4</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top" colspan="9">Internal medicine</td></tr><tr><td align="left" valign="top" rowspan="6"/><td align="left" valign="top">Hirosawa et al [<xref ref-type="bibr" rid="ref20">20</xref>]</td><td align="left" valign="top">3</td><td align="left" valign="top">60 vs 50</td><td align="left" valign="top">81 vs 67<sup><xref ref-type="table-fn" rid="table3fn9">i</xref></sup>; 83 vs 75<sup><xref ref-type="table-fn" rid="table3fn10">j</xref></sup></td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Mitsuyama et al [<xref 
ref-type="bibr" rid="ref22">22</xref>]</td><td align="left" valign="top">5</td><td align="left" valign="top">73 vs 69.4</td><td align="left" valign="top">94 vs 81.6<sup><xref ref-type="table-fn" rid="table3fn8">h</xref></sup></td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Hirosawa et al [<xref ref-type="bibr" rid="ref23">23</xref>]</td><td align="left" valign="top">5</td><td align="left" valign="top">40.2 vs 64.6</td><td align="left" valign="top">53.7 vs 78<sup><xref ref-type="table-fn" rid="table3fn9">i</xref></sup>; 56.1 vs 82.9<sup><xref ref-type="table-fn" rid="table3fn10">j</xref></sup></td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Hirosawa et al [<xref ref-type="bibr" rid="ref26">26</xref>]</td><td align="left" valign="top">2</td><td align="left" valign="top">53.3 vs 93.3</td><td align="left" valign="top">83.3 vs 98.3<sup><xref ref-type="table-fn" rid="table3fn9">i</xref></sup></td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Nakaura et al [<xref ref-type="bibr" rid="ref34">34</xref>]</td><td align="left" valign="top">1</td><td align="left" valign="top">54 vs 100</td><td align="left" valign="top">96 vs 100<sup><xref ref-type="table-fn" rid="table3fn9">i</xref></sup></td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Li et al [<xref ref-type="bibr" rid="ref39">39</xref>]</td><td align="left" 
valign="top">5</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">93.8 vs 99.6</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top" colspan="9">Emergency department</td></tr><tr><td align="left" valign="top" rowspan="3"/><td align="left" valign="top">Sinan Pasl&#x0131; et al [<xref ref-type="bibr" rid="ref16">16</xref>]</td><td align="left" valign="top">Team</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">95.6 vs 92.8</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Fraser et al [<xref ref-type="bibr" rid="ref25">25</xref>]</td><td align="left" valign="top">3</td><td align="left" valign="top">40 vs 47</td><td align="left" valign="top">63 vs 69<sup><xref ref-type="table-fn" rid="table3fn8">h</xref></sup></td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Arslan et al [<xref ref-type="bibr" rid="ref29">29</xref>]</td><td align="left" valign="top">Team</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">66.5 vs 65.2</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top" colspan="9">General medicine</td></tr><tr><td align="left" valign="top" rowspan="3"/><td align="left" valign="top">Sarangi et al [<xref ref-type="bibr" rid="ref15">15</xref>]</td><td align="left" valign="top">2</td><td align="left" valign="top">53.3 vs 60.4</td><td align="left" 
valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Suh et al [<xref ref-type="bibr" rid="ref24">24</xref>]</td><td align="left" valign="top">8</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">48.9 vs 60.5<sup><xref ref-type="table-fn" rid="table3fn8">h</xref></sup></td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Ito et al [<xref ref-type="bibr" rid="ref35">35</xref>]</td><td align="left" valign="top">3</td><td align="left" valign="top">97.8 vs 91.1</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">66.7 vs 66.7</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top" colspan="9">Orthopedics</td></tr><tr><td align="left" valign="top" rowspan="2"/><td align="left" valign="top">Horiuchi et al [<xref ref-type="bibr" rid="ref21">21</xref>]</td><td align="left" valign="top">2</td><td align="left" valign="top">43 vs 47</td><td align="left" valign="top">58 vs 62.5<sup><xref ref-type="table-fn" rid="table3fn8">h</xref></sup></td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Mohammadi et al [<xref ref-type="bibr" rid="ref28">28</xref>]</td><td align="left" valign="top">2</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td 
align="char" char="." valign="top">0.73 vs 0.74</td></tr><tr><td align="left" valign="top" colspan="9">Cardiothoracic</td></tr><tr><td align="left" valign="top" rowspan="2"/><td align="left" valign="top">Kaya et al [<xref ref-type="bibr" rid="ref31">31</xref>]</td><td align="left" valign="top">3</td><td align="left" valign="top">81 vs 91.3</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">85 vs 92.7</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Gunes et al [<xref ref-type="bibr" rid="ref36">36</xref>]</td><td align="left" valign="top">2</td><td align="left" valign="top">70.3 vs 46.8</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top" colspan="9">Otolaryngology</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Makhoul et al [<xref ref-type="bibr" rid="ref11">11</xref>]</td><td align="left" valign="top">20</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">70.8 vs 71.3<sup><xref ref-type="table-fn" rid="table3fn8">h</xref></sup></td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top" colspan="9">Immunology</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Pillai et al [<xref ref-type="bibr" rid="ref12">12</xref>]</td><td align="left" valign="top">1</td><td align="left" valign="top">25 vs 47.5</td><td align="left" valign="top">45 vs 60<sup><xref ref-type="table-fn" rid="table3fn9">i</xref></sup>; 47.5 vs 75<sup><xref ref-type="table-fn" rid="table3fn10">j</xref></sup></td><td 
align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top" colspan="9">Neonatology</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Levin et al [<xref ref-type="bibr" rid="ref13">13</xref>]</td><td align="left" valign="top">32</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">70.8 vs 82.5</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top" colspan="9">Thyroid</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Wang et al [<xref ref-type="bibr" rid="ref17">17</xref>]</td><td align="left" valign="top">40</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">73.6 vs 87.4</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top" colspan="9">Dermatology</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Stoneham et al [<xref ref-type="bibr" rid="ref19">19</xref>]</td><td align="left" valign="top">1</td><td align="left" valign="top">56 vs 83</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>Frequency of correct primary diagnosis (answer).</p></fn><fn id="table3fn2"><p><sup>b</sup>Frequency of correct diagnosis in the 3, 5, or 10 differential diagnoses.</p></fn><fn id="table3fn3"><p><sup>c</sup>Accuracy score.</p></fn><fn id="table3fn4"><p><sup>d</sup>Triage 
accuracy.</p></fn><fn id="table3fn5"><p><sup>e</sup><italic>F</italic><sub>1</sub>-score.</p></fn><fn id="table3fn6"><p><sup>f</sup>AUC: area under the curve.</p></fn><fn id="table3fn7"><p><sup>g</sup>Not available.</p></fn><fn id="table3fn8"><p><sup>h</sup>Frequency of correct diagnosis in the 3 differential diagnoses.</p></fn><fn id="table3fn9"><p><sup>i</sup>Frequency of correct diagnosis in the 5 differential diagnoses.</p></fn><fn id="table3fn10"><p><sup>j</sup>Frequency of correct diagnosis in the 10 differential diagnoses.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-8"><title>Meta-Analysis</title><p>Although this paper synthesizes over 4000 clinical cases, these cases exhibit significant heterogeneity in terms of clinical departments, diagnostic methodologies, and evaluation metrics. Due to these inherent differences, only 18 studies that used primary diagnostic accuracy as the evaluation metric were included in a meta-analysis. The analysis revealed that clinical professionals generally outperformed LLMs in diagnostic accuracy across various conditions, as shown in <xref ref-type="fig" rid="figure2">Figure 2</xref>. The <italic>P</italic> value was less than 0.05, and the <italic>I</italic>&#x00B2; value was 77%, indicating significant heterogeneity among the studies. Sensitivity analysis did not significantly improve the heterogeneity. 
Subgroup analyses by clinical department showed reduced heterogeneity in ophthalmology-related research, yet results still favored the diagnostic accuracy of ophthalmology professionals over LLMs.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Forest plot comparing diagnostic accuracy of large language models (LLMs) and clinical professionals [<xref ref-type="bibr" rid="ref37">37</xref>,<xref ref-type="bibr" rid="ref32">32</xref>,<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref36">36</xref>,<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref35">35</xref>,<xref ref-type="bibr" rid="ref31">31</xref>,<xref ref-type="bibr" rid="ref33">33</xref>,<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref34">34</xref>,<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref30">30</xref>,<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref27">27</xref>,<xref ref-type="bibr" rid="ref19">19</xref>].</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e64963_fig02.png"/></fig></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>In this systematic review, we analyzed the diagnostic accuracy of LLMs compared with clinical professionals, encompassing various LLMs and common medical specialties. Although the results typically indicated superior diagnostic accuracy among professionals, this study compiled the methodologies, functionalities, and outcomes of using LLMs in medical diagnostics. 
It affirmed the diagnostic capabilities of generic LLMs, providing evidence for their potential as health care assistants.</p></sec><sec id="s4-2"><title>Application of LLMs in Clinical Diagnosis Still in Exploratory Stage</title><p>This review includes only peer-reviewed and published literature, so the models examined in the included studies primarily use text-based input and output for diagnostic tasks. However, with the advancement of large models, multimodal capabilities have also been integrated [<xref ref-type="bibr" rid="ref43">43</xref>]. Some preprint studies [<xref ref-type="bibr" rid="ref44">44</xref>,<xref ref-type="bibr" rid="ref45">45</xref>] have explored using GPT-4V, incorporating imaging data into input prompts. Notably, adding images to LLMs did not improve diagnostic performance. In a study by Horiuchi et al [<xref ref-type="bibr" rid="ref44">44</xref>], ChatGPT-4, which relied solely on text prompts, achieved higher diagnostic accuracy compared to GPT-4V, which combined text and images. Without few-shot learning, LLMs may struggle with image recognition and interpretation, sometimes leading to counterproductive outcomes.</p><p>Currently, the performance of general LLMs continues to improve, showing strong results in health care question answering, text classification, and clinical concept extraction [<xref ref-type="bibr" rid="ref46">46</xref>]. However, these studies remain experimental and laboratory-based. Issues such as the interpretability of model responses and medical ethics pose significant challenges to applying these models in real clinical settings. Furthermore, the trust and acceptance of AI models by clinicians directly affect their adoption and implementation. 
Therefore, education and training programs are crucial for enhancing physicians&#x2019; AI literacy [<xref ref-type="bibr" rid="ref47">47</xref>].</p></sec><sec id="s4-3"><title>Evolution of Artificial Intelligence in Clinical Diagnosis</title><p>The evolution of AI in clinical diagnosis has progressed from simple specialized systems to complex deep learning models. Early AI systems were based on fixed rules and expert knowledge bases. While these systems achieved some success in specific tasks, they had limited scalability and flexibility. The advancement of deep learning technologies, particularly the emergence of LLMs, has ushered AI applications in the health care sector into a new era [<xref ref-type="bibr" rid="ref48">48</xref>,<xref ref-type="bibr" rid="ref49">49</xref>].</p><p>LLM can learn from vast amounts of medical data to autonomously discover and summarize diagnostic rules, significantly enhancing diagnostic accuracy and reliability. The development of RAG technology and fine-tuning techniques has further enabled LLM to acquire advanced domain expertise and effectively perform specialized tasks.</p></sec><sec id="s4-4"><title>Ethics of Artificial Intelligence in Clinical Diagnosis</title><p>Although the pace of artificial intelligence development is swift, its broad implementation in clinical settings continues to encounter numerous obstacles, including concerns over data privacy, accountability, and ethics. Consequently, numerous scholars [<xref ref-type="bibr" rid="ref50">50</xref>-<xref ref-type="bibr" rid="ref53">53</xref>] underscore the imperative of utmost caution in using these technologies. Advances in the future will necessitate not only technological innovations but also comprehensive enhancements in legal and ethical frameworks to ensure that AI technology is safely and effectively woven into clinical diagnostic processes. 
In deploying LLMs within actual clinical workflows, it is crucial to first guarantee the transparency of all used data and secure patients&#x2019; informed consent. In addition, to tackle potential biases within AI models, periodic audits are advised to identify and amend any discrepancies. Furthermore, to safeguard patient safety and adhere to regulatory demands, medical institutions should work alongside legal and ethical experts to establish stringent guidelines and oversight mechanisms for AI use. For instance, forming an ethics committee to assess and monitor AI applications could ensure compliance with ethical standards and legal requirements. These targeted measures are essential to surmount existing challenges and foster the successful incorporation of AI technologies in clinical diagnostics.</p></sec><sec id="s4-5"><title>Application of LLMs in Specific Medical Fields</title><p>The application of LLMs in the medical field is gradually expanding, especially in imaging diagnosis, clinical decision support, and personalized treatment planning. Due to their specific needs and challenges, each medical field shows different ways and effects of LLMs&#x2019; application.</p><p>Ophthalmology is one of the pioneers of LLMs&#x2019; applications. In ophthalmic diagnosis, imaging data such as fundus images, retinal scans are typically complex, but LLMs excel in processing and analyzing these types of data [<xref ref-type="bibr" rid="ref45">45</xref>,<xref ref-type="bibr" rid="ref54">54</xref>]. Research has shown that LLMs can identify minor lesions in fundus images and diagnose conditions such as glaucoma and macular degeneration [<xref ref-type="bibr" rid="ref55">55</xref>,<xref ref-type="bibr" rid="ref56">56</xref>]. Ophthalmic diagnostics rely not only on imaging but also on additional data such as patients&#x2019; genetic information and blood sugar levels. 
In the future, LLMs could integrate these multimodal data to achieve more accurate disease predictions through personalized treatment. Particularly in resource-limited areas, easily accessible LLMs with low usage thresholds could replace some ophthalmologists in preliminary screenings, further providing efficient diagnostic support in remote regions.</p><p>There are many internal medicine diseases that require long-term follow-up and monitoring. LLMs can process all historical data of patients simultaneously and update personalized treatment plans, assisting clinical professionals in making more beneficial decisions [<xref ref-type="bibr" rid="ref57">57</xref>-<xref ref-type="bibr" rid="ref59">59</xref>]. In the future, LLMs will be paired with wearable devices to monitor patients&#x2019; health in real time, predict potential risks through data analysis, and provide early intervention for patients with medical diseases, thereby reducing the incidence and recurrence of the disease.</p><p>In the fields of otolaryngology [<xref ref-type="bibr" rid="ref60">60</xref>,<xref ref-type="bibr" rid="ref61">61</xref>] and dermatology [<xref ref-type="bibr" rid="ref62">62</xref>,<xref ref-type="bibr" rid="ref63">63</xref>], LLMs have been used to analyze imaging data for detecting lesions in respective areas. The latest models now offer voice input features, allowing patients to use the model anytime and anywhere to help in the early detection of speech disorders and vocal cord issues. In the future, integrating voice recognition with physiological data can also assist physicians in more accurately locating lesion areas during otolaryngological surgeries, thus improving treatment efficacy. 
Furthermore, by combining images of skin lesions with patients&#x2019; genetic data, LLMs can help predict the risk of dermatological conditions and provide early warnings.</p></sec><sec id="s4-6"><title>Exploration of the Use of LLMs in Various Clinical Departments</title><p>Currently, extensive research in fields such as ophthalmology, internal medicine, and radiology has demonstrated the substantial potential of LLMs in clinical diagnostics and pathological analysis. These models have even been implemented in some hospitals. Many clinical professionals are actively exploring how to integrate these technologies into their daily diagnostic and treatment routines.</p><p>However, the application of LLMs in other specialized areas remains limited, and research in these fields appears to be lacking. Several reasons account for this disparity: First, the departments mentioned above primarily focus on diagnostic issues, providing rich training data for large models, especially in terms of imaging and case data. Second, the main challenges these departments face in clinical practice, such as accurate diagnosis and disease prediction, are areas where LLMs can excel. In contrast, other departments such as surgery, although also using imaging data, face complexities in surgical and procedural tasks that hinder the maturity of AI applications. Gynecology has seen some applications of image recognition, but lacks depth in research and sufficient data accumulation, making model training challenging. In addition, real-world factors such as data privacy protection and technology dissemination also restrict the application of large models in certain departments.</p></sec><sec id="s4-7"><title>Future Directions</title><p>The &#x201C;human-AI collaboration&#x201D; model involves an initial diagnosis provided by AI, which is then reviewed and confirmed by clinicians. 
AI&#x2019;s capability to analyze clinical data in real time enables it to offer personalized monitoring plans based on the specific conditions of patients. This continuous tracking of patient health and treatment outcomes helps achieve the goals of personalized medicine and precise diagnosis [<xref ref-type="bibr" rid="ref64">64</xref>,<xref ref-type="bibr" rid="ref65">65</xref>]. In addition, AI can provide customized services and recommendations based on user preferences and backgrounds, enhancing user experience and effectiveness. This model combines the rapid processing capabilities of AI with the expert judgment of clinicians, improving the efficiency and reliability of clinical trials. It also enhances data analysis and patient management, offering significant advantages in cost reduction, resource use, and ensuring the reliability of trial results.</p><p>Although LLMs are not inherently designed for clinical diagnostic tasks, advancements in technology and data accumulation are expected to improve their performance in clinical settings. Techniques such as large-scale medical literature analysis, specific clinical data training, task-specific fine-tuning, personalized training for particular scenarios, and integration with APIs or other supplementary software tools are anticipated to enhance the diagnostic support and treatment recommendations provided by these models [<xref ref-type="bibr" rid="ref66">66</xref>,<xref ref-type="bibr" rid="ref67">67</xref>]. Hybrid models could be developed by combining rule-based clinical decision support systems with the pattern recognition capabilities of LLMs. For example, Vision China 2023 introduced Eye GPT [<xref ref-type="bibr" rid="ref68">68</xref>], a system that integrates ophthalmic medical knowledge with LLM. This system aims to assist clinicians in disease diagnosis and improve medical efficiency by combining extensive ophthalmic information with powerful computational capabilities. 
This innovation in integrating large models with specialized clinical fields is expected to play a crucial role in future clinical applications and provide research directions for other medical specialties.</p></sec><sec id="s4-8"><title>Limitations</title><p>This study has several limitations. First, the inclusion criteria restricted the review to studies comparing the diagnostic accuracy of LLMs with that of clinical health care professionals using case groups. This limitation may affect the comprehensiveness of the review and introduce selection bias. In addition, there is no specialized tool for assessing the risk of bias in literature related to LLMs. Although PROBAST was used to evaluate the quality of the included studies, its focus on diagnostic accuracy may influence the evaluation results. Finally, significant heterogeneity among the studies was observed, with variations in outcome measures potentially related to differences in intervention subjects, prompt inputs, and information modalities. Further exploration of LLM diagnostic performance is needed through large-scale, multicenter, and high-quality cross-sectional and cohort studies.</p></sec><sec id="s4-9"><title>Conclusions</title><p>This systematic review included 20 studies comparing the diagnostic accuracy of LLMs with that of health care professionals, encompassing various generative AI models and medical specialties. The findings indicate that while LLMs still have a long way to go in accurately diagnosing real-world clinical scenarios and currently lack the level of understanding of human experts, they undeniably possess significant potential as health care assistants. 
With ongoing advancements and optimizations in technology, it is anticipated that LLMs will play an increasingly important role in future clinical diagnostics.</p></sec></sec></body><back><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">CINAHL</term><def><p>Cumulative Index to Nursing and Allied Health Literature</p></def></def-item><def-item><term id="abb3">CNKI</term><def><p>China National Knowledge Infrastructure</p></def></def-item><def-item><term id="abb4">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb5">MeSH</term><def><p>Medical Subject Heading</p></def></def-item><def-item><term id="abb6">PRISMA-DTA</term><def><p>Preferred Reporting Items for Systematic Reviews and Meta-analysis of Diagnostic Test Accuracy Studies</p></def></def-item><def-item><term id="abb7">PROBAST</term><def><p>Prediction Model Risk of Bias Assessment Tool</p></def></def-item><def-item><term id="abb8">RAG</term><def><p>retrieval-augmented generation</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Vaswani</surname><given-names>A</given-names> </name><name name-style="western"><surname>Shazeer</surname><given-names>N</given-names> </name><name name-style="western"><surname>Parmar</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Attention is all you need</article-title><conf-name>NIPS&#x2019;17: Proceedings of the 31st International Conference on Neural Information Processing Systems</conf-name><conf-date>Dec 4-9, 2017</conf-date><conf-loc>Long Beach California USA</conf-loc></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wentao</surname><given-names>S</given-names> </name><name name-style="western"><surname>Ruixiao</surname><given-names>L</given-names> </name><name name-style="western"><surname>Tianxiang</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Large language models: principles, implementation, and progress</article-title><source>J Comp Res Dev</source><issue>2</issue><fpage>351</fpage><lpage>361</lpage><pub-id pub-id-type="doi">10.7544/issn1000-1239.202330303</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Guo</surname><given-names>H</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>P</given-names> </name><name name-style="western"><surname>Lu</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Research on a massively large artificial intelligence model and its application in medicine</article-title><source>Sci Sin-Vitae</source><year>2024</year><month>01</month><day>1</day><volume>54</volume><pub-id pub-id-type="doi">10.1360/SSV-2022-0298</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><article-title>Our stubborn quest for diagnostic certainty</article-title><source>N Engl J Med</source><year>1989</year><month>11</month><day>2</day><volume>321</volume><issue>18</issue><fpage>1272</fpage><lpage>1273</lpage><pub-id pub-id-type="doi">10.1056/NEJM198911023211820</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nour</surname><given-names>M</given-names> </name><name name-style="western"><surname>Senturk</surname><given-names>U</given-names> </name><name 
name-style="western"><surname>Polat</surname><given-names>K</given-names> </name></person-group><article-title>Diagnosis and classification of Parkinson&#x2019;s disease using ensemble learning and 1D-PDCovNN</article-title><source>Comput Biol Med</source><year>2023</year><month>07</month><volume>161</volume><fpage>107031</fpage><pub-id pub-id-type="doi">10.1016/j.compbiomed.2023.107031</pub-id><pub-id pub-id-type="medline">37211002</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Thirunavukarasu</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Ting</surname><given-names>DSJ</given-names> </name><name name-style="western"><surname>Elangovan</surname><given-names>K</given-names> </name><name name-style="western"><surname>Gutierrez</surname><given-names>L</given-names> </name><name name-style="western"><surname>Tan</surname><given-names>TF</given-names> </name><name name-style="western"><surname>Ting</surname><given-names>DSW</given-names> </name></person-group><article-title>Large language models in medicine</article-title><source>Nat Med</source><year>2023</year><month>08</month><volume>29</volume><issue>8</issue><fpage>1930</fpage><lpage>1940</lpage><pub-id pub-id-type="doi">10.1038/s41591-023-02448-8</pub-id><pub-id pub-id-type="medline">37460753</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>McInnes</surname><given-names>MDF</given-names> </name><name name-style="western"><surname>Moher</surname><given-names>D</given-names> </name><name name-style="western"><surname>Thombs</surname><given-names>BD</given-names> </name><etal/></person-group><article-title>Preferred Reporting Items for a Systematic Review and Meta-analysis of Diagnostic Test Accuracy Studies: the PRISMA-DTA 
statement</article-title><source>JAMA</source><year>2018</year><month>01</month><day>23</day><volume>319</volume><issue>4</issue><fpage>388</fpage><lpage>396</lpage><pub-id pub-id-type="doi">10.1001/jama.2017.19163</pub-id><pub-id pub-id-type="medline">29362800</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wolff</surname><given-names>RF</given-names> </name><name name-style="western"><surname>Moons</surname><given-names>KGM</given-names> </name><name name-style="western"><surname>Riley</surname><given-names>RD</given-names> </name><etal/></person-group><article-title>PROBAST: a tool to assess the risk of bias and applicability of prediction model studies</article-title><source>Ann Intern Med</source><year>2019</year><month>01</month><day>1</day><volume>170</volume><issue>1</issue><fpage>51</fpage><lpage>58</lpage><pub-id pub-id-type="doi">10.7326/M18-1376</pub-id><pub-id pub-id-type="medline">30596875</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Takita</surname><given-names>H</given-names> </name><name name-style="western"><surname>Kabata</surname><given-names>D</given-names> </name><name name-style="western"><surname>Walston</surname><given-names>SL</given-names> </name><etal/></person-group><article-title>Diagnostic performance comparison between generative AI and physicians: a systematic review and meta-analysis</article-title><source>medRxiv</source><comment>Preprint posted online on  Mar 18, 2024</comment><pub-id pub-id-type="doi">10.1101/2024.01.20.24301563</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>J</given-names> </name><name 
name-style="western"><surname>Ma</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>R</given-names> </name><etal/></person-group><article-title>A comparative study of GPT-4o and human ophthalmologists in glaucoma diagnosis</article-title><source>Sci Rep</source><year>2024</year><volume>14</volume><issue>1</issue><fpage>30385</fpage><pub-id pub-id-type="doi">10.1038/s41598-024-80917-x</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Makhoul</surname><given-names>M</given-names> </name><name name-style="western"><surname>Melkane</surname><given-names>AE</given-names> </name><name name-style="western"><surname>Khoury</surname><given-names>PE</given-names> </name><name name-style="western"><surname>Hadi</surname><given-names>CE</given-names> </name><name name-style="western"><surname>Matar</surname><given-names>N</given-names> </name></person-group><article-title>A cross-sectional comparative study: ChatGPT 3.5 versus diverse levels of medical experts in the diagnosis of ENT diseases</article-title><source>Eur Arch Otorhinolaryngol</source><year>2024</year><month>05</month><volume>281</volume><issue>5</issue><fpage>2717</fpage><lpage>2721</lpage><pub-id pub-id-type="doi">10.1007/s00405-024-08509-z</pub-id><pub-id pub-id-type="medline">38365990</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pillai</surname><given-names>J</given-names> </name><name name-style="western"><surname>Pillai</surname><given-names>K</given-names> </name></person-group><article-title>Accuracy of generative artificial intelligence models in differential diagnoses of familial Mediterranean fever and deficiency of Interleukin-1 receptor antagonist</article-title><source>J Transl 
Autoimmun</source><year>2023</year><month>12</month><volume>7</volume><fpage>100213</fpage><pub-id pub-id-type="doi">10.1016/j.jtauto.2023.100213</pub-id><pub-id pub-id-type="medline">37927888</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Levin</surname><given-names>C</given-names> </name><name name-style="western"><surname>Kagan</surname><given-names>T</given-names> </name><name name-style="western"><surname>Rosen</surname><given-names>S</given-names> </name><name name-style="western"><surname>Saban</surname><given-names>M</given-names> </name></person-group><article-title>An evaluation of the capabilities of language models and nurses in providing neonatal clinical decision support</article-title><source>Int J Nurs Stud</source><year>2024</year><month>07</month><volume>155</volume><fpage>104771</fpage><pub-id pub-id-type="doi">10.1016/j.ijnurstu.2024.104771</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lyons</surname><given-names>RJ</given-names> </name><name name-style="western"><surname>Arepalli</surname><given-names>SR</given-names> </name><name name-style="western"><surname>Fromal</surname><given-names>O</given-names> </name><name name-style="western"><surname>Choi</surname><given-names>JD</given-names> </name><name name-style="western"><surname>Jain</surname><given-names>N</given-names> </name></person-group><article-title>Artificial intelligence chatbot performance in triage of ophthalmic conditions</article-title><source>Can J Ophthalmol</source><year>2024</year><month>08</month><volume>59</volume><issue>4</issue><fpage>e301</fpage><lpage>e308</lpage><pub-id pub-id-type="doi">10.1016/j.jcjo.2023.07.016</pub-id><pub-id pub-id-type="medline">37572695</pub-id></nlm-citation></ref><ref 
id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sarangi</surname><given-names>PK</given-names> </name><name name-style="western"><surname>Narayan</surname><given-names>RK</given-names> </name><name name-style="western"><surname>Mohakud</surname><given-names>S</given-names> </name><name name-style="western"><surname>Vats</surname><given-names>A</given-names> </name><name name-style="western"><surname>Sahani</surname><given-names>D</given-names> </name><name name-style="western"><surname>Mondal</surname><given-names>H</given-names> </name></person-group><article-title>Assessing the capability of ChatGPT, Google Bard, and Microsoft Bing in solving radiology case vignettes</article-title><source>Indian J Radiol Imaging</source><year>2024</year><month>04</month><volume>34</volume><issue>2</issue><fpage>276</fpage><lpage>282</lpage><pub-id pub-id-type="doi">10.1055/s-0043-1777746</pub-id><pub-id pub-id-type="medline">38549897</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pasl&#x0131;</surname><given-names>S</given-names> </name><name name-style="western"><surname>&#x015E;ahin</surname><given-names>AS</given-names> </name><name name-style="western"><surname>Be&#x015F;er</surname><given-names>MF</given-names> </name><name name-style="western"><surname>Top&#x00E7;uo&#x011F;lu</surname><given-names>H</given-names> </name><name name-style="western"><surname>Yadigaro&#x011F;lu</surname><given-names>M</given-names> </name><name name-style="western"><surname>&#x0130;mamo&#x011F;lu</surname><given-names>M</given-names> </name></person-group><article-title>Assessing the precision of artificial intelligence in ED triage decisions: insights from a study with ChatGPT</article-title><source>Am J Emerg 
Med</source><year>2024</year><month>04</month><volume>78</volume><fpage>170</fpage><lpage>175</lpage><pub-id pub-id-type="doi">10.1016/j.ajem.2024.01.037</pub-id><pub-id pub-id-type="medline">38295466</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Traverso</surname><given-names>A</given-names> </name><name name-style="western"><surname>Dekker</surname><given-names>A</given-names> </name><name name-style="western"><surname>Qian</surname><given-names>L</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>P</given-names> </name></person-group><article-title>Assessing the role of GPT-4 in thyroid ultrasound diagnosis and treatment recommendations: enhancing interpretability with a chain of thought approach</article-title><source>Quant Imaging Med Surg</source><year>2024</year><month>02</month><day>1</day><volume>14</volume><issue>2</issue><fpage>1602</fpage><lpage>1615</lpage><pub-id pub-id-type="doi">10.21037/qims-23-1180</pub-id><pub-id pub-id-type="medline">38415150</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Huang</surname><given-names>AS</given-names> </name><name name-style="western"><surname>Hirabayashi</surname><given-names>K</given-names> </name><name name-style="western"><surname>Barna</surname><given-names>L</given-names> </name><name name-style="western"><surname>Parikh</surname><given-names>D</given-names> </name><name name-style="western"><surname>Pasquale</surname><given-names>LR</given-names> </name></person-group><article-title>Assessment of a large language model&#x2019;s responses to questions 
and cases about glaucoma and retina management</article-title><source>JAMA Ophthalmol</source><year>2024</year><month>04</month><day>1</day><volume>142</volume><issue>4</issue><fpage>371</fpage><lpage>375</lpage><pub-id pub-id-type="doi">10.1001/jamaophthalmol.2023.6917</pub-id><pub-id pub-id-type="medline">38386351</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Stoneham</surname><given-names>S</given-names> </name><name name-style="western"><surname>Livesey</surname><given-names>A</given-names> </name><name name-style="western"><surname>Cooper</surname><given-names>H</given-names> </name><name name-style="western"><surname>Mitchell</surname><given-names>C</given-names> </name></person-group><article-title>ChatGPT versus clinician: challenging the diagnostic capabilities of artificial intelligence in dermatology</article-title><source>Clin Exp Dermatol</source><year>2024</year><month>06</month><day>25</day><volume>49</volume><issue>7</issue><fpage>707</fpage><lpage>710</lpage><pub-id pub-id-type="doi">10.1093/ced/llad402</pub-id><pub-id pub-id-type="medline">37979201</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hirosawa</surname><given-names>T</given-names> </name><name name-style="western"><surname>Kawamura</surname><given-names>R</given-names> </name><name name-style="western"><surname>Harada</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>ChatGPT-generated differential diagnosis lists for complex case-derived clinical vignettes: diagnostic accuracy evaluation</article-title><source>JMIR Med Inform</source><year>2023</year><month>10</month><day>9</day><volume>11</volume><fpage>e48808</fpage><pub-id pub-id-type="doi">10.2196/48808</pub-id><pub-id 
pub-id-type="medline">37812468</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Horiuchi</surname><given-names>D</given-names> </name><name name-style="western"><surname>Tatekawa</surname><given-names>H</given-names> </name><name name-style="western"><surname>Oura</surname><given-names>T</given-names> </name><etal/></person-group><article-title>ChatGPT&#x2019;s diagnostic performance based on textual vs. visual information compared to radiologists&#x2019; diagnostic performance in musculoskeletal radiology</article-title><source>Eur Radiol</source><year>2025</year><month>01</month><volume>35</volume><issue>1</issue><fpage>506</fpage><lpage>516</lpage><pub-id pub-id-type="doi">10.1007/s00330-024-10902-5</pub-id><pub-id pub-id-type="medline">38995378</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mitsuyama</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Tatekawa</surname><given-names>H</given-names> </name><name name-style="western"><surname>Takita</surname><given-names>H</given-names> </name><etal/></person-group><article-title>Comparative analysis of GPT-4-based ChatGPT&#x2019;s diagnostic performance with radiologists using real-world radiology reports of brain tumors</article-title><source>Eur Radiol</source><year>2025</year><month>04</month><volume>35</volume><issue>4</issue><fpage>1938</fpage><lpage>1947</lpage><pub-id pub-id-type="doi">10.1007/s00330-024-11032-8</pub-id><pub-id pub-id-type="medline">39198333</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hirosawa</surname><given-names>T</given-names> </name><name 
name-style="western"><surname>Mizuta</surname><given-names>K</given-names> </name><name name-style="western"><surname>Harada</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Shimizu</surname><given-names>T</given-names> </name></person-group><article-title>Comparative evaluation of diagnostic accuracy between Google Bard and physicians</article-title><source>Am J Med</source><year>2023</year><month>11</month><volume>136</volume><issue>11</issue><fpage>1119</fpage><lpage>1123</lpage><pub-id pub-id-type="doi">10.1016/j.amjmed.2023.08.003</pub-id><pub-id pub-id-type="medline">37643659</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Suh</surname><given-names>PS</given-names> </name><name name-style="western"><surname>Shim</surname><given-names>WH</given-names> </name><name name-style="western"><surname>Suh</surname><given-names>CH</given-names> </name><etal/></person-group><article-title>Comparing diagnostic accuracy of radiologists versus GPT-4V and Gemini Pro Vision using image inputs from Diagnosis Please cases</article-title><source>Radiology</source><year>2024</year><month>07</month><volume>312</volume><issue>1</issue><fpage>e240273</fpage><pub-id pub-id-type="doi">10.1148/radiol.240273</pub-id><pub-id pub-id-type="medline">38980179</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fraser</surname><given-names>H</given-names> </name><name name-style="western"><surname>Crossland</surname><given-names>D</given-names> </name><name name-style="western"><surname>Bacher</surname><given-names>I</given-names> </name><name name-style="western"><surname>Ranney</surname><given-names>M</given-names> </name><name name-style="western"><surname>Madsen</surname><given-names>T</given-names> 
</name><name name-style="western"><surname>Hilliard</surname><given-names>R</given-names> </name></person-group><article-title>Comparison of diagnostic and triage accuracy of Ada Health and WebMD symptom checkers, ChatGPT, and physicians for patients in an emergency department: clinical data analysis study</article-title><source>JMIR Mhealth Uhealth</source><year>2023</year><month>10</month><day>3</day><volume>11</volume><fpage>e49995</fpage><pub-id pub-id-type="doi">10.2196/49995</pub-id><pub-id pub-id-type="medline">37788063</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hirosawa</surname><given-names>T</given-names> </name><name name-style="western"><surname>Harada</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Yokose</surname><given-names>M</given-names> </name><name name-style="western"><surname>Sakamoto</surname><given-names>T</given-names> </name><name name-style="western"><surname>Kawamura</surname><given-names>R</given-names> </name><name name-style="western"><surname>Shimizu</surname><given-names>T</given-names> </name></person-group><article-title>Diagnostic accuracy of differential-diagnosis lists generated by generative pretrained transformer 3 chatbot for clinical vignettes with common chief complaints: a pilot study</article-title><source>Int J Environ Res Public Health</source><year>2023</year><month>02</month><day>15</day><volume>20</volume><issue>4</issue><fpage>3378</fpage><pub-id pub-id-type="doi">10.3390/ijerph20043378</pub-id><pub-id pub-id-type="medline">36834073</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shemer</surname><given-names>A</given-names> </name><name name-style="western"><surname>Cohen</surname><given-names>M</given-names> </name><name 
name-style="western"><surname>Altarescu</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Diagnostic capabilities of ChatGPT in ophthalmology</article-title><source>Graefes Arch Clin Exp Ophthalmol</source><year>2024</year><month>07</month><volume>262</volume><issue>7</issue><fpage>2345</fpage><lpage>2352</lpage><pub-id pub-id-type="doi">10.1007/s00417-023-06363-z</pub-id><pub-id pub-id-type="medline">38183467</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mohammadi</surname><given-names>M</given-names> </name><name name-style="western"><surname>Parviz</surname><given-names>S</given-names> </name><name name-style="western"><surname>Parvaz</surname><given-names>P</given-names> </name><name name-style="western"><surname>Pirmoradi</surname><given-names>MM</given-names> </name><name name-style="western"><surname>Afzalimoghaddam</surname><given-names>M</given-names> </name><name name-style="western"><surname>Mirfazaelian</surname><given-names>H</given-names> </name></person-group><article-title>Diagnostic performance of ChatGPT in tibial plateau fracture in knee X-ray</article-title><source>Emerg Radiol</source><year>2025</year><month>02</month><volume>32</volume><issue>1</issue><fpage>59</fpage><lpage>64</lpage><pub-id pub-id-type="doi">10.1007/s10140-024-02298-y</pub-id><pub-id pub-id-type="medline">39613920</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Arslan</surname><given-names>B</given-names> </name><name name-style="western"><surname>Nuhoglu</surname><given-names>C</given-names> </name><name name-style="western"><surname>Satici</surname><given-names>MO</given-names> </name><name name-style="western"><surname>Altinbilek</surname><given-names>E</given-names> 
</name></person-group><article-title>Evaluating LLM-based generative AI tools in emergency triage: a comparative study of ChatGPT Plus, Copilot Pro, and triage nurses</article-title><source>Am J Emerg Med</source><year>2025</year><month>03</month><volume>89</volume><fpage>174</fpage><lpage>181</lpage><pub-id pub-id-type="doi">10.1016/j.ajem.2024.12.024</pub-id><pub-id pub-id-type="medline">39731895</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rojas-Carabali</surname><given-names>W</given-names> </name><name name-style="western"><surname>Cifuentes-Gonz&#x00E1;lez</surname><given-names>C</given-names> </name><name name-style="western"><surname>Wei</surname><given-names>X</given-names> </name><etal/></person-group><article-title>Evaluating the diagnostic accuracy and management recommendations of ChatGPT in uveitis</article-title><source>Ocul Immunol Inflamm</source><year>2024</year><month>10</month><volume>32</volume><issue>8</issue><fpage>1526</fpage><lpage>1531</lpage><pub-id pub-id-type="doi">10.1080/09273948.2023.2253471</pub-id><pub-id pub-id-type="medline">37722842</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kaya</surname><given-names>K</given-names> </name><name name-style="western"><surname>Gietzen</surname><given-names>C</given-names> </name><name name-style="western"><surname>Hahnfeldt</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Generative Pre-trained Transformer 4 analysis of cardiovascular magnetic resonance reports in suspected myocarditis: a multicenter study</article-title><source>J Cardiovasc Magn Reson</source><year>2024</year><volume>26</volume><issue>2</issue><fpage>101068</fpage><pub-id pub-id-type="doi">10.1016/j.jocmr.2024.101068</pub-id><pub-id 
pub-id-type="medline">39079602</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Delsoz</surname><given-names>M</given-names> </name><name name-style="western"><surname>Madadi</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Raja</surname><given-names>H</given-names> </name><etal/></person-group><article-title>Performance of ChatGPT in diagnosis of corneal eye diseases</article-title><source>Cornea</source><year>2024</year><month>05</month><day>1</day><volume>43</volume><issue>5</issue><fpage>664</fpage><lpage>670</lpage><pub-id pub-id-type="doi">10.1097/ICO.0000000000003492</pub-id><pub-id pub-id-type="medline">38391243</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ming</surname><given-names>S</given-names> </name><name name-style="western"><surname>Yao</surname><given-names>X</given-names> </name><name name-style="western"><surname>Guo</surname><given-names>X</given-names> </name><etal/></person-group><article-title>Performance of ChatGPT in ophthalmic registration and clinical diagnosis: cross-sectional study</article-title><source>J Med Internet Res</source><year>2024</year><volume>26</volume><fpage>e60226</fpage><pub-id pub-id-type="doi">10.2196/60226</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nakaura</surname><given-names>T</given-names> </name><name name-style="western"><surname>Yoshida</surname><given-names>N</given-names> </name><name name-style="western"><surname>Kobayashi</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Preliminary assessment of automated radiology report generation with generative 
pre-trained transformers: comparing results to radiologist-generated reports</article-title><source>Jpn J Radiol</source><year>2024</year><month>02</month><volume>42</volume><issue>2</issue><fpage>190</fpage><lpage>200</lpage><pub-id pub-id-type="doi">10.1007/s11604-023-01487-y</pub-id><pub-id pub-id-type="medline">37713022</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ito</surname><given-names>N</given-names> </name><name name-style="western"><surname>Kadomatsu</surname><given-names>S</given-names> </name><name name-style="western"><surname>Fujisawa</surname><given-names>M</given-names> </name><etal/></person-group><article-title>The accuracy and potential racial and ethnic biases of GPT-4 in the diagnosis and triage of health conditions: evaluation study</article-title><source>JMIR Med Educ</source><year>2023</year><month>11</month><day>2</day><volume>9</volume><fpage>e47532</fpage><pub-id pub-id-type="doi">10.2196/47532</pub-id><pub-id pub-id-type="medline">37917120</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gunes</surname><given-names>YC</given-names> </name><name name-style="western"><surname>Cesur</surname><given-names>T</given-names> </name></person-group><article-title>The diagnostic performance of large language models and general radiologists in thoracic radiology cases: a comparative study</article-title><source>J Thorac Imaging</source><year>2024</year><month>09</month><day>13</day><pub-id pub-id-type="doi">10.1097/RTI.0000000000000805</pub-id><pub-id pub-id-type="medline">39269227</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Delsoz</surname><given-names>M</given-names> </name><name name-style="western"><surname>Raja</surname><given-names>H</given-names> </name><name name-style="western"><surname>Madadi</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>The use of ChatGPT to assist in diagnosing glaucoma based on clinical case reports</article-title><source>Ophthalmol Ther</source><year>2023</year><month>12</month><volume>12</volume><issue>6</issue><fpage>3121</fpage><lpage>3132</lpage><pub-id pub-id-type="doi">10.1007/s40123-023-00805-x</pub-id><pub-id pub-id-type="medline">37707707</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Shao</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Uncovering language disparity of ChatGPT in healthcare: non-English clinical environment for retinal vascular disease classification</article-title><source>medRxiv</source><comment>Preprint posted online on  Jul 14, 2023</comment><pub-id pub-id-type="doi">10.1101/2023.06.28.23291931</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chao</surname><given-names>LI</given-names> </name><name name-style="western"><surname>Youmei</surname><given-names>C</given-names> </name><name name-style="western"><surname>Yani</surname><given-names>D</given-names> </name><name name-style="western"><surname>Yaoping</surname><given-names>C</given-names> </name><name name-style="western"><surname>Xiuzhen</surname><given-names>C</given-names> </name><name 
name-style="western"><surname>Jie</surname><given-names>Q</given-names> </name></person-group><article-title>Evaluation of the performance of generative artificial intelligence in generating radiology reports</article-title><source>Journal of New Medicine</source><year>2024</year><volume>55</volume><issue>11</issue><fpage>853</fpage><lpage>860</lpage><pub-id pub-id-type="doi">10.3969/j.issn.0253-9802.2024.11.001</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Zhao</surname><given-names>P</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>Q</given-names> </name><etal/></person-group><article-title>Retrieval-augmented generation for ai-generated content: a survey</article-title><source>arXiv</source><comment>Preprint posted online on  Feb 29, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2402.19473</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mesk&#x00F3;</surname><given-names>B</given-names> </name></person-group><article-title>Prompt engineering as an important emerging skill for medical professionals: tutorial</article-title><source>J Med Internet Res</source><year>2023</year><month>10</month><day>4</day><volume>25</volume><fpage>e50638</fpage><pub-id pub-id-type="doi">10.2196/50638</pub-id><pub-id pub-id-type="medline">37792434</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Giray</surname><given-names>L</given-names> </name></person-group><article-title>Prompt engineering with ChatGPT: a guide for academic writers</article-title><source>Ann Biomed 
Eng</source><year>2023</year><month>12</month><volume>51</volume><issue>12</issue><fpage>2629</fpage><lpage>2633</lpage><pub-id pub-id-type="doi">10.1007/s10439-023-03272-4</pub-id><pub-id pub-id-type="medline">37284994</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Han</surname><given-names>T</given-names> </name><name name-style="western"><surname>Adams</surname><given-names>LC</given-names> </name><name name-style="western"><surname>Bressem</surname><given-names>KK</given-names> </name><name name-style="western"><surname>Busch</surname><given-names>F</given-names> </name><name name-style="western"><surname>Nebelung</surname><given-names>S</given-names> </name><name name-style="western"><surname>Truhn</surname><given-names>D</given-names> </name></person-group><article-title>Comparative analysis of multimodal large language model performance on clinical vignette questions</article-title><source>JAMA</source><year>2024</year><month>04</month><day>16</day><volume>331</volume><issue>15</issue><fpage>1320</fpage><lpage>1321</lpage><pub-id pub-id-type="doi">10.1001/jama.2023.27861</pub-id><pub-id pub-id-type="medline">38497956</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Horiuchi</surname><given-names>D</given-names> </name><name name-style="western"><surname>Tatekawa</surname><given-names>H</given-names> </name><name name-style="western"><surname>Oura</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Comparison of the diagnostic accuracy among GPT-4 based ChatGPT, GPT-4V based ChatGPT, and radiologists in musculoskeletal radiology</article-title><source>medRxiv</source><comment>Preprint posted online on  Dec 9, 2023</comment><pub-id 
pub-id-type="doi">10.1101/2023.12.07.23299707</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sorin</surname><given-names>V</given-names> </name><name name-style="western"><surname>Kapelushnik</surname><given-names>N</given-names> </name><name name-style="western"><surname>Hecht</surname><given-names>I</given-names> </name><etal/></person-group><article-title>Integrated visual and text-based analysis of ophthalmology clinical cases using a large language model</article-title><source>Sci Rep</source><year>2025</year><month>02</month><day>10</day><volume>15</volume><issue>1</issue><fpage>4999</fpage><pub-id pub-id-type="doi">10.1038/s41598-025-88948-8</pub-id><pub-id pub-id-type="medline">39930078</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>He</surname><given-names>K</given-names> </name><name name-style="western"><surname>Mao</surname><given-names>R</given-names> </name><name name-style="western"><surname>Lin</surname><given-names>Q</given-names> </name><etal/></person-group><article-title>A survey of large language models for healthcare: from data, technology, and applications to accountability and ethics</article-title><source>Information Fusion</source><year>2025</year><month>06</month><volume>118</volume><fpage>102963</fpage><pub-id pub-id-type="doi">10.1016/j.inffus.2025.102963</pub-id></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Topol</surname><given-names>EJ</given-names> </name></person-group><article-title>High-performance medicine: the convergence of human and artificial intelligence</article-title><source>Nat 
Med</source><year>2019</year><month>01</month><volume>25</volume><issue>1</issue><fpage>44</fpage><lpage>56</lpage><pub-id pub-id-type="doi">10.1038/s41591-018-0300-7</pub-id><pub-id pub-id-type="medline">30617339</pub-id></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Esteva</surname><given-names>A</given-names> </name><name name-style="western"><surname>Robicquet</surname><given-names>A</given-names> </name><name name-style="western"><surname>Ramsundar</surname><given-names>B</given-names> </name><etal/></person-group><article-title>A guide to deep learning in healthcare</article-title><source>Nat Med</source><year>2019</year><month>01</month><volume>25</volume><issue>1</issue><fpage>24</fpage><lpage>29</lpage><pub-id pub-id-type="doi">10.1038/s41591-018-0316-z</pub-id><pub-id pub-id-type="medline">30617335</pub-id></nlm-citation></ref><ref id="ref49"><label>49</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Reddy</surname><given-names>S</given-names> </name><name name-style="western"><surname>Fox</surname><given-names>J</given-names> </name><name name-style="western"><surname>Purohit</surname><given-names>MP</given-names> </name></person-group><article-title>Artificial intelligence-enabled healthcare delivery</article-title><source>J R Soc Med</source><year>2019</year><month>01</month><volume>112</volume><issue>1</issue><fpage>22</fpage><lpage>28</lpage><pub-id pub-id-type="doi">10.1177/0141076818815510</pub-id><pub-id pub-id-type="medline">30507284</pub-id></nlm-citation></ref><ref id="ref50"><label>50</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>C</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>S</given-names> </name><name 
name-style="western"><surname>Yang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Guo</surname><given-names>J</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>J</given-names> </name></person-group><article-title>Ethical considerations of using ChatGPT in health care</article-title><source>J Med Internet Res</source><year>2023</year><month>08</month><day>11</day><volume>25</volume><fpage>e48009</fpage><pub-id pub-id-type="doi">10.2196/48009</pub-id><pub-id pub-id-type="medline">37566454</pub-id></nlm-citation></ref><ref id="ref51"><label>51</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Meng</surname><given-names>X</given-names> </name><name name-style="western"><surname>Yan</surname><given-names>X</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>K</given-names> </name><etal/></person-group><article-title>The application of large language models in medicine: a scoping review</article-title><source>iScience</source><year>2024</year><month>05</month><day>17</day><volume>27</volume><issue>5</issue><fpage>109713</fpage><pub-id pub-id-type="doi">10.1016/j.isci.2024.109713</pub-id><pub-id pub-id-type="medline">38746668</pub-id></nlm-citation></ref><ref id="ref52"><label>52</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>K</given-names> </name><name name-style="western"><surname>Jagadeesh</surname><given-names>A</given-names> </name><etal/></person-group><article-title>The potential and pitfalls of using a large language model such as ChatGPT, GPT-4, or LLaMA as a clinical assistant</article-title><source>J Am Med Inform 
Assoc</source><year>2024</year><month>09</month><day>1</day><volume>31</volume><issue>9</issue><fpage>1884</fpage><lpage>1891</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocae184</pub-id><pub-id pub-id-type="medline">39018498</pub-id></nlm-citation></ref><ref id="ref53"><label>53</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lalmuanawma</surname><given-names>S</given-names> </name><name name-style="western"><surname>Hussain</surname><given-names>J</given-names> </name><name name-style="western"><surname>Chhakchhuak</surname><given-names>L</given-names> </name></person-group><article-title>Applications of machine learning and artificial intelligence for Covid-19 (SARS-CoV-2) pandemic: a review</article-title><source>Chaos Solitons Fractals</source><year>2020</year><month>10</month><volume>139</volume><fpage>110059</fpage><pub-id pub-id-type="doi">10.1016/j.chaos.2020.110059</pub-id><pub-id pub-id-type="medline">32834612</pub-id></nlm-citation></ref><ref id="ref54"><label>54</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Carl&#x00E0;</surname><given-names>MM</given-names> </name><name name-style="western"><surname>Crincoli</surname><given-names>E</given-names> </name><name name-style="western"><surname>Rizzo</surname><given-names>S</given-names> </name></person-group><article-title>Retinal imaging analysis performed by ChatGPT-4O and Gemini Advanced: the turning point of the revolution?</article-title><source>Retina (Philadelphia, Pa)</source><year>2025</year><month>04</month><day>1</day><volume>45</volume><issue>4</issue><fpage>694</fpage><lpage>702</lpage><pub-id pub-id-type="doi">10.1097/IAE.0000000000004351</pub-id><pub-id pub-id-type="medline">39715322</pub-id></nlm-citation></ref><ref id="ref55"><label>55</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Ghalibafan</surname><given-names>S</given-names> </name><name name-style="western"><surname>Taylor Gonzalez</surname><given-names>DJ</given-names> </name><name name-style="western"><surname>Cai</surname><given-names>LZ</given-names> </name><etal/></person-group><article-title>Applications of multimodal generative artificial intelligence in a real-world retina clinic setting</article-title><source>Retina (Philadelphia, Pa)</source><year>2024</year><month>10</month><day>1</day><volume>44</volume><issue>10</issue><fpage>1732</fpage><lpage>1740</lpage><pub-id pub-id-type="doi">10.1097/IAE.0000000000004204</pub-id><pub-id pub-id-type="medline">39287535</pub-id></nlm-citation></ref><ref id="ref56"><label>56</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Raja</surname><given-names>H</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Delsoz</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Diagnosing glaucoma based on the ocular hypertension treatment study dataset using chat generative pre-trained transformer as a large language model</article-title><source>Ophthalmol Sci</source><year>2025</year><volume>5</volume><issue>1</issue><fpage>100599</fpage><pub-id pub-id-type="doi">10.1016/j.xops.2024.100599</pub-id><pub-id pub-id-type="medline">39346574</pub-id></nlm-citation></ref><ref id="ref57"><label>57</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kuzan</surname><given-names>BN</given-names> </name><name name-style="western"><surname>Me&#x015F;e</surname><given-names>&#x0130;</given-names> </name><name name-style="western"><surname>Ya&#x015F;ar</surname><given-names>S</given-names> </name><name name-style="western"><surname>Kuzan</surname><given-names>TY</given-names> 
</name></person-group><article-title>A retrospective evaluation of the potential of ChatGPT in the accurate diagnosis of acute stroke</article-title><source>Diagn Interv Radiol</source><year>2024</year><month>09</month><day>2</day><pub-id pub-id-type="doi">10.4274/dir.2024.242892</pub-id><pub-id pub-id-type="medline">39221691</pub-id></nlm-citation></ref><ref id="ref58"><label>58</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chiang</surname><given-names>KL</given-names> </name><name name-style="western"><surname>Chou</surname><given-names>YC</given-names> </name><name name-style="western"><surname>Tung</surname><given-names>H</given-names> </name><etal/></person-group><article-title>Customized GPT model largely increases surgery decision accuracy for pharmaco-resistant epilepsy</article-title><source>J Clin Neurosci</source><year>2024</year><month>12</month><volume>130</volume><fpage>110918</fpage><pub-id pub-id-type="doi">10.1016/j.jocn.2024.110918</pub-id><pub-id pub-id-type="medline">39541652</pub-id></nlm-citation></ref><ref id="ref59"><label>59</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ding</surname><given-names>JE</given-names> </name><name name-style="western"><surname>Thao</surname><given-names>PNM</given-names> </name><name name-style="western"><surname>Peng</surname><given-names>WC</given-names> </name><etal/></person-group><article-title>Large language multimodal models for new-onset type 2 diabetes prediction using five-year cohort electronic health records</article-title><source>Sci Rep</source><year>2024</year><month>09</month><day>6</day><volume>14</volume><issue>1</issue><fpage>20774</fpage><pub-id pub-id-type="doi">10.1038/s41598-024-71020-2</pub-id><pub-id pub-id-type="medline">39237580</pub-id></nlm-citation></ref><ref id="ref60"><label>60</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Maniaci</surname><given-names>A</given-names> </name><name name-style="western"><surname>Chiesa-Estomba</surname><given-names>CM</given-names> </name><name name-style="western"><surname>Lechien</surname><given-names>JR</given-names> </name></person-group><article-title>ChatGPT-4 consistency in interpreting laryngeal clinical images of common lesions and disorders</article-title><source>Otolaryngol Head Neck Surg</source><year>2024</year><month>10</month><volume>171</volume><issue>4</issue><fpage>1106</fpage><lpage>1113</lpage><pub-id pub-id-type="doi">10.1002/ohn.897</pub-id><pub-id pub-id-type="medline">39045737</pub-id></nlm-citation></ref><ref id="ref61"><label>61</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lechien</surname><given-names>JR</given-names> </name><name name-style="western"><surname>Naunheim</surname><given-names>MR</given-names> </name><name name-style="western"><surname>Maniaci</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Performance and consistency of ChatGPT-4 versus otolaryngologists: a clinical case series</article-title><source>Otolaryngol Head Neck Surg</source><year>2024</year><month>06</month><volume>170</volume><issue>6</issue><fpage>1519</fpage><lpage>1526</lpage><pub-id pub-id-type="doi">10.1002/ohn.759</pub-id><pub-id pub-id-type="medline">38591726</pub-id></nlm-citation></ref><ref id="ref62"><label>62</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Gabashvili</surname><given-names>IS</given-names> </name></person-group><article-title>ChatGPT in dermatology: a comprehensive systematic review</article-title><source>medRxiv</source><comment>Preprint posted online on  Jun 12, 2023</comment><pub-id 
pub-id-type="doi">10.1101/2023.06.11.23291252</pub-id></nlm-citation></ref><ref id="ref63"><label>63</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Pillai</surname><given-names>A</given-names> </name><name name-style="western"><surname>Joseph</surname><given-names>SP</given-names> </name><name name-style="western"><surname>Hardin</surname><given-names>J</given-names> </name></person-group><article-title>Evaluating the diagnostic and treatment recommendation capabilities of GPT-4 vision in dermatology</article-title><source>medRxiv</source><comment>Preprint posted online on  Jan 26, 2024</comment><pub-id pub-id-type="doi">10.1101/2024.01.24.24301743</pub-id></nlm-citation></ref><ref id="ref64"><label>64</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Beam</surname><given-names>AL</given-names> </name><name name-style="western"><surname>Kohane</surname><given-names>IS</given-names> </name></person-group><article-title>Big data and machine learning in health care</article-title><source>JAMA</source><year>2018</year><month>04</month><day>3</day><volume>319</volume><issue>13</issue><fpage>1317</fpage><lpage>1318</lpage><pub-id pub-id-type="doi">10.1001/jama.2017.18391</pub-id><pub-id pub-id-type="medline">29532063</pub-id></nlm-citation></ref><ref id="ref65"><label>65</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rajkomar</surname><given-names>A</given-names> </name><name name-style="western"><surname>Oren</surname><given-names>E</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>K</given-names> </name><etal/></person-group><article-title>Scalable and accurate deep learning with electronic health records</article-title><source>NPJ Digit 
Med</source><year>2018</year><volume>1</volume><issue>1</issue><fpage>18</fpage><pub-id pub-id-type="doi">10.1038/s41746-018-0029-1</pub-id><pub-id pub-id-type="medline">31304302</pub-id></nlm-citation></ref><ref id="ref66"><label>66</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Zhou</surname><given-names>J</given-names> </name><name name-style="western"><surname>He</surname><given-names>X</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>L</given-names> </name><etal/></person-group><article-title>SkinGPT-4: an interactive dermatology diagnostic system with visual large language model</article-title><source>medRxiv</source><comment>Preprint posted online on  Jun 13, 2023</comment><pub-id pub-id-type="doi">10.1101/2023.06.10.23291127</pub-id></nlm-citation></ref><ref id="ref67"><label>67</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Betzler</surname><given-names>BK</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>H</given-names> </name><name name-style="western"><surname>Cheng</surname><given-names>CY</given-names> </name><etal/></person-group><article-title>Large language models and their impact in ophthalmology</article-title><source>Lancet Digit Health</source><year>2023</year><month>12</month><volume>5</volume><issue>12</issue><fpage>e917</fpage><lpage>e924</lpage><pub-id pub-id-type="doi">10.1016/S2589-7500(23)00201-7</pub-id><pub-id pub-id-type="medline">38000875</pub-id></nlm-citation></ref><ref id="ref68"><label>68</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>X</given-names> </name><name name-style="western"><surname>Zhao</surname><given-names>Z</given-names> </name><name 
name-style="western"><surname>Zhang</surname><given-names>W</given-names> </name><etal/></person-group><article-title>EyeGPT: ophthalmic assistant with large language models</article-title><source>arXiv</source><comment>Preprint posted online on  Feb 29, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2403.00840</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Details of the search strategy in PubMed.</p><media xlink:href="medinform_v13i1e64963_app1.docx" xlink:title="DOCX File, 15 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Quality assessment of included studies.</p><media xlink:href="medinform_v13i1e64963_app2.docx" xlink:title="DOCX File, 23 KB"/></supplementary-material><supplementary-material id="app3"><label>Multimedia Appendix 3</label><p>Characteristics of large language models (LLMs) applied in clinical diagnostic studies.</p><media xlink:href="medinform_v13i1e64963_app3.docx" xlink:title="DOCX File, 28 KB"/></supplementary-material><supplementary-material id="app4"><label>Checklist 1</label><p>PRISMA (Preferred Reporting Items for Systematic Reviews and Meta-Analysis) checklist.</p><media xlink:href="medinform_v13i1e64963_app4.pdf" xlink:title="PDF File, 93 KB"/></supplementary-material></app-group></back></article>