<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMI</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id>
      <journal-title>JMIR Medical Informatics</journal-title>
      <issn pub-type="epub">2291-9694</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v11i1e48808</article-id>
      <article-id pub-id-type="pmid">37812468</article-id>
      <article-id pub-id-type="doi">10.2196/48808</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>ChatGPT-Generated Differential Diagnosis Lists for Complex Case–Derived Clinical Vignettes: Diagnostic Accuracy Evaluation</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Castonguay</surname>
            <given-names>Alexandre</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Chrimes</surname>
            <given-names>Dillon</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Kopka</surname>
            <given-names>Marvin</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes" equal-contrib="yes">
          <name name-style="western">
            <surname>Hirosawa</surname>
            <given-names>Takanobu</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Department of Diagnostic and Generalist Medicine</institution>
            <institution>Dokkyo Medical University</institution>
            <addr-line>880 Kitakobayashi, Mibu-cho</addr-line>
            <addr-line>Shimotsuga</addr-line>
            <addr-line>Tochigi, 321-0293</addr-line>
            <country>Japan</country>
            <phone>81 282861111</phone>
            <email>hirosawa@dokkyomed.ac.jp</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-3573-8203</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Kawamura</surname>
            <given-names>Ren</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-5632-3218</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Harada</surname>
            <given-names>Yukinori</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-6042-7397</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Mizuta</surname>
            <given-names>Kazuya</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0000-8822-7127</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Tokumasu</surname>
            <given-names>Kazuki</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-9513-6864</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author">
          <name name-style="western">
            <surname>Kaji</surname>
            <given-names>Yuki</given-names>
          </name>
          <degrees>MD, MPH</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-0267-9876</ext-link>
        </contrib>
        <contrib id="contrib7" contrib-type="author">
          <name name-style="western">
            <surname>Suzuki</surname>
            <given-names>Tomoharu</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-5557-0516</ext-link>
        </contrib>
        <contrib id="contrib8" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Shimizu</surname>
            <given-names>Taro</given-names>
          </name>
          <degrees>MD, MSc, MPH, MBA, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-3788-487X</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Department of Diagnostic and Generalist Medicine</institution>
        <institution>Dokkyo Medical University</institution>
        <addr-line>Tochigi</addr-line>
        <country>Japan</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Department of General Medicine</institution>
        <institution>Okayama University Graduate School of Medicine, Dentistry and Pharmaceutical Sciences</institution>
        <addr-line>Okayama</addr-line>
        <country>Japan</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Department of General Medicine</institution>
        <institution>International University of Health and Welfare Narita Hospital</institution>
        <addr-line>Chiba</addr-line>
        <country>Japan</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>Department of Hospital Medicine</institution>
        <institution>Urasoe General Hospital</institution>
        <addr-line>Okinawa</addr-line>
        <country>Japan</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Takanobu Hirosawa <email>hirosawa@dokkyomed.ac.jp</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2023</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>9</day>
        <month>10</month>
        <year>2023</year>
      </pub-date>
      <volume>11</volume>
      <elocation-id>e48808</elocation-id>
      <history>
        <date date-type="received">
          <day>9</day>
          <month>5</month>
          <year>2023</year>
        </date>
        <date date-type="rev-request">
          <day>17</day>
          <month>7</month>
          <year>2023</year>
        </date>
        <date date-type="rev-recd">
          <day>20</day>
          <month>7</month>
          <year>2023</year>
        </date>
        <date date-type="accepted">
          <day>13</day>
          <month>9</month>
          <year>2023</year>
        </date>
      </history>
      <copyright-statement>©Takanobu Hirosawa, Ren Kawamura, Yukinori Harada, Kazuya Mizuta, Kazuki Tokumasu, Yuki Kaji, Tomoharu Suzuki, Taro Shimizu. Originally published in JMIR Medical Informatics (https://medinform.jmir.org), 09.10.2023.</copyright-statement>
      <copyright-year>2023</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on https://medinform.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://medinform.jmir.org/2023/1/e48808" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>The diagnostic accuracy of differential diagnoses generated by artificial intelligence chatbots, including ChatGPT models, for complex clinical vignettes derived from general internal medicine (GIM) department case reports is unknown.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study aims to evaluate the accuracy of the differential diagnosis lists generated by both third-generation ChatGPT (ChatGPT-3.5) and fourth-generation ChatGPT (ChatGPT-4) by using case vignettes from case reports published by the Department of GIM of Dokkyo Medical University Hospital, Japan.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We searched PubMed for case reports. Upon identification, physicians selected diagnostic cases, determined the final diagnosis, and converted them into clinical vignettes. Physicians typed the determined text with the clinical vignettes in the ChatGPT-3.5 and ChatGPT-4 prompts to generate the top 10 differential diagnoses. The ChatGPT models were not specially trained or further reinforced for this task. Three GIM physicians from other medical institutions created differential diagnosis lists by reading the same clinical vignettes. We measured the rate of correct diagnosis within the top 10 differential diagnosis lists, top 5 differential diagnosis lists, and the top diagnosis.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>In total, 52 case reports were analyzed. The rates of correct diagnosis by ChatGPT-4 within the top 10 differential diagnosis lists, top 5 differential diagnosis lists, and top diagnosis were 83% (43/52), 81% (42/52), and 60% (31/52), respectively. The rates of correct diagnosis by ChatGPT-3.5 within the top 10 differential diagnosis lists, top 5 differential diagnosis lists, and top diagnosis were 73% (38/52), 65% (34/52), and 42% (22/52), respectively. The rates of correct diagnosis by ChatGPT-4 were comparable to those by physicians within the top 10 (43/52, 83% vs 39/52, 75%, respectively; <italic>P</italic>=.47) and within the top 5 (42/52, 81% vs 35/52, 67%, respectively; <italic>P</italic>=.18) differential diagnosis lists and top diagnosis (31/52, 60% vs 26/52, 50%, respectively; <italic>P</italic>=.43) although the difference was not significant. The ChatGPT models’ diagnostic accuracy did not significantly vary based on open access status or the publication date (before 2021 vs 2022).</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>This study demonstrates the potential diagnostic accuracy of differential diagnosis lists generated using ChatGPT-3.5 and ChatGPT-4 for complex clinical vignettes from case reports published by the GIM department. The rate of correct diagnoses within the top 10 and top 5 differential diagnosis lists generated by ChatGPT-4 exceeds 80%. Although derived from a limited data set of case reports from a single department, our findings highlight the potential utility of ChatGPT-4 as a supplementary tool for physicians, particularly for those affiliated with the GIM department. Further investigations should explore the diagnostic accuracy of ChatGPT by using distinct case materials beyond its training data. Such efforts will provide a comprehensive insight into the role of artificial intelligence in enhancing clinical decision-making.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>artificial intelligence</kwd>
        <kwd>AI chatbot</kwd>
        <kwd>ChatGPT</kwd>
        <kwd>large language models</kwd>
        <kwd>clinical decision support</kwd>
        <kwd>natural language processing</kwd>
        <kwd>diagnostic excellence</kwd>
        <kwd>language model</kwd>
        <kwd>vignette</kwd>
        <kwd>case study</kwd>
        <kwd>diagnostic</kwd>
        <kwd>accuracy</kwd>
        <kwd>decision support</kwd>
        <kwd>diagnosis</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Decision-Making in Health Care</title>
        <p>In health care, accurate diagnosis plays a critical role in the effective management of patients’ conditions [<xref ref-type="bibr" rid="ref1">1</xref>]. Clinicians often rely on their expertise and various case presentations to make clinical decisions. However, the increasing complexity of cases, particularly those requiring referrals to specialized departments such as general internal medicine (GIM), and the rapid expansion of medical knowledge necessitate enhanced diagnostic support. A single-center study reported diagnostic error rates of 2% in an outpatient GIM department [<xref ref-type="bibr" rid="ref2">2</xref>], while a systematic review found that the error rates exceeded 10% in older adult patients [<xref ref-type="bibr" rid="ref3">3</xref>]. Such inaccuracies underline the pressing need for tools to aid physicians in making more accurate diagnoses [<xref ref-type="bibr" rid="ref4">4</xref>]. One promising avenue being explored is the application of clinical decision support (CDS) systems.</p>
      </sec>
      <sec>
        <title>CDS Tools</title>
        <p>Various CDS systems, including symptom checkers [<xref ref-type="bibr" rid="ref5">5</xref>] and differential diagnosis generators [<xref ref-type="bibr" rid="ref6">6</xref>], have been developed over the years. The former are generally designed for the general public, while the latter are intended for health care providers. The journey of computer-aided health care traces back to the early 1970s, marked by a strong interest in harnessing computing power to enhance care quality. Historically, CDS tools often employ multistep processes that combine logical or computational processes, probability assessments, and heuristic methods. Notably, a combination of algorithms and heuristic rules has been integral to many medical applications [<xref ref-type="bibr" rid="ref7">7</xref>]. There is evidence of CDS tools being utilized in the outpatient department of GIM [<xref ref-type="bibr" rid="ref8">8</xref>]. However, despite the potential of CDS systems to boost diagnostic accuracy and efficiency, they often increase clinicians’ workload [<xref ref-type="bibr" rid="ref9">9</xref>], particularly due to the need for structured input data. This remains a great barrier to their widespread adoption. In this context, artificial intelligence (AI), especially large language models, provides an alternative approach for health care support [<xref ref-type="bibr" rid="ref10">10</xref>], particularly through the AI chatbot [<xref ref-type="bibr" rid="ref11">11</xref>].</p>
      </sec>
      <sec>
        <title>ChatGPT in Health Care</title>
        <p>AI chatbots such as ChatGPT have demonstrated potential in facilitating effective communication between patients and health care providers [<xref ref-type="bibr" rid="ref12">12</xref>] and transforming medical writing [<xref ref-type="bibr" rid="ref13">13</xref>]. ChatGPT, developed by OpenAI, is an application of a large language model based on natural language processing, known as a generative pretrained transformer (GPT) [<xref ref-type="bibr" rid="ref14">14</xref>]. It can generate human-like responses to user prompts. With the progression from the third-generation GPT (GPT-3.5) to the fourth-generation GPT (GPT-4), the model’s accuracy has improved in professional examinations [<xref ref-type="bibr" rid="ref15">15</xref>] and multiple-choice problems across various languages [<xref ref-type="bibr" rid="ref16">16</xref>]. Yet, AI chatbots are not exempt from limitations and risks [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref18">18</xref>]. These limitations encompass transparency issues [<xref ref-type="bibr" rid="ref19">19</xref>], nonspecialized medical knowledge, outdated medical information, inherent biases, and a potential to disseminate misinformation [<xref ref-type="bibr" rid="ref11">11</xref>]. Despite these challenges, AI systems such as ChatGPT are continually improving and hold promise as essential tools for achieving diagnostic excellence [<xref ref-type="bibr" rid="ref20">20</xref>].</p>
        <p>To prepare for potential clinical applications of AI chatbots, it is essential to evaluate their diagnostic accuracy, particularly for complex cases that frequently necessitate referral to specialized departments such as the GIM department. If harnessed correctly, generative AI like ChatGPT could reduce the diagnostic errors attributed to the inherent complexity of the GIM domain. This would streamline the department’s workflow, enhancing patient care and outcomes. This study will reveal the potential of generative AIs, including ChatGPT, as CDS tools, especially in the GIM department.</p>
        <p>Previous studies have reported that the diagnostic accuracy of the differential diagnosis lists generated by ChatGPT for clinical vignettes falls between 64% and 83% [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref22">22</xref>]. A clinical vignette is a concise narrative used in research to present a clinical scenario. However, these earlier studies did not focus on the materials derived from the GIM department, which is known for its diagnostically challenging cases. This gap in the literature accentuates the novelty and distinctiveness of our study. We aimed to evaluate the diagnostic accuracy of the differential diagnosis lists generated by ChatGPT, specifically using clinical vignettes derived from case reports published by the GIM department. By focusing on these GIM case reports, our research potentially offers a more rigorous appraisal of the diagnostic prowess of ChatGPT compared to preceding studies. In line with this, we expect ChatGPT-4 to provide the correct diagnosis in its differential diagnosis lists with an accuracy consistent with or within the previously reported range of 64%-83%.</p>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Study Design</title>
        <p>We evaluated the diagnostic accuracy of the differential diagnosis lists generated by ChatGPT-3.5 and ChatGPT-4 for clinical vignettes from case reports published by the Department of GIM. The term “differential diagnosis” refers to a list of possible conditions or diseases that could be causing a patient’s symptoms and signs. It is created by considering the patient’s clinical history, physical examination, and the results of any investigations, thus aiding in the diagnostic process. This study was conducted at the GIM Department (Department of Diagnostic and Generalist Medicine) of Dokkyo Medical University Hospital, Shimotsuga, Tochigi, Japan.</p>
      </sec>
      <sec>
        <title>Ethical Considerations</title>
        <p>Because this study used case vignettes from published case reports, approval by the ethics committee and individual consent were not required.</p>
      </sec>
      <sec>
        <title>Clinical Vignettes</title>
        <p>We used clinical vignettes from case reports published by the GIM Department of Dokkyo Medical University Hospital. Clinical cases that were challenging to diagnose and typically involved a high level of complexity were often referred to the GIM department. Some of these cases were published as case reports in medical journals. To find case reports published in English from our department, we searched PubMed using the following keywords on March 20, 2023: “(Dokkyo Medical University [affil]) AND (Generalist Medicine [affil]) AND (2016/4/1:2022/12/31 [dp]) AND (Case Reports [PT]).” After finding 54 case reports in PubMed, 2 experienced GIM physicians (TH and RK) checked these case reports for diagnostic or nondiagnostic cases, assessed the final diagnosis, and displayed them as clinical vignettes. Two cases were excluded because they were nondiagnostic. In total, 52 cases were included in this study. For example, consider the case report titled “Hepatic portal venous gas after diving” [<xref ref-type="bibr" rid="ref23">23</xref>], which is mentioned as case number 3 in Table S1 of <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> and Table S2 of <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>. From this report, we extracted the clinical vignette from the case description section: “A 68-year-old man with diabetes and...There was no evidence of pneumatosis intestinalis.” Decompression sickness was determined as the final diagnosis for this case. These case reports meet the standards required for publication in peer-reviewed journals and have been written and selected by experienced GIM physicians. Each clinical vignette included the clinical history, physical examination, and results of the investigation. The title, abstract, introduction, clinical assessment, differential diagnosis, final diagnosis, figures, legends, tables, and case reports were removed from the vignettes. 
The final diagnosis for each case, which had been established through the usual diagnostic processes and subsequently published in these case reports, was assessed and displayed in the form of clinical vignettes. The final diagnosis was confirmed by 2 experienced GIM physicians. Discrepancies between the 2 physicians were resolved through discussions. We also assessed the publication date and status of the included case reports as open access.</p>
      </sec>
      <sec>
        <title>Differential Diagnosis Lists Created by Physicians</title>
        <p>The differential diagnosis lists for each clinical vignette were independently created by 3 other GIM physicians (KT, YK, and T Suzuki) not affiliated with Dokkyo Medical University. Each clinical vignette was allocated to 1 physician, resulting in an average of 17 case descriptions being handled by each physician. They were instructed to create the top 10 differential diagnosis lists in English by reading the same clinical vignettes, without consulting other physicians or using CDS tools. It is essential to highlight that the physicians did not adhere to any specific guidelines, criteria, or protocols during this process. They operated based solely on their expertise and experience. Before creating the differential diagnosis lists, they were confirmed to be unaware of the case reports, clinical vignettes, final diagnosis, and differential diagnosis lists generated by ChatGPT-3.5 and ChatGPT-4. The physicians also remained blinded to each other’s assessments. A computer-generated order table determined the sequence in which the clinical vignettes were presented.</p>
      </sec>
      <sec>
        <title>Differential Diagnosis Lists Generated by ChatGPT</title>
        <p>We used ChatGPT, an application of the GPT-3.5 model (March 14 version; ChatGPT-3.5, OpenAI, LLC), on March 20, 2023. We also used ChatGPT, an application of the GPT-4 model (March 23 version; ChatGPT-4, OpenAI, LLC), on April 10, 2023. Neither of the ChatGPT models was specially trained or reinforced for medical diagnoses. The physician (TH) typed the following text in the prompt: “Tell me the top 10 suspected illnesses for the following symptoms: (copy and paste each clinical vignette).” The prompt was designed to encourage the ChatGPT models to generate a list of differential diagnoses. The rationale behind selecting this particular prompt was grounded in preliminary testing. In these tests, various prompts were evaluated for their effectiveness in soliciting a comprehensive list of potential illnesses. This prompt consistently yielded reliable and inclusive differential diagnoses in our initial evaluations.</p>
        <p>To minimize potential bias, the order in which the vignettes were presented to ChatGPT-3.5 and ChatGPT-4 was determined using a computer-generated order table. To ensure no interference from previous responses, physicians cleared the previous conversation before introducing new clinical vignettes. We used the initial answers as the top 10 differential diagnosis lists generated by ChatGPT-3.5 and ChatGPT-4.</p>
      </sec>
      <sec>
        <title>Evaluation of Differential Diagnosis Lists</title>
        <p>Two other GIM physicians (YH and KM) evaluated whether the final diagnosis was included in the differential diagnosis lists created by the physicians and those generated by ChatGPT models. A diagnosis was labeled “1” if it accurately and specifically identified the condition or was sufficiently close to the exact diagnosis that it would enable prompt and appropriate treatment. Conversely, a diagnosis was marked as “0” if it diverged significantly from the actual diagnosis [<xref ref-type="bibr" rid="ref24">24</xref>]. When the final diagnosis was present, the researcher further assessed its ranking within the list. Discrepancies between the 2 evaluators were resolved through discussions. The study design is illustrated in <xref rid="figure1" ref-type="fig">Figure 1</xref>. Examples of a differential diagnosis list generated by ChatGPT-3.5 and ChatGPT-4 are shown in <xref rid="figure2" ref-type="fig">Figures 2</xref>-<xref rid="figure3" ref-type="fig">3</xref> and <xref rid="figure4" ref-type="fig">Figures 4</xref>-<xref rid="figure5" ref-type="fig">5</xref>, respectively.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Study design.</p>
          </caption>
          <graphic xlink:href="medinform_v11i1e48808_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>A differential diagnosis list generated by the third-generation ChatGPT for a sample case.</p>
          </caption>
          <graphic xlink:href="medinform_v11i1e48808_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Explanation for the differential diagnosis list generated by the third-generation ChatGPT for a sample case. ChatGPT-3.5: third-generation ChatGPT.</p>
          </caption>
          <graphic xlink:href="medinform_v11i1e48808_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>A differential diagnosis list generated by the fourth-generation ChatGPT for a sample case.</p>
          </caption>
          <graphic xlink:href="medinform_v11i1e48808_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure5" position="float">
          <label>Figure 5</label>
          <caption>
            <p>Explanation for the differential diagnosis list generated by the fourth-generation ChatGPT for a sample case. ChatGPT-4: fourth-generation ChatGPT.</p>
          </caption>
          <graphic xlink:href="medinform_v11i1e48808_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Measurements</title>
        <p>We measured the rate of correct diagnoses within the top 10 differential diagnosis lists, top 5 differential diagnosis lists, and top diagnosis provided by ChatGPT-3.5, ChatGPT-4, and the physicians. As a binary approach, we scored the presence of the final diagnosis on the list as one and its absence as zero. For an exploratory analysis, we compared the rates of correct diagnoses in the lists generated by ChatGPT-3.5 and ChatGPT-4 between case reports that were open access and those that were not. This comparison was motivated by the understanding that GPT-3.5 and GPT-4 were primarily trained on open sources available on the internet [<xref ref-type="bibr" rid="ref16">16</xref>]. Given that these models are predominantly trained on openly accessible data, we postulated that open access case reports might yield better diagnostic results than non–open access ones. Additionally, we compared the rates of correct diagnoses within the lists generated by ChatGPT-3.5 and ChatGPT-4 based on the publishing year prior to 2021 or in 2022. This distinction arises from the knowledge cutoffs for ChatGPT-3.5 and ChatGPT-4, which were set in early 2021. Since the models would be more familiar with data before this time and less informed about subsequent publications, we hypothesized that the case reports published in the years prior to 2021 could produce better diagnostic results than those published in 2022. However, the details of the learning data source and cutoff timing were not available to the public.</p>
      </sec>
      <sec>
        <title>Analysis</title>
        <p>Categorical or binary variables were presented as numbers (percentages) and compared using the chi-square test. To mitigate the increased risk of type I error arising from multiple comparisons, we employed the Bonferroni correction [<xref ref-type="bibr" rid="ref25">25</xref>]. Although alternative methods exist, we chose the Bonferroni correction for its strict control over false positives. When conducting multiple comparisons, we set the Bonferroni-corrected significance level at a <italic>P</italic> value &lt;.02. This was derived by dividing .05 (the standard level of significance) by 3 (the number of comparisons undertaken). Both the chi-square test and the computation of the Bonferroni-corrected significance level were conducted in R (version 4.2.2; R Foundation for Statistical Computing) using the stats library (version 4.2.2).</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Case Report Profiles</title>
        <p>In total, 52 case reports were included in this study, among which 39 (75%) were open access case reports. A total of 24 (46%) case reports were published prior to 2021. Of the total case reports, 12 (23%) were published in 2021 and 16 (31%) were published in 2022. The included case reports are presented in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
      </sec>
      <sec>
        <title>Diagnostic Performance</title>
        <p>Representative examples of differential diagnosis lists with the final diagnosis are shown in <xref ref-type="table" rid="table1">Table 1</xref>.</p>
        <p>The rates of correct diagnosis by ChatGPT-4 within the top 10 differential diagnoses, top 5 differential diagnoses, and top diagnosis were 83% (43/52), 81% (42/52), and 60% (31/52), respectively (<xref ref-type="table" rid="table2">Table 2</xref>). The rates of correct diagnosis by ChatGPT-3.5 within the top 10 differential diagnoses, top 5 differential diagnoses, and top diagnosis were 73% (38/52), 65% (34/52), and 42% (22/52), respectively. The rates of correct diagnosis by ChatGPT-4 were comparable to those by ChatGPT-3.5 within the top 10 (43/52, 83% vs 38/52, 73%, respectively; <italic>P</italic>=.34) and top 5 (42/52, 81% vs 34/52, 65%, respectively; <italic>P</italic>=.12) differential diagnosis lists and top diagnosis (31/52, 60% vs 22/52, 42%, respectively; <italic>P</italic>=.12), although the difference was not statistically significant. The rates of correct diagnosis by ChatGPT-4 were also comparable to those by physicians within the top 10 (43/52, 83% vs 39/52, 75%, respectively; <italic>P</italic>=.47) and top 5 (42/52, 81% vs 35/52, 67%, respectively; <italic>P</italic>=.18) differential diagnoses and top diagnosis (31/52, 60% vs 26/52, 50%, respectively; <italic>P</italic>=.43), although the difference was not statistically significant. <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref> shows the final diagnosis and the 10 differential diagnosis lists generated by ChatGPT-3.5 and ChatGPT-4 and those created by physicians.</p>
        <p>Regarding open access status (<xref ref-type="table" rid="table3">Table 3</xref>), there were no statistically significant differences between open access and non–open access case reports in the rates of correct diagnoses within the top 10 and top 5 differential diagnosis lists and top diagnoses generated by ChatGPT-3.5 and ChatGPT-4. Regarding the timing of publication, there were no statistically significant differences in the rates of correct diagnoses by ChatGPT-3.5 and ChatGPT-4 between case reports published prior to 2021 and those published in 2022 (<xref ref-type="table" rid="table3">Table 3</xref>).</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Representative examples of differential diagnosis lists with the final diagnosis.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="170"/>
            <col width="260"/>
            <col width="260"/>
            <col width="310"/>
            <thead>
              <tr valign="top">
                <td>Final diagnosis</td>
                <td colspan="3">Differential diagnosis lists</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>ChatGPT-3.5<sup>a</sup></td>
                <td>ChatGPT-4<sup>b</sup></td>
                <td>Physicians</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Tarsal tunnel  <break/>  
            syndrome</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>Peripheral neuropathy</p>
                    </list-item>
                    <list-item>
                      <p>Tarsal tunnel syndrome</p>
                    </list-item>
                    <list-item>
                      <p>Lumbar spinal stenosis</p>
                    </list-item>
                    <list-item>
                      <p>Diabetes</p>
                    </list-item>
                    <list-item>
                      <p>Alcohol abuse</p>
                    </list-item>
                    <list-item>
                      <p>Vitamin B12 deficiency</p>
                    </list-item>
                    <list-item>
                      <p>Hypothyroidism</p>
                    </list-item>
                    <list-item>
                      <p>Chronic kidney disease</p>
                    </list-item>
                    <list-item>
                      <p>Multiple sclerosis</p>
                    </list-item>
                    <list-item>
                      <p>Charcot-Marie-Tooth disease</p>
                    </list-item>
                  </list>
                </td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>Peripheral neuropathy</p>
                    </list-item>
                    <list-item>
                      <p>Tarsal tunnel syndrome</p>
                    </list-item>
                    <list-item>
                      <p>Lumbar radiculopathy</p>
                    </list-item>
                    <list-item>
                      <p>Small fiber neuropathy</p>
                    </list-item>
                    <list-item>
                      <p>Chronic inflammatory demyelinating polyneuropathy</p>
                    </list-item>
                    <list-item>
                      <p>Complex regional pain syndrome</p>
                    </list-item>
                    <list-item>
                      <p>Peripheral artery disease</p>
                    </list-item>
                    <list-item>
                      <p>Charcot-Marie-Tooth disease</p>
                    </list-item>
                    <list-item>
                      <p>Polyneuropathy</p>
                    </list-item>
                    <list-item>
                      <p>Leprosy</p>
                    </list-item>
                  </list>
                </td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>Sarcoidosis</p>
                    </list-item>
                    <list-item>
                      <p>Chronic inflammatory demyelinating polyradiculoneuropathy</p>
                    </list-item>
                    <list-item>
                      <p>Diabetic neuropathy</p>
                    </list-item>
                    <list-item>
                      <p>Vitamin B12 deficiency</p>
                    </list-item>
                    <list-item>
                      <p>Alcoholic neuropathy</p>
                    </list-item>
                    <list-item>
                      <p>Paraneoplastic syndrome</p>
                    </list-item>
                    <list-item>
                      <p>Antimyelin oligodendrocyte glycoprotein antibody-associated disease</p>
                    </list-item>
                    <list-item>
                      <p>Antineutrophil cytoplasmic antibody-related vasculitis</p>
                    </list-item>
                    <list-item>
                      <p>POEMS (polyneuropathy, organomegaly, endocrinopathy, monoclonal plasma cell disorder, skin changes) syndrome</p>
                    </list-item>
                    <list-item>
                      <p>Restless legs syndrome</p>
                    </list-item>
                  </list>
                </td>
              </tr>
              <tr valign="top">
                <td>Nonepisodic angioedema associated with eosinophilia</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>Eosinophilic fasciitis</p>
                    </list-item>
                    <list-item>
                      <p>Acute interstitial nephritis</p>
                    </list-item>
                    <list-item>
                      <p>Hypereosinophilic syndrome</p>
                    </list-item>
                    <list-item>
                      <p>Churg-Strauss syndrome</p>
                    </list-item>
                    <list-item>
                      <p>Idiopathic hypereosinophilia syndrome</p>
                    </list-item>
                    <list-item>
                      <p>Lymphoma</p>
                    </list-item>
                    <list-item>
                      <p>Systemic lupus erythematosus</p>
                    </list-item>
                    <list-item>
                      <p>Sarcoidosis</p>
                    </list-item>
                    <list-item>
                      <p>Infectious mononucleosis</p>
                    </list-item>
                    <list-item>
                      <p>Allergic reactions</p>
                    </list-item>
                  </list>
                </td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>Eosinophilic cellulitis (Wells syndrome)</p>
                    </list-item>
                    <list-item>
                      <p>Idiopathic eosinophilic oedema</p>
                    </list-item>
                    <list-item>
                      <p>Hypereosinophilic syndrome</p>
                    </list-item>
                    <list-item>
                      <p>Eosinophilic granulomatosis with polyangiitis (Churg-Strauss syndrome)</p>
                    </list-item>
                    <list-item>
                      <p>Drug-induced eosinophilia</p>
                    </list-item>
                    <list-item>
                      <p>Eosinophilic fasciitis (Shulman syndrome)</p>
                    </list-item>
                    <list-item>
                      <p>Kimura disease</p>
                    </list-item>
                    <list-item>
                      <p>Eosinophilic gastroenteritis</p>
                    </list-item>
                    <list-item>
                      <p>Parasitic infections</p>
                    </list-item>
                    <list-item>
                      <p>Systemic lupus erythematosus</p>
                    </list-item>
                  </list>
                </td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>Hypereosinophilic syndrome</p>
                    </list-item>
                    <list-item>
                      <p>Idiopathic peripheral edema</p>
                    </list-item>
                    <list-item>
                      <p>Idiopathic angioedema</p>
                    </list-item>
                    <list-item>
                      <p>Cholinergic urticaria</p>
                    </list-item>
                    <list-item>
                      <p>Eosinophilic granulomatosis with polyangiitis</p>
                    </list-item>
                    <list-item>
                      <p>Parasitic infection</p>
                    </list-item>
                    <list-item>
                      <p>Chronic eosinophilic leukemia</p>
                    </list-item>
                    <list-item>
                      <p>Drug-induced (including supplements, herbal medicine) hypereosinophilic edema</p>
                    </list-item>
                    <list-item>
                      <p>Tuberculosis</p>
                    </list-item>
                    <list-item>
                      <p>Viral infection</p>
                    </list-item>
                  </list>
                </td>
              </tr>
              <tr valign="top">
                <td>Decompression  <break/>  
            sickness</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>Arterial gas embolism</p>
                    </list-item>
                    <list-item>
                      <p>Decompression sickness</p>
                    </list-item>
                    <list-item>
                      <p>Carbon monoxide poisoning</p>
                    </list-item>
                    <list-item>
                      <p>Nitrogen narcosis</p>
                    </list-item>
                    <list-item>
                      <p>Hypothermia</p>
                    </list-item>
                    <list-item>
                      <p>Acute pancreatitis</p>
                    </list-item>
                    <list-item>
                      <p>Mesenteric ischemia</p>
                    </list-item>
                    <list-item>
                      <p>Gastroenteritis</p>
                    </list-item>
                    <list-item>
                      <p>Gastric ulcer</p>
                    </list-item>
                    <list-item>
                      <p>Acute cholecystitis</p>
                    </list-item>
                  </list>
                </td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>Decompression sickness</p>
                    </list-item>
                    <list-item>
                      <p>Gas embolism</p>
                    </list-item>
                    <list-item>
                      <p>Acute coronary syndrome</p>
                    </list-item>
                    <list-item>
                      <p>Gastroenteritis</p>
                    </list-item>
                    <list-item>
                      <p>Acute pancreatitis</p>
                    </list-item>
                    <list-item>
                      <p>Peptic ulcer disease</p>
                    </list-item>
                    <list-item>
                      <p>Diabetic ketoacidosis</p>
                    </list-item>
                    <list-item>
                      <p>Acute cholecystitis</p>
                    </list-item>
                    <list-item>
                      <p>Mesenteric ischemia</p>
                    </list-item>
                    <list-item>
                      <p>Gastritis</p>
                    </list-item>
                  </list>
                </td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>Air embolization</p>
                    </list-item>
                    <list-item>
                      <p>Decompression sickness</p>
                    </list-item>
                    <list-item>
                      <p>Nonocclusive mesenteric ischemia</p>
                    </list-item>
                    <list-item>
                      <p>Diabetic ketoacidosis</p>
                    </list-item>
                    <list-item>
                      <p>Bacteremia</p>
                    </list-item>
                    <list-item>
                      <p>Bacterial translocation</p>
                    </list-item>
                    <list-item>
                      <p>Acute coronary syndrome</p>
                    </list-item>
                    <list-item>
                      <p>Cholelithiasis</p>
                    </list-item>
                    <list-item>
                      <p>Cholangitis</p>
                    </list-item>
                    <list-item>
                      <p>Cholesterol embolization</p>
                    </list-item>
                  </list>
                </td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table1fn1">
              <p><sup>a</sup>ChatGPT-3.5: third-generation ChatGPT.</p>
            </fn>
            <fn id="table1fn2">
              <p><sup>b</sup>ChatGPT-4: fourth-generation ChatGPT.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Rates of correct diagnoses within the top 10 and top 5 differential diagnosis lists and top diagnosis generated by ChatGPT-3.5 and ChatGPT-4 compared with those created by physicians.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="150"/>
            <col width="130"/>
            <col width="130"/>
            <col width="150"/>
            <col width="150"/>
            <col width="150"/>
            <col width="140"/>
            <thead>
              <tr valign="top">
                <td>Variable</td>
                <td>ChatGPT-4<sup>a</sup> (n=52), n (%)</td>
                <td>ChatGPT-3.5<sup>b</sup> (n=52), n (%)</td>
                <td>Physicians (n=52), n (%)</td>
                <td colspan="3"><italic>P</italic> value<sup>c</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>ChatGPT-4 vs physicians</td>
                <td>ChatGPT-3.5 vs physicians</td>
                <td>ChatGPT-4 vs ChatGPT-3.5</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Within the top 10</td>
                <td>43 (83)</td>
                <td>38 (73)</td>
                <td>39 (75)</td>
                <td>.47</td>
                <td>&#62;.99</td>
                <td>.34</td>
              </tr>
              <tr valign="top">
                <td>Within the top 5</td>
                <td>42 (81)</td>
                <td>34 (65)</td>
                <td>35 (67)</td>
                <td>.18</td>
                <td>&#62;.99</td>
                <td>.12</td>
              </tr>
              <tr valign="top">
                <td>Top  <break/>  
            diagnosis</td>
                <td>31 (60)</td>
                <td>22 (42)</td>
                <td>26 (50)</td>
                <td>.43</td>
                <td>.56</td>
                <td>.12</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>ChatGPT-4: fourth-generation ChatGPT.</p>
            </fn>
            <fn id="table2fn2">
              <p><sup>b</sup>ChatGPT-3.5: third-generation ChatGPT.</p>
            </fn>
            <fn id="table2fn3">
              <p><sup>c</sup><italic>P</italic> values from chi-square scores.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Rates of correct diagnoses within the top 10 and top 5 differential diagnosis lists and top diagnosis generated by third-generation ChatGPT and fourth-generation ChatGPT, compared between open access and non–open access case reports and between case reports published prior to 2021 and those published in 2022.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="70"/>
            <col width="80"/>
            <col width="80"/>
            <col width="80"/>
            <col width="80"/>
            <col width="80"/>
            <col width="80"/>
            <col width="0"/>
            <col width="80"/>
            <col width="80"/>
            <col width="80"/>
            <col width="70"/>
            <col width="70"/>
            <col width="70"/>
            <thead>
              <tr valign="top">
                <td>Variable</td>
                <td colspan="7">Fourth-generation ChatGPT</td>
                <td colspan="6">Third-generation ChatGPT</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Open access (n=39), n (%)</td>
                <td>Non–open access (n=13), n (%)</td>
                <td><italic>P</italic> value<sup>a</sup></td>
                <td>Prior to 2021 (n=24), n (%)</td>
                <td>In 2022 (n=16), n (%)</td>
                <td><italic>P</italic> value<sup>b</sup></td>
                <td colspan="2">Open access (n=39), n (%)</td>
                <td>Non–open access (n=13), n (%)</td>
                <td><italic>P</italic> value<sup>a</sup></td>
                <td>Prior to 2021 (n=24), n (%)</td>
                <td>In 2022 (n=16), n (%)</td>
                <td><italic>P</italic> value<sup>b</sup></td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Within the top 10</td>
                <td>32 (82)</td>
                <td>11 (85)</td>
                <td>&#62;.99</td>
                <td>20 (83)</td>
                <td>13 (81)</td>
                <td>&#62;.99</td>
                <td colspan="2">28 (72)</td>
                <td>10 (77)</td>
                <td>&#62;.99</td>
                <td>17 (71)</td>
                <td>13 (81)</td>
                <td>.71</td>
              </tr>
              <tr valign="top">
                <td>Within the top 5</td>
                <td>31 (80)</td>
                <td>11 (85)</td>
                <td>&#62;.99</td>
                <td>19 (79)</td>
                <td>13 (81)</td>
                <td>&#62;.99</td>
                <td colspan="2">25 (64)</td>
                <td>9 (69)</td>
                <td>&#62;.99</td>
                <td>17 (71)</td>
                <td>11 (69)</td>
                <td>&#62;.99</td>
              </tr>
              <tr valign="top">
                <td>Top  <break/>  
            diagnosis</td>
                <td>22 (56)</td>
                <td>9 (69)</td>
                <td>.62</td>
                <td>17 (71)</td>
                <td>9 (56)</td>
                <td>.54</td>
                <td colspan="2">14 (36)</td>
                <td>8 (62)</td>
                <td>.19</td>
                <td>11 (46)</td>
                <td>8 (50)</td>
                <td>&#62;.99</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table3fn1">
              <p><sup>a</sup><italic>P</italic> values from chi-square scores comparing open access and non–open access case reports.</p>
            </fn>
            <fn id="table3fn2">
              <p><sup>b</sup><italic>P</italic> values from chi-square scores comparing case reports published prior to 2021 and case reports published in 2022.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Results</title>
        <p>This study has several main findings. First, our study demonstrates the accuracy of the differential diagnosis lists generated by ChatGPT-3.5 and ChatGPT-4 for complex clinical vignettes from case reports. The rate of correct diagnoses within the top 10 and top 5 differential diagnosis lists generated by ChatGPT-4 was &#62;80%. With a diagnostic accuracy of &#62;80%, ChatGPT-4 can serve as a supplementary tool for physicians, especially when dealing with complex cases. Our results have demonstrated that GPT possesses diagnostic capabilities that can be comparable to those of physicians. This suggests that GPT might serve as a form of collective intelligence, capable of double-checking clinical diagnoses made by medical practitioners, at the very least. Second, there were no statistically significant differences in the rates of correct diagnoses by ChatGPT-3.5 and ChatGPT-4 based on the open-access status or the publication date. Both GPT-3.5 and GPT-4 models were constructed using publicly available databases, with knowledge cutoffs set in early 2021 [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref26">26</xref>]. Therefore, we hypothesized that open access case reports could produce better diagnostic results than non–open access ones. Additionally, we postulated that the case reports published in the years prior to 2021 could produce better diagnostic results than the ones published in 2022. The absence of such differences in the actual results may be partly attributable to the limited sample size resulting from subdividing the case reports for the exploratory analysis.</p>
      </sec>
      <sec>
        <title>Potential Implications for Clinical Practice and Medical Education</title>
        <p>The integration of generative AI like ChatGPT into clinical settings could enhance patient care and streamline physician workflows. Given its pretraining accuracy of over 80%, physicians could receive immediate support in challenging cases, thereby minimizing diagnostic errors and enhancing patient outcomes. Furthermore, these AI systems could free up time for health care professionals, allowing them to focus on the more demanding aspects of patient care and thereby potentially improving health care efficiency. In an educational context, ChatGPT could be pivotal in shaping future physicians, especially in clinical reasoning and medical knowledge acquisition [<xref ref-type="bibr" rid="ref27">27</xref>]. Engaging with generative AIs can expose medical learners to an array of diagnoses, preparing them for complex clinical situations.</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>This study has several limitations. First, the study materials were obtained solely from complex case reports published by a single GIM department at a single center. Although these case reports provided insight into challenging diagnostic scenarios, they may not capture the full spectrum of patient presentations, even within the GIM department, as they were not randomly sampled but rather selected for their complexity, unusualness, or the challenges they posed for diagnosis. Therefore, our findings have limited external validity, as they may not be generalizable to other settings. The performance of the ChatGPT models might differ in simpler or more typical clinical presentations. Second, we acknowledge the possible bias in the differential diagnosis lists. They were created by experienced GIM physicians, implying that the results might not be applicable to lists created by physicians of different specialties or with various levels of training. It would be beneficial if future studies incorporated a wider array of participants. Third, there is a limitation associated with the accessibility and recency of our study. Specifically, 75% (39/52) of the case studies were published as open access, and approximately half of the case studies were published prior to 2021. Although we did not observe statistically significant differences regarding open access and publication timing, it is possible that ChatGPT-3.5, ChatGPT-4, and the physicians who created the differential diagnosis lists had learned these case materials directly or indirectly. The final limitation pertains to a possible time lag when generating differential diagnosis lists between ChatGPT-3.5 and ChatGPT-4. In light of these limitations, future research should assess the diagnostic accuracy of ChatGPT models by using properly tuned case materials that the model has not been trained on.</p>
      </sec>
      <sec>
        <title>Comparison With Prior Work</title>
        <p>Our previous study [<xref ref-type="bibr" rid="ref22">22</xref>] showed that the diagnostic accuracy of ChatGPT-3.5 was lower than that of physicians (25/30, 83% vs 59/60, 98%, respectively). In contrast, the findings of this study revealed that the rates of correct diagnoses within the top 10 (43/52, 83% vs 39/52, 75%, respectively) and top 5 (42/52, 81% vs 35/52, 67%, respectively) differential diagnosis lists, as well as the top diagnosis (31/52, 60% vs 26/52, 50%, respectively) generated by ChatGPT-4 were comparable to those by physicians. These results suggest the evolving performance of AI chatbots across different ChatGPT versions. Compared with those in the prior study [<xref ref-type="bibr" rid="ref22">22</xref>], the rates of correct diagnoses within the top 10 (38/52, 73% vs 28/30, 93%, respectively) and top 5 (34/52, 65% vs 25/30, 83%, respectively) differential diagnosis lists and top diagnosis (22/52, 42% vs 16/30, 53%, respectively) generated by ChatGPT-3 (or 3.5) were lower in this study. This discrepancy was largely attributed to this study’s emphasis on complex clinical case vignettes sourced from case reports within the GIM department, while the prior research focused on more common clinical presentations. Moreover, ChatGPT-4 provided better results in its differential diagnosis lists (43/52, 83% vs 45/70, 64%, respectively) and as its top diagnosis (31/52, 60% vs 27/70, 39%, respectively) compared with those reported in another study for New England Journal of Medicine clinicopathologic conferences [<xref ref-type="bibr" rid="ref21">21</xref>]. These variations can be partly ascribed to differences in the study designs, including case vignettes and systems.</p>
        <p>Compared with a previous review on symptom checkers [<xref ref-type="bibr" rid="ref5">5</xref>], the rate of correct diagnoses within the top 10 differential diagnoses generated by ChatGPT-4 was higher (43/52, 83% vs 60.9%-76.9%, respectively) in this study. Compared with a previous review on the differential diagnosis generator [<xref ref-type="bibr" rid="ref6">6</xref>], the rate of correct diagnoses within the top 10 differential diagnoses generated by ChatGPT-4 was higher (43/52, 83% vs 63%-77%, respectively) in this study. This discrepancy is partly due to differences in study designs, case materials, and algorithms. In the future, direct comparisons between ChatGPT and other CDS systems are required.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>This study demonstrates the potential diagnostic accuracy of the differential diagnosis lists generated by ChatGPT-3.5 and ChatGPT-4 by using complex clinical vignettes from case reports published by the GIM department. Notably, the rate of correct diagnoses within the top 10 and top 5 differential diagnosis lists generated by ChatGPT-4 exceeds 80%. Although these results stem from a limited data set of case reports from a single department, they indicate the potential utility of ChatGPT-4 as a supplementary tool for physicians, particularly for those affiliated with the GIM department. Future research should assess the diagnostic accuracy of ChatGPT models by using properly tuned case materials that the model has not been trained on. Additionally, future investigations should evaluate the literacy level of AIs and their alignment with relevant medical text. Such efforts will ensure a comprehensive insight into the AI’s possible roles in enhancing clinical decision-making processes. Moreover, as AI systems become more prevalent, their influence is expected to ripple across various facets of health care. Generative AIs have the potential to reshape patient-physician dynamics, fostering more informed interactions. They can also play a pivotal role in democratizing medical knowledge. This could lead to heightened health care accessibility, allowing even those in remote or underserved regions to glean expert medical advice. Given these profound implications, it becomes imperative to investigate the ramifications of AI integration into health care.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Case reports included in this study.</p>
        <media xlink:href="medinform_v11i1e48808_app1.pdf" xlink:title="PDF File  (Adobe PDF File), 52 KB"/>
      </supplementary-material>
      <supplementary-material id="app2">
        <label>Multimedia Appendix 2</label>
        <p>Final diagnosis and the differential diagnosis lists generated by ChatGPT and those created by physicians.</p>
        <media xlink:href="medinform_v11i1e48808_app2.pdf" xlink:title="PDF File  (Adobe PDF File), 203 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AI</term>
          <def>
            <p>artificial intelligence</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">CDS</term>
          <def>
            <p>clinical decision support</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">GIM</term>
          <def>
            <p>general internal medicine</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">GPT</term>
          <def>
            <p>generative pretrained transformer</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">GPT-3.5</term>
          <def>
            <p>third-generation generative pretrained transformer</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">GPT-4</term>
          <def>
            <p>fourth-generation generative pretrained transformer</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <fn-group>
      <fn fn-type="con">
        <p>TH, RK, YH, KM, KT, YK, T Suzuki, and T Shimizu contributed to the study concept and design. TH performed the statistical analyses. TH contributed to the drafting of the manuscript. RK, YH, KM, KT, YK, T Suzuki, and T Shimizu contributed to the critical revision of the manuscript for relevant intellectual content. All the authors have read and approved the final version of the manuscript. We would like to specially thank Dr Kenjiro Kakimoto, Department of Psychiatry, Nihon University School of Medicine, for helping us with the analysis. This study was conducted using resources from the Department of Diagnostics and Generalist Medicine at Dokkyo Medical University.</p>
      </fn>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Holmboe</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Durning</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Assessing clinical reasoning: moving from in vitro to in vivo</article-title>
          <source>Diagnosis (Berl)</source>
          <year>2014</year>
          <month>01</month>
          <day>01</day>
          <volume>1</volume>
          <issue>1</issue>
          <fpage>111</fpage>
          <lpage>117</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.degruyter.com/document/doi/10.1515/dx-2013-0029"/>
          </comment>
          <pub-id pub-id-type="doi">10.1515/dx-2013-0029</pub-id>
          <pub-id pub-id-type="medline">29539977</pub-id>
          <pub-id pub-id-type="pii">/j/dx.2014.1.issue-1/dx-2013-0029/dx-2013-0029.xml</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Harada</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Otaka</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Katsukura</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Shimizu</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Effect of contextual factors on the prevalence of diagnostic errors among patients managed by physicians of the same specialty: a single-centre retrospective observational study</article-title>
          <source>BMJ Qual Saf</source>
          <year>2023</year>
          <month>01</month>
          <day>23</day>
          <fpage>bmjqs-2022-015436</fpage>
          <pub-id pub-id-type="doi">10.1136/bmjqs-2022-015436</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Skinner</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Scott</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Martin</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Diagnostic errors in older patients: a systematic review of incidence and potential causes in seven prevalent diseases</article-title>
          <source>Int J Gen Med</source>
          <year>2016</year>
          <month>05</month>
          <fpage>137</fpage>
          <lpage>146</lpage>
          <pub-id pub-id-type="doi">10.2147/ijgm.s96741</pub-id>
          <pub-id pub-id-type="medline">27284262</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <collab>Committee on Diagnostic Error in Health Care</collab>
            <collab>Board on Health Care Services</collab>
            <name name-style="western">
              <surname>Balogh</surname>
              <given-names>EP</given-names>
            </name>
            <name name-style="western">
              <surname>Miller</surname>
              <given-names>BT</given-names>
            </name>
          </person-group>
          <article-title>Technology and tools in the diagnostic process</article-title>
          <source>Improving Diagnosis in Health Care</source>
          <year>2015</year>
          <month>12</month>
          <day>29</day>
          <publisher-loc>Washington DC</publisher-loc>
          <publisher-name>National Academies Press (US)</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Schmieding</surname>
              <given-names>ML</given-names>
            </name>
            <name name-style="western">
              <surname>Kopka</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Schmidt</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Schulz-Niethammer</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Balzer</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Feufel</surname>
              <given-names>MA</given-names>
            </name>
          </person-group>
          <article-title>Triage accuracy of symptom checker apps: 5-year follow-up evaluation</article-title>
          <source>J Med Internet Res</source>
          <year>2022</year>
          <month>05</month>
          <day>10</day>
          <volume>24</volume>
          <issue>5</issue>
          <fpage>e31810</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2022/5/e31810/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/31810</pub-id>
          <pub-id pub-id-type="medline">35536633</pub-id>
          <pub-id pub-id-type="pii">v24i5e31810</pub-id>
          <pub-id pub-id-type="pmcid">PMC9131144</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Riches</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Panagioti</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Alam</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Cheraghi-Sohi</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Campbell</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Esmail</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Bower</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>The effectiveness of electronic differential diagnoses (ddx) generators: a systematic review and meta-analysis</article-title>
          <source>PLoS One</source>
          <year>2016</year>
          <volume>11</volume>
          <issue>3</issue>
          <fpage>e0148991</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dx.plos.org/10.1371/journal.pone.0148991"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pone.0148991</pub-id>
          <pub-id pub-id-type="medline">26954234</pub-id>
          <pub-id pub-id-type="pii">PONE-D-15-38539</pub-id>
          <pub-id pub-id-type="pmcid">PMC4782994</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Greenes</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Chapter 2 - A brief history of clinical decision support: technical, social, cultural, economic, governmental perspectives</article-title>
          <source>Clinical Decision Support (Second Edition)</source>
          <year>2014</year>
          <month>03</month>
          <day>28</day>
          <publisher-loc>London, UK</publisher-loc>
          <publisher-name>Academic Press</publisher-name>
          <fpage>49</fpage>
          <lpage>109</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kawamura</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Harada</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Sugimoto</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Nagase</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Katsukura</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Shimizu</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Incidence of diagnostic errors among unexpectedly hospitalized patients using an automated medical history-taking system with a differential diagnosis generator: retrospective observational study</article-title>
          <source>JMIR Med Inform</source>
          <year>2022</year>
          <month>01</month>
          <day>27</day>
          <volume>10</volume>
          <issue>1</issue>
          <fpage>e35225</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://medinform.jmir.org/2022/1/e35225/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/35225</pub-id>
          <pub-id pub-id-type="medline">35084347</pub-id>
          <pub-id pub-id-type="pii">v10i1e35225</pub-id>
          <pub-id pub-id-type="pmcid">PMC8832260</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Meunier</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Raynaud</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Guimaraes</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Gueyffier</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Letrilliart</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Barriers and facilitators to the use of clinical decision support systems in primary care: a mixed-methods systematic review</article-title>
          <source>Ann Fam Med</source>
          <year>2023</year>
          <volume>21</volume>
          <issue>1</issue>
          <fpage>57</fpage>
          <lpage>69</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.annfammed.org/cgi/pmidlookup?view=long&#38;pmid=36690490"/>
          </comment>
          <pub-id pub-id-type="doi">10.1370/afm.2908</pub-id>
          <pub-id pub-id-type="medline">36690490</pub-id>
          <pub-id pub-id-type="pii">21/1/57</pub-id>
          <pub-id pub-id-type="pmcid">PMC9870646</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wani</surname>
              <given-names>SUD</given-names>
            </name>
            <name name-style="western">
              <surname>Khan</surname>
              <given-names>NA</given-names>
            </name>
            <name name-style="western">
              <surname>Thakur</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Gautam</surname>
              <given-names>SP</given-names>
            </name>
            <name name-style="western">
              <surname>Ali</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Alam</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Alshehri</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Ghoneim</surname>
              <given-names>MM</given-names>
            </name>
            <name name-style="western">
              <surname>Shakeel</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>Utilization of artificial intelligence in disease prevention: diagnosis, treatment, and implications for the healthcare workforce</article-title>
          <source>Healthcare (Basel)</source>
          <year>2022</year>
          <month>03</month>
          <day>24</day>
          <volume>10</volume>
          <issue>4</issue>
          <fpage>608</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.mdpi.com/resolver?pii=healthcare10040608"/>
          </comment>
          <pub-id pub-id-type="doi">10.3390/healthcare10040608</pub-id>
          <pub-id pub-id-type="medline">35455786</pub-id>
          <pub-id pub-id-type="pii">healthcare10040608</pub-id>
          <pub-id pub-id-type="pmcid">PMC9026833</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Haug</surname>
              <given-names>CJ</given-names>
            </name>
            <name name-style="western">
              <surname>Drazen</surname>
              <given-names>JM</given-names>
            </name>
          </person-group>
          <article-title>Artificial intelligence and machine learning in clinical medicine, 2023</article-title>
          <source>N Engl J Med</source>
          <year>2023</year>
          <month>03</month>
          <day>30</day>
          <volume>388</volume>
          <issue>13</issue>
          <fpage>1201</fpage>
          <lpage>1208</lpage>
          <pub-id pub-id-type="doi">10.1056/nejmra2302038</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <collab>No authors listed</collab>
          </person-group>
          <article-title>Will ChatGPT transform healthcare?</article-title>
          <source>Nat Med</source>
          <year>2023</year>
          <month>03</month>
          <volume>29</volume>
          <issue>3</issue>
          <fpage>505</fpage>
          <lpage>506</lpage>
          <pub-id pub-id-type="doi">10.1038/s41591-023-02289-5</pub-id>
          <pub-id pub-id-type="medline">36918736</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41591-023-02289-5</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Biswas</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT and the future of medical writing</article-title>
          <source>Radiology</source>
          <year>2023</year>
          <month>04</month>
          <volume>307</volume>
          <issue>2</issue>
          <fpage>e223312</fpage>
          <pub-id pub-id-type="doi">10.1148/radiol.223312</pub-id>
          <pub-id pub-id-type="medline">36728748</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Curtis</surname>
              <given-names>N</given-names>
            </name>
            <collab>ChatGPT</collab>
          </person-group>
          <article-title>To ChatGPT or not to ChatGPT? The impact of artificial intelligence on academic publishing</article-title>
          <source>Pediatr Infect Dis J</source>
          <year>2023</year>
          <month>04</month>
          <day>01</day>
          <volume>42</volume>
          <issue>4</issue>
          <fpage>275</fpage>
          <pub-id pub-id-type="doi">10.1097/INF.0000000000003852</pub-id>
          <pub-id pub-id-type="medline">36757192</pub-id>
          <pub-id pub-id-type="pii">00006454-202304000-00001</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kung</surname>
              <given-names>TH</given-names>
            </name>
            <name name-style="western">
              <surname>Cheatham</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Medenilla</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sillos</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>De Leon</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Elepaño</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Madriaga</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Aggabao</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Diaz-Candido</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Maningo</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Tseng</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Performance of ChatGPT on USMLE: Potential for AI-assisted medical education using large language models</article-title>
          <source>PLOS Digit Health</source>
          <year>2023</year>
          <month>02</month>
          <volume>2</volume>
          <issue>2</issue>
          <fpage>e0000198</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/36812645"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pdig.0000198</pub-id>
          <pub-id pub-id-type="medline">36812645</pub-id>
          <pub-id pub-id-type="pii">PDIG-D-22-00371</pub-id>
          <pub-id pub-id-type="pmcid">PMC9931230</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <collab>OpenAI</collab>
          </person-group>
          <article-title>GPT-4 technical report</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on March 15, 2023</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ui.adsabs.harvard.edu/abs/2023arXiv230308774O"/>
          </comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2303.08774</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Vaishya</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Misra</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Vaish</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT: Is this version good for healthcare and research?</article-title>
          <source>Diabetes Metab Syndr</source>
          <year>2023</year>
          <month>04</month>
          <volume>17</volume>
          <issue>4</issue>
          <fpage>102744</fpage>
          <pub-id pub-id-type="doi">10.1016/j.dsx.2023.102744</pub-id>
          <pub-id pub-id-type="medline">36989584</pub-id>
          <pub-id pub-id-type="pii">S1871-4021(23)00040-1</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Bubeck</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Petro</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Benefits, limits, and risks of GPT-4 as an AI chatbot for medicine</article-title>
          <source>N Engl J Med</source>
          <year>2023</year>
          <month>03</month>
          <day>30</day>
          <volume>388</volume>
          <issue>13</issue>
          <fpage>1233</fpage>
          <lpage>1239</lpage>
          <pub-id pub-id-type="doi">10.1056/nejmsr2214184</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Zhan</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT in scientific writing: A cautionary tale</article-title>
          <source>Am J Med</source>
          <year>2023</year>
          <month>08</month>
          <volume>136</volume>
          <issue>8</issue>
          <fpage>725</fpage>
          <lpage>726.e6</lpage>
          <pub-id pub-id-type="doi">10.1016/j.amjmed.2023.02.011</pub-id>
          <pub-id pub-id-type="medline">36906169</pub-id>
          <pub-id pub-id-type="pii">S0002-9343(23)00159-6</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>JH</given-names>
            </name>
            <name name-style="western">
              <surname>Dhaliwal</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Decoding artificial intelligence to achieve diagnostic excellence: learning from experts, examples, and experience</article-title>
          <source>JAMA</source>
          <year>2022</year>
          <month>08</month>
          <day>23</day>
          <volume>328</volume>
          <issue>8</issue>
          <fpage>709</fpage>
          <lpage>710</lpage>
          <pub-id pub-id-type="doi">10.1001/jama.2022.13735</pub-id>
          <pub-id pub-id-type="medline">35913752</pub-id>
          <pub-id pub-id-type="pii">2794998</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kanjee</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Crowe</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Rodman</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Accuracy of a generative artificial intelligence model in a complex diagnostic challenge</article-title>
          <source>JAMA</source>
          <year>2023</year>
          <month>07</month>
          <day>03</day>
          <volume>330</volume>
          <issue>1</issue>
          <fpage>78</fpage>
          <lpage>80</lpage>
          <pub-id pub-id-type="doi">10.1001/jama.2023.8288</pub-id>
          <pub-id pub-id-type="medline">37318797</pub-id>
          <pub-id pub-id-type="pii">2806457</pub-id>
          <pub-id pub-id-type="pmcid">PMC10273128</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hirosawa</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Harada</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Yokose</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Sakamoto</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Kawamura</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Shimizu</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Diagnostic accuracy of differential-diagnosis lists generated by generative pretrained transformer 3 chatbot for clinical vignettes with common chief complaints: a pilot study</article-title>
          <source>Int J Environ Res Public Health</source>
          <year>2023</year>
          <month>02</month>
          <day>15</day>
          <volume>20</volume>
          <issue>4</issue>
          <fpage>3378</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.mdpi.com/resolver?pii=ijerph20043378"/>
          </comment>
          <pub-id pub-id-type="doi">10.3390/ijerph20043378</pub-id>
          <pub-id pub-id-type="medline">36834073</pub-id>
          <pub-id pub-id-type="pii">ijerph20043378</pub-id>
          <pub-id pub-id-type="pmcid">PMC9967747</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jinno</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Hirosawa</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Shimizu</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Hepatic portal venous gas after diving</article-title>
          <source>BMJ Case Rep</source>
          <year>2018</year>
          <month>01</month>
          <day>12</day>
          <fpage>bcr-2017-223844</fpage>
          <pub-id pub-id-type="doi">10.1136/bcr-2017-223844</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Krupat</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Wormwood</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Schwartzstein</surname>
              <given-names>RM</given-names>
            </name>
            <name name-style="western">
              <surname>Richards</surname>
              <given-names>JB</given-names>
            </name>
          </person-group>
          <article-title>Avoiding premature closure and reaching diagnostic accuracy: some key predictive factors</article-title>
          <source>Med Educ</source>
          <year>2017</year>
          <month>11</month>
          <volume>51</volume>
          <issue>11</issue>
          <fpage>1127</fpage>
          <lpage>1137</lpage>
          <pub-id pub-id-type="doi">10.1111/medu.13382</pub-id>
          <pub-id pub-id-type="medline">28857266</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Armstrong</surname>
              <given-names>RA</given-names>
            </name>
          </person-group>
          <article-title>When to use the Bonferroni correction</article-title>
          <source>Ophthalmic Physiol Opt</source>
          <year>2014</year>
          <month>09</month>
          <volume>34</volume>
          <issue>5</issue>
          <fpage>502</fpage>
          <lpage>508</lpage>
          <pub-id pub-id-type="doi">10.1111/opo.12131</pub-id>
          <pub-id pub-id-type="medline">24697967</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zong</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Krishnamachari</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>A survey on GPT-3</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on December 1, 2022</comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2212.00857</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hirosawa</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Shimizu</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Enhancing clinical reasoning with chat generative pre-trained transformer: A practical guide</article-title>
          <source>Diagnosis (Berl)</source>
          <year>2023</year>
          <volume>11</volume>
          <issue>1</issue>
          <fpage>A</fpage>
          <pub-id pub-id-type="doi">10.1515/dx-2023-0116</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
