<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.0" xmlns:xlink="http://www.w3.org/1999/xlink">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMI</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id>
      <journal-title>JMIR Medical Informatics</journal-title>
      <issn pub-type="epub">2291-9694</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v12i1e55799</article-id>
      <article-id pub-id-type="pmid">39018102</article-id>
      <article-id pub-id-type="doi">10.2196/55799</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Evaluating Large Language Models for Automated Reporting and Data Systems Categorization: Cross-Sectional Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Lovis</surname>
            <given-names>Christian</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Liu</surname>
            <given-names>Zhenyu</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Bu</surname>
            <given-names>Dechao</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Sure</surname>
            <given-names>Tharun Anand Reddy</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Nuthakki</surname>
            <given-names>Siddhartha</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Zhu</surname>
            <given-names>Lingxuan</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Wu</surname>
            <given-names>Qingxia</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-4888-8830</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Wu</surname>
            <given-names>Qingxia</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-6214-8033</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Li</surname>
            <given-names>Huali</given-names>
          </name>
          <degrees>MM</degrees>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0004-9122-9018</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Wang</surname>
            <given-names>Yan</given-names>
          </name>
          <degrees>MM</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-2316-772X</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Bai</surname>
            <given-names>Yan</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-2421-4129</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author">
          <name name-style="western">
            <surname>Wu</surname>
            <given-names>Yaping</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-2979-5332</ext-link>
        </contrib>
        <contrib id="contrib7" contrib-type="author">
          <name name-style="western">
            <surname>Yu</surname>
            <given-names>Xuan</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0001-4832-4034</ext-link>
        </contrib>
        <contrib id="contrib8" contrib-type="author">
          <name name-style="western">
            <surname>Li</surname>
            <given-names>Xiaodong</given-names>
          </name>
          <degrees>MM</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-8467-9679</ext-link>
        </contrib>
        <contrib id="contrib9" contrib-type="author">
          <name name-style="western">
            <surname>Dong</surname>
            <given-names>Pei</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-7204-0621</ext-link>
        </contrib>
        <contrib id="contrib10" contrib-type="author">
          <name name-style="western">
            <surname>Xue</surname>
            <given-names>Jon</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff5" ref-type="aff">5</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-3207-0890</ext-link>
        </contrib>
        <contrib id="contrib11" contrib-type="author">
          <name name-style="western">
            <surname>Shen</surname>
            <given-names>Dinggang</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff5" ref-type="aff">5</xref>
          <xref rid="aff6" ref-type="aff">6</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-7934-5698</ext-link>
        </contrib>
        <contrib id="contrib12" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Wang</surname>
            <given-names>Meiyun</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Department of Medical Imaging</institution>
            <institution>Henan Provincial People’s Hospital &amp; People’s Hospital of Zhengzhou University</institution>
            <addr-line>No 7, Weiwu Road, Jinshui District</addr-line>
            <addr-line>Zhengzhou, 450001</addr-line>
            <country>China</country>
            <phone>86 037165580267</phone>
            <email>mywang@zzu.edu.cn</email>
          </address>
          <xref rid="aff7" ref-type="aff">7</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-7163-2617</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Department of Medical Imaging</institution>
        <institution>Henan Provincial People’s Hospital &amp; People’s Hospital of Zhengzhou University</institution>
        <addr-line>Zhengzhou</addr-line>
        <country>China</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Research Intelligence Department</institution>
        <institution>Beijing United Imaging Research Institute of Intelligent Imaging</institution>
        <addr-line>Beijing</addr-line>
        <country>China</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Research and Collaboration</institution>
        <institution>United Imaging Intelligence (Beijing) Co, Ltd</institution>
        <addr-line>Beijing</addr-line>
        <country>China</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>Department of Radiology</institution>
        <institution>Luoyang Central Hospital</institution>
        <addr-line>Luoyang</addr-line>
        <country>China</country>
      </aff>
      <aff id="aff5">
        <label>5</label>
        <institution>Research and Collaboration</institution>
        <institution>Shanghai United Imaging Intelligence Co, Ltd</institution>
        <addr-line>Shanghai</addr-line>
        <country>China</country>
      </aff>
      <aff id="aff6">
        <label>6</label>
        <institution>School of Biomedical Engineering</institution>
        <institution>Shanghai Tech University</institution>
        <addr-line>Shanghai</addr-line>
        <country>China</country>
      </aff>
      <aff id="aff7">
        <label>7</label>
        <institution>Biomedical Research Institute</institution>
        <institution>Henan Academy of Sciences</institution>
        <addr-line>Zhengzhou</addr-line>
        <country>China</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Meiyun Wang <email>mywang@zzu.edu.cn</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2024</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>17</day>
        <month>7</month>
        <year>2024</year>
      </pub-date>
      <volume>12</volume>
      <elocation-id>e55799</elocation-id>
      <history>
        <date date-type="received">
          <day>25</day>
          <month>12</month>
          <year>2023</year>
        </date>
        <date date-type="rev-request">
          <day>14</day>
          <month>1</month>
          <year>2024</year>
        </date>
        <date date-type="rev-recd">
          <day>2</day>
          <month>2</month>
          <year>2024</year>
        </date>
        <date date-type="accepted">
          <day>25</day>
          <month>5</month>
          <year>2024</year>
        </date>
      </history>
      <copyright-statement>©Qingxia Wu, Qingxia Wu, Huali Li, Yan Wang, Yan Bai, Yaping Wu, Xuan Yu, Xiaodong Li, Pei Dong, Jon Xue, Dinggang Shen, Meiyun Wang. Originally published in JMIR Medical Informatics (https://medinform.jmir.org), 17.07.2024.</copyright-statement>
      <copyright-year>2024</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on https://medinform.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://medinform.jmir.org/2024/1/e55799" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Large language models show promise for improving radiology workflows, but their performance on structured radiological tasks such as Reporting and Data Systems (RADS) categorization remains unexplored.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study aims to evaluate 3 large language model chatbots—Claude-2, GPT-3.5, and GPT-4—on assigning RADS categories to radiology reports and assess the impact of different prompting strategies.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>This cross-sectional study compared 3 chatbots using 30 radiology reports (10 per RADS criteria), using a 3-level prompting strategy: zero-shot, few-shot, and guideline PDF-informed prompts. The cases were grounded in Liver Imaging Reporting &amp; Data System (LI-RADS) version 2018, Lung CT (computed tomography) Screening Reporting &amp; Data System (Lung-RADS) version 2022, and Ovarian-Adnexal Reporting &amp; Data System (O-RADS) magnetic resonance imaging, meticulously prepared by board-certified radiologists. Each report underwent 6 assessments. Two blinded reviewers assessed the chatbots’ response at patient-level RADS categorization and overall ratings. The agreement across repetitions was assessed using Fleiss κ.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>Claude-2 achieved the highest accuracy in overall ratings with few-shot prompts and guideline PDFs (prompt-2), attaining 57% (17/30) average accuracy over 6 runs and 50% (15/30) accuracy with k-pass voting. Without prompt engineering, all chatbots performed poorly. The introduction of a structured exemplar prompt (prompt-1) increased the accuracy of overall ratings for all chatbots. Providing prompt-2 further improved Claude-2’s performance, an enhancement not replicated by GPT-4. The interrun agreement was substantial for Claude-2 (κ=0.66 for overall rating and κ=0.69 for RADS categorization), fair for GPT-4 (κ=0.39 for both), and fair for GPT-3.5 (κ=0.21 for overall rating and κ=0.39 for RADS categorization). All chatbots showed significantly higher accuracy with LI-RADS version 2018 than with Lung-RADS version 2022 and O-RADS (<italic>P</italic>&lt;.05); with prompt-2, Claude-2 achieved the highest overall rating accuracy of 75% (45/60) in LI-RADS version 2018.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>When equipped with structured prompts and guideline PDFs, Claude-2 demonstrated potential in assigning RADS categories to radiology cases according to established criteria such as LI-RADS version 2018. However, the current generation of chatbots lags in accurately categorizing cases based on more recent RADS criteria.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>Radiology Reporting and Data Systems</kwd>
        <kwd>LI-RADS</kwd>
        <kwd>Lung-RADS</kwd>
        <kwd>O-RADS</kwd>
        <kwd>large language model</kwd>
        <kwd>ChatGPT</kwd>
        <kwd>chatbot</kwd>
        <kwd>chatbots</kwd>
        <kwd>categorization</kwd>
        <kwd>recommendation</kwd>
        <kwd>recommendations</kwd>
        <kwd>accuracy</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>Since ChatGPT’s public release in November 2022, large language models (LLMs) have attracted great interest in medical imaging applications [<xref ref-type="bibr" rid="ref1">1</xref>]. Research indicated that ChatGPT showed promising applications in various aspects of the medical imaging process. Even without radiology-specific pretraining, LLMs can pass board examinations [<xref ref-type="bibr" rid="ref2">2</xref>], provide radiology decision support [<xref ref-type="bibr" rid="ref3">3</xref>], assist in differential diagnosis [<xref ref-type="bibr" rid="ref3">3</xref>-<xref ref-type="bibr" rid="ref6">6</xref>], and generate impressions from findings or structured reports [<xref ref-type="bibr" rid="ref7">7</xref>-<xref ref-type="bibr" rid="ref9">9</xref>]. These applications not only accelerate the imaging diagnosis process and alleviate the workload of doctors but also improve the accuracy of diagnosis [<xref ref-type="bibr" rid="ref10">10</xref>]. However, limitations exist, with 1 study showing that ChatGPT-3 produced erroneous answers for a third of daily clinical questions and that about 63% of the references it provided could not be found [<xref ref-type="bibr" rid="ref11">11</xref>]. ChatGPT’s dangerous tendency to produce inaccurate responses is less frequent in GPT-4 but still limits usability in medical education and practice at present [<xref ref-type="bibr" rid="ref12">12</xref>]. Tailoring LLMs to radiology may enhance reliability, as an appropriateness criteria context-aware chatbot outperformed generic chatbots and radiologists [<xref ref-type="bibr" rid="ref12">12</xref>].</p>
      <p>The American College of Radiology Reporting and Data Systems (RADS) standardizes communication of imaging findings. As of August 2023, there have been 9 disease-specific systems endorsed by the American College of Radiology, referring to products from the lexicons to report templates [<xref ref-type="bibr" rid="ref13">13</xref>]. RADS reduces terminology variability, facilitates communication between radiologists and referring physicians, allows consistent evaluations, and conveys clinical significance to improve care. However, complexity and unfamiliarity limit adoption. Consequently, endeavors should be pursued to broaden the implementation of RADS. Therefore, we conducted this study to evaluate LLM’s capabilities on a focused RADS assignment task for radiology reports.</p>
      <p>A prompt serves as a directive or instruction given to LLMs to generate a particular response. The technique of “prompt tuning” has emerged as a valuable approach to refine the performance of LLMs, particularly for specific domains or tasks [<xref ref-type="bibr" rid="ref14">14</xref>]. By providing structured queries or exemplary responses, the output of chatbots can be tailored for accurate and relevant answers. Such prompt-tuning strategies leverage LLMs’ knowledge while guiding appropriate delivery for particular challenges [<xref ref-type="bibr" rid="ref14">14</xref>]. Given the complexity and specificity of the RADS categorization, our investigation emphasizes different prompt impacts to assess chatbot capabilities and potential performance enhancement through refined prompt tuning.</p>
      <p>In this study, our primary objective was to meticulously evaluate the performance of 3 LLMs (GPT-3.5, GPT-4, and Claude-2) for RADS categorization using different prompt-tuning strategies. We aimed to test their accuracy and consistency in RADS categorization and shed light on the potential benefits and limitations of relying on chatbot-derived information for the categorization of specific RADS.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Ethical Considerations</title>
        <p>As the study was based on radiological data that were artificially generated by radiologists and did not involve the participation of human subjects, the study was determined to be exempt from ethical review, in accordance with the regulations established by the institutional review board of Henan Provincial People’s Hospital.</p>
      </sec>
      <sec>
        <title>Study Design</title>
        <p>The workflow of the study is shown in <xref rid="figure1" ref-type="fig">Figure 1</xref>. We conducted a cross-sectional analysis in September 2023 to evaluate the competency of 3 chatbots—GPT-3.5, GPT-4 (OpenAI, August 30, 2023 version) [<xref ref-type="bibr" rid="ref15">15</xref>], and Claude-2 (Anthropic) [<xref ref-type="bibr" rid="ref16">16</xref>]—in the task of assigning 3 RADS categorizations to radiology reports. Given that the chatbots’ knowledge cutoff was September 2021, we opted for Liver Imaging Reporting &amp; Data System (LI-RADS) version 2018 [<xref ref-type="bibr" rid="ref17">17</xref>], Lung CT (computed tomography) Screening Reporting &amp; Data System (Lung-RADS) version 2022 [<xref ref-type="bibr" rid="ref18">18</xref>], and Ovarian-Adnexal Reporting &amp; Data System (O-RADS) magnetic resonance imaging (MRI) (developed in 2022) [<xref ref-type="bibr" rid="ref19">19</xref>] as the yardsticks to compare the responses engendered by GPT-3.5, GPT-4, and Claude-2. A total of 30 radiology reports for either CT or MRI examinations were composed for this analysis, with 10 cases representing each of the 3 RADS reporting systems. The radiology reports used for testing were generated by radiologists with more than 10 years’ experience to correct the wording styles from real-life cases based on respective RADS systems. For each RADS (ie, LI-, Lung-, and O-RADS), we attempted to reflect the complexity and diversity so that the reports cover typical cases in clinical practice. Therefore, reports with 2-3 simple cases and 7-8 challenging cases were generated for 1 RADS. These include scenarios such as prior examination comparison, the presence of multiple nodules, extensive categorization under different RADS systems, and updates from the most recent LI-RADS and Lung-RADS guidelines. The characteristics of radiology reports for each RADS and the distribution of the number of the reports across the 3 RADS are shown in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. The objective was to evaluate the performance of chatbots on a highly structured radiology workflow task involving cancer risk categorization based on structured report inputs. The study design focused on a defined use case to illuminate the strengths and limitations of existing natural language-processing technology in this radiology subdomain.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Flowchart of the study design. CT: computed tomography; LI-RADS: Liver Imaging Reporting &amp; Data System; Lung-RADS: Lung CT Screening Reporting &amp; Data System; MRI: magnetic resonance imaging; O-RADS: Ovarian-Adnexal Reporting &amp; Data System; RADS: Reporting and Data Systems.</p>
          </caption>
          <graphic xlink:href="medinform_v12i1e55799_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Prompts</title>
        <p>We collected and analyzed responses from GPT-3.5, GPT-4, and Claude-2 for each case. To mitigate bias, the radiological findings were presented individually via separate interactions, with corresponding responses saved for analysis. Three prompt templates were designed to elicit each RADS categorization along with explanatory rationale: Prompt-0 was a zero-shot prompt, merely introducing the RADS assignment task, such as “Your task is to follow Lung-RADS version 2022 guideline to give Lung-RADS category of the radiological findings delimited by angle brackets.”</p>
        <p>Prompt-1 was a few-shot prompt, furnishing an exemplar of RADS categorization including the reasoning, summarized impression, and final category. The following is an example:</p>
        <disp-quote>
          <p>Your task is to follow Lung-RADS version 2022 guideline to give Lung-RADS category of the radiological findings delimited by angle brackets. """ &lt; …Radiological Findings… &gt; Answer: Rationale: {…} Overall: {…} Summary: {…} Lung-RADS Category: X """</p>
        </disp-quote>
        <p>Prompt-2 distinctly instructed chatbots to consult the PDF of corresponding RADS guidelines, compensating for these chatbots’ lack of radiology-specific pretraining. For Claude-2, the PDF could be directly ingested, while GPT-4 required the use of an “Ask for PDF” plug-in to extract pertinent information [<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref21">21</xref>].</p>
        <p>Each case was evaluated 6 times with each chatbot across the 3 prompt levels. The representative radiological reports and prompts are shown in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>. The links to all the prompts and guideline PDFs are shown in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>.</p>
      </sec>
      <sec>
        <title>Evaluation of Chatbots</title>
        <p>Two study authors (QW and HL) independently evaluated each chatbot response in a blinded manner, with any discrepancies resolved by a third senior radiologist (YW). The following were assessed for each response:</p>
        <list list-type="order">
          <list-item>
            <p>Patient-level RADS categorization: judged as correct, incorrect, or unsure. “Correct” denotes that the chatbot accurately identified the patient-level RADS category, irrespective of the rationale provided. “Unsure” denotes that the chatbot’s response failed to provide a decisive RADS category. For example, a response articulating that “a definitive Lung-RADS category cannot be assigned” would be categorized as “unsure.”</p>
          </list-item>
          <list-item>
            <p>Overall rating: assessed as either correct or incorrect. A response is judged incorrect if any errors (Es) are identified, including the following:</p>
            <list>
              <list-item>
                <p>E1: a factual extraction error that denotes the chatbots’ inability to paraphrase the radiological findings accurately, consequently misinterpreting the information.</p>
              </list-item>
              <list-item>
                <p>E2: hallucination, encompassing the fabrication of nonexistent RADS categories (E2a) and RADS criteria (E2b).</p>
              </list-item>
              <list-item>
                <p>E3: a reasoning error, which includes the incapacity to logically interpret the imaging description (E3a) and the RADS category accurately (E3b). The subtype errors for reasoning imaging description include the inability to reason lesion signal (E3ai), lesion size (E3aii), and enhancement (E3aiii) accurately.</p>
              </list-item>
              <list-item>
                <p>E4: an explanatory error, encompassing inaccurate elucidation of RADS category meaning (E4a) and erroneous explanation of the recommended management and follow-up corresponding to the RADS category (E4b).</p>
              </list-item>
            </list>
          </list-item>
        </list>
        <p>If a chatbot’s feedback manifested any of the aforementioned infractions, it was labeled as incorrect, with the specific type of error documented. To assess the consistency of the evaluations, a k-pass voting method was also applied. Specifically, a case was deemed accurately categorized if it met the criteria in a minimum of 4 out of the 6 runs.</p>
      </sec>
      <sec>
        <title>Statistical Analyses</title>
        <p>The accuracy of the patient-level RADS categorization and overall rating for each chatbot was compared using the chi-square test. The agreement across the 6 repeated runs was assessed using Fleiss κ. Agreement strength was interpreted as follows: &lt;0 signified poor, 0-0.20 indicated slight, 0.21-0.40 represented fair, 0.41-0.60 was interpreted as moderate, 0.61-0.80 denoted substantial, and 0.81-1 was characterized as almost perfect. Statistical significance was defined as 2-sided <italic>P</italic>&lt;.05. All analyses were performed using R statistical software (version 4.1.2; The R Foundation).</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Performance of Chatbots</title>
        <p>The performance of chatbots is shown in <xref rid="figure2" ref-type="fig">Figure 2</xref> and <xref ref-type="table" rid="table1">Tables 1</xref> and <xref ref-type="table" rid="table2">2</xref>, with the links to case-level details provided in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>. For the overall rating (<xref ref-type="table" rid="table1">Table 1</xref>, average row and <xref rid="figure2" ref-type="fig">Figure 2</xref>A), Claude-2 with prompt-2 demonstrated significantly higher average accuracy across the 30 cases than Claude-2 with prompt-0 (odds ratio [OR] 8.16; <italic>P</italic>&lt;.001). GPT-4 with prompt-2 also showed improved average accuracy compared with GPT-4 with prompt-0, but the difference was not statistically significant (OR 3.19; <italic>P</italic>=.13). When using the k-pass voting method (<xref ref-type="table" rid="table1">Table 1</xref>, k-pass voting row), Claude-2 with prompt-2 had significantly higher accuracy than Claude-2 with prompt-0 (OR 8.65; <italic>P</italic>=.002). Similarly, GPT-4 with prompt-2 was significantly more accurate than GPT-4 with prompt-0 (OR 11.98; <italic>P</italic>=.01). For the exact assignment of the patient-level RADS categorization (<xref ref-type="table" rid="table2">Table 2</xref>, average row and <xref rid="figure2" ref-type="fig">Figure 2</xref>B), Claude-2 with prompt-2 showed significantly higher average accuracy than Claude-2 with prompt-0 (<italic>P</italic>=.04).</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>Bar graphs show the comparison of chatbot performance across 6 runs regarding (A) overall rating and (B) patient-level Reporting and Data Systems categorization.</p>
          </caption>
          <graphic xlink:href="medinform_v12i1e55799_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Correct overall ratings of different chatbots and prompts.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="250"/>
            <col width="0"/>
            <col width="240"/>
            <col width="240"/>
            <col width="0"/>
            <col width="240"/>
            <thead>
              <tr valign="top">
                <td colspan="3">Chatbots and prompts</td>
                <td>Prompt-0, n (%; 95% CI)</td>
                <td colspan="2">Prompt-1, n (%; 95% CI)</td>
                <td>Prompt-2, n (%; 95% CI)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="7">
                  <bold>GPT-3.5</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Run 1</td>
                <td colspan="2">3 (10; 3-28)</td>
                <td>9 (30; 15-50)</td>
                <td colspan="2">N/A<sup>a</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Run 2</td>
                <td colspan="2">3 (10; 3-28)</td>
                <td>9 (30; 15-50)</td>
                <td colspan="2">N/A</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Run 3</td>
                <td colspan="2">4 (13; 4-32)</td>
                <td>7 (23; 11-43)</td>
                <td colspan="2">N/A</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Run 4</td>
                <td colspan="2">4 (13; 4-32)</td>
                <td>5 (17; 6-35)</td>
                <td colspan="2">N/A</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Run 5</td>
                <td colspan="2">3 (10; 3-28)</td>
                <td>6 (20; 8-39)</td>
                <td colspan="2">N/A</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Run 6</td>
                <td colspan="2">3 (10; 3-28)</td>
                <td>4 (13; 4-32)</td>
                <td colspan="2">N/A</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Average<sup>b</sup></td>
                <td colspan="2">3 (10; 3-28)</td>
                <td>7 (23; 11-43)</td>
                <td colspan="2">N/A</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>K-pass voting<sup>c</sup></td>
                <td colspan="2">1 (3; 0-19)</td>
                <td>2 (7; 1-24)</td>
                <td colspan="2">N/A</td>
              </tr>
              <tr valign="top">
                <td colspan="7">
                  <bold>GPT-4</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Run 1</td>
                <td colspan="2">4 (13; 4-32)</td>
                <td>11 (37; 21-56)</td>
                <td colspan="2">12 (40; 23-59)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Run 2</td>
                <td colspan="2">4 (13; 4-32)</td>
                <td>7 (23; 11-43)</td>
                <td colspan="2">8 (27; 13-46)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Run 3</td>
                <td colspan="2">4 (13; 4-32)</td>
                <td>9 (30; 15-50)</td>
                <td colspan="2">9 (30; 15-50)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Run 4</td>
                <td colspan="2">2 (7; 1-24)</td>
                <td>9 (30; 15-50)</td>
                <td colspan="2">13 (43; 26-62)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Run 5</td>
                <td colspan="2">5 (17; 6-35)</td>
                <td>11 (37; 21-56)</td>
                <td colspan="2">8 (27; 13-46)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Run 6</td>
                <td colspan="2">6 (20; 8-39)</td>
                <td>9 (30; 15-50)</td>
                <td colspan="2">8 (27; 13-46)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Average<sup>b</sup></td>
                <td colspan="2">4 (13; 4-32)</td>
                <td>9 (30; 15-50)</td>
                <td colspan="2">10 (33; 18-53)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>K-pass voting<sup>c</sup></td>
                <td colspan="2">1 (3; 0-19)</td>
                <td>6 (20; 8-39)</td>
                <td colspan="2">9 (30; 15-50)<sup>d</sup></td>
              </tr>
              <tr valign="top">
                <td colspan="7">
                  <bold>Claude-2</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Run 1</td>
                <td colspan="2">4 (13; 4-32)</td>
                <td>10 (33; 18-53)</td>
                <td colspan="2">19 (63; 44-79)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Run 2</td>
                <td colspan="2">5 (17; 6-35)</td>
                <td>8 (27; 13-46)</td>
                <td colspan="2">16 (53; 35-71)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Run 3</td>
                <td colspan="2">5 (17; 6-35)</td>
                <td>7 (23; 11-43)</td>
                <td colspan="2">15 (50; 33-67)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Run 4</td>
                <td colspan="2">5 (17; 6-35)</td>
                <td>6 (20; 8-39)</td>
                <td colspan="2">17 (57; 38-74)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Run 5</td>
                <td colspan="2">3 (10; 3-28)</td>
                <td>7 (23; 11-43)</td>
                <td colspan="2">18 (60; 41-77)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Run 6</td>
                <td colspan="2">3 (10; 3-28)</td>
                <td>7 (23; 11-43)</td>
                <td colspan="2">14 (47; 29-65)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Average<sup>b</sup></td>
                <td colspan="2">4 (13; 4-32)</td>
                <td>8 (27; 13-46)</td>
                <td colspan="2">17 (57; 38-74)<sup>d</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>K-pass voting<sup>c</sup></td>
                <td colspan="2">3 (10; 3-28)</td>
                <td>7 (23; 11-43)</td>
                <td colspan="2">15 (50; 33-67)<sup>d</sup></td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table1fn1">
              <p><sup>a</sup>N/A: not applicable.</p>
            </fn>
            <fn id="table1fn2">
              <p><sup>b</sup>Accuracy by the average method.</p>
            </fn>
            <fn id="table1fn3">
              <p><sup>c</sup>Accuracy by k-pass voting (≥4/6 runs correct).</p>
            </fn>
            <fn id="table1fn4">
              <p><sup>d</sup>Significant between prompt-0 and prompt-2.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>The number of correct, incorrect, and unsure responses for patient-level Reporting and Data Systems categorization across different chatbots and prompts.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="190"/>
            <col width="0"/>
            <col width="90"/>
            <col width="0"/>
            <col width="90"/>
            <col width="0"/>
            <col width="90"/>
            <col width="0"/>
            <col width="90"/>
            <col width="0"/>
            <col width="90"/>
            <col width="0"/>
            <col width="100"/>
            <col width="0"/>
            <col width="100"/>
            <col width="0"/>
            <col width="130"/>
            <thead>
              <tr valign="top">
                <td colspan="3">Chatbots and prompts</td>
                <td colspan="15">Correct/incorrect/unsure patient-level Reporting and Data Systems categories, n/n/n</td>
              </tr>
              <tr valign="bottom">
                <td colspan="3">
                  <break/>
                </td>
                <td colspan="2">Run 1</td>
                <td colspan="2">Run 2</td>
                <td colspan="2">Run 3</td>
                <td colspan="2">Run 4</td>
                <td colspan="2">Run 5</td>
                <td colspan="2">Run 6</td>
                <td colspan="2">Average<sup>a</sup></td>
                <td>K-pass voting<sup>b</sup></td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="18">
                  <bold>GPT-3.5</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Prompt-0</td>
                <td colspan="2">7/23/0</td>
                <td colspan="2">7/23/0</td>
                <td colspan="2">7/23/0</td>
                <td colspan="2">9/21/0</td>
                <td colspan="2">8/21/1</td>
                <td colspan="2">8/20/2</td>
                <td colspan="2">8/22/0</td>
                <td colspan="2">7/23/0</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Prompt-1</td>
                <td colspan="2">13/15/2</td>
                <td colspan="2">11/19/0</td>
                <td colspan="2">8/21/1</td>
                <td colspan="2">8/21/1</td>
                <td colspan="2">11/19/0</td>
                <td colspan="2">8/22/0</td>
                <td colspan="2">10/20/0</td>
                <td colspan="2">7/23/0</td>
              </tr>
              <tr valign="top">
                <td colspan="18">
                  <bold>GPT-4</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Prompt-0</td>
                <td colspan="2">10/20/0</td>
                <td colspan="2">8/19/3</td>
                <td colspan="2">9/20/1</td>
                <td colspan="2">8/22/0</td>
                <td colspan="2">16/14/0</td>
                <td colspan="2">13/15/2</td>
                <td colspan="2">11/18/1</td>
                <td colspan="2">8/22/0</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Prompt-1</td>
                <td colspan="2">15/14/1</td>
                <td colspan="2">10/18/2</td>
                <td colspan="2">11/18/1</td>
                <td colspan="2">14/15/1</td>
                <td colspan="2">15/14/1</td>
                <td colspan="2">12/18/0</td>
                <td colspan="2">13/16/1</td>
                <td colspan="2">11/19/0</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Prompt-2</td>
                <td colspan="2">13/16/1</td>
                <td colspan="2">11/18/1</td>
                <td colspan="2">12/18/0</td>
                <td colspan="2">14/16/0</td>
                <td colspan="2">9/21/0</td>
                <td colspan="2">11/16/3</td>
                <td colspan="2">12/18/0</td>
                <td colspan="2">11/19/0</td>
              </tr>
              <tr valign="top">
                <td colspan="18">
                  <bold>Claude-2</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Prompt-0</td>
                <td colspan="2">13/17/0</td>
                <td colspan="2">12/18/0</td>
                <td colspan="2">12/18/0</td>
                <td colspan="2">15/15/0</td>
                <td colspan="2">10/20/0</td>
                <td colspan="2">9/21/0</td>
                <td colspan="2">12/18/0</td>
                <td colspan="2">13/17/0</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Prompt-1</td>
                <td colspan="2">11/19/0</td>
                <td colspan="2">14/16/0</td>
                <td colspan="2">11/19/0</td>
                <td colspan="2">11/19/0</td>
                <td colspan="2">13/17/0</td>
                <td colspan="2">12/18/0</td>
                <td colspan="2">12/18/0</td>
                <td colspan="2">11/19/0</td>
              </tr>
              <tr valign="bottom">
                <td>
                  <break/>
                </td>
                <td>Prompt-2</td>
                <td colspan="2">21/9/0</td>
                <td colspan="2">21/9/0</td>
                <td colspan="2">20/10/0</td>
                <td colspan="2">22/8/0</td>
                <td colspan="2">21/9/0</td>
                <td colspan="2">21/8/1</td>
                <td colspan="2">21/9/0<sup>c</sup></td>
                <td colspan="2">21/9/0</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>Accuracy by the average method.</p>
            </fn>
            <fn id="table2fn2">
              <p><sup>b</sup>Accuracy by k-pass voting (≥4/6 runs correct).</p>
            </fn>
            <fn id="table2fn3">
              <p><sup>c</sup>Significant between prompt-0 and prompt-2.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Consistency of Chatbots</title>
        <p>As shown in <xref ref-type="table" rid="table3">Table 3</xref>, among the 30 cases evaluated in 6 runs, Claude-2 with prompt-2 showed substantial agreement (<italic>k</italic>=0.65 for overall rating; <italic>k</italic>=0.74 for RADS categorization). GPT-4, when interfaced with prompt-2, demonstrated moderate agreement (<italic>k</italic>=0.46 for overall rating; <italic>k</italic>=0.41 for RADS categorization). When evaluated with prompt-1, GPT-4 presented moderate agreement (<italic>k</italic>=0.38 for overall rating; <italic>k</italic>=0.42 for RADS categorization). In contrast, Claude-2 showed substantial agreement (<italic>k</italic>=0.63 for overall rating; <italic>k</italic>=0.61 for RADS categorization), while GPT-3.5 exhibited a range from slight to fair agreement. With prompt-0, Claude-2 showed moderate agreement (<italic>k</italic>=0.49) for overall rating and substantial agreement for RADS categorization (<italic>k</italic>=0.65). GPT-4 manifested slight agreement (<italic>k</italic>=0.19) for the overall rating and fair agreement for RADS categorization. Meanwhile, GPT-3.5 showed fair agreement (<italic>k</italic>=0.28) for the overall rating and moderate agreement (<italic>k</italic>=0.57) for RADS categorization.</p>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>The consistency of different chatbots and prompts among 6 runs.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="70"/>
            <col width="0"/>
            <col width="250"/>
            <col width="240"/>
            <col width="0"/>
            <col width="240"/>
            <col width="0"/>
            <col width="170"/>
            <thead>
              <tr valign="top">
                <td colspan="3">
                </td>
                <td>Prompt-0, Fleiss κ (95% CI)</td>
                <td colspan="2">Prompt-1, Fleiss κ (95% CI)</td>
                <td colspan="2">Prompt-2, Fleiss κ (95% CI)</td>
                <td>All, Fleiss κ (95% CI)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="9">
                  <bold>Patient-level RADS<sup>a</sup></bold>
                  <bold>categorization</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-3.5</td>
                <td colspan="2">0.57 (0.48-0.65)</td>
                <td>0.24 (0.15-0.32)</td>
                <td colspan="2">N/A<sup>b</sup></td>
                <td colspan="2">0.39 (0.33-0.46)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-4</td>
                <td colspan="2">0.33 (0.25-0.42)</td>
                <td>0.42 (0.34-0.5)</td>
                <td colspan="2">0.41 (0.33-0.5)</td>
                <td colspan="2">0.39 (0.34-0.44)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Claude-2</td>
                <td colspan="2">0.65 (0.56-0.74)</td>
                <td>0.61 (0.52-0.7)</td>
                <td colspan="2">0.74 (0.65-0.83)</td>
                <td colspan="2">0.69 (0.64-0.74)</td>
              </tr>
              <tr valign="top">
                <td colspan="9">
                  <bold>Overall rating</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-3.5</td>
                <td colspan="2">0.28 (0.19-0.37)</td>
                <td>0.14 (0.05-0.23)</td>
                <td colspan="2">N/A</td>
                <td colspan="2">0.21 (0.14-0.27)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-4</td>
                <td colspan="2">0.19 (0.1-0.28)</td>
                <td>0.38 (0.29-0.47)</td>
                <td colspan="2">0.46 (0.37-0.55)</td>
                <td colspan="2">0.39 (0.34-0.45)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Claude-2</td>
                <td colspan="2">0.49 (0.4-0.58)</td>
                <td>0.63 (0.53-0.72)</td>
                <td colspan="2">0.65 (0.56-0.75)</td>
                <td colspan="2">0.66 (0.61-0.72)</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table3fn1">
              <p><sup>a</sup>RADS: Reporting and Data Systems.</p>
            </fn>
            <fn id="table3fn2">
              <p><sup>b</sup>N/A: not applicable.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Subgroup Analysis</title>
        <p>Since the knowledge base for ChatGPT was frozen as of September 2021, accounting for the knowledge limitations of LLMs developed before the latest RADS guideline updates, we compared the responses of different RADS criteria. The total accurate responses across 6 runs were computed for all prompts. Both GPT-4 and Claude-2 demonstrated superior performance in the context of LI-RADS CT/MRI version 2018 as opposed to Lung-RADS version 2022 and O-RADS MRI (all <italic>P</italic>&lt;.05; <xref ref-type="table" rid="table4">Table 4</xref>). <xref rid="figure3" ref-type="fig">Figure 3</xref> delineates the performance of various chatbots across different prompts and RADS categories. For the overall rating (<xref rid="figure3" ref-type="fig">Figure 3</xref>A), Claude-2 exhibited a progressive trend of enhancement of overall rating accuracy from prompt-0 to prompt-1 to prompt-2, with 20.0% (12/60), 36.7% (22/60), and 75.0% (45/60) for LI-RADS; 11.7% (7/60), 18.3% (11/60), and 48.3% (29/60) for Lung-RADS; and 10.0% (6/60), 20.0% (12/60), and 41.7% (25/60) for O-RADS, respectively. Notably, with prompt-2, Claude-2 achieved the highest overall rating accuracy of 75% in older systems such as LI-RADS version 2018. Conversely, GPT-4 improved with prompt-1/2 over prompt-0, but prompt-2 did not exceed prompt-1. For the RADS categorization (<xref rid="figure3" ref-type="fig">Figure 3</xref>B), prompt-1 and prompt-2 outperformed prompt-0 for LI-RADS, irrespective of chatbots. However, for Lung-RADS and O-RADS, prompt-0 sometimes surpassed prompt-1.</p>
        <table-wrap position="float" id="table4">
          <label>Table 4</label>
          <caption>
            <p>The performance of chatbots within different RADS criteria<sup>a</sup>.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="170"/>
            <col width="0"/>
            <col width="80"/>
            <col width="0"/>
            <col width="230"/>
            <col width="0"/>
            <col width="140"/>
            <col width="230"/>
            <col width="0"/>
            <col width="120"/>
            <thead>
              <tr valign="bottom">
                <td colspan="3">Chatbots and RADS<sup>b</sup></td>
                <td colspan="2">Year of development</td>
                <td colspan="2">RADS categorization (correct/incorrect/unsure), n/n/n</td>
                <td><italic>P</italic> value</td>
                <td colspan="2">Overall rating (correct/incorrect), n/n</td>
                <td><italic>P</italic> value</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="11">
                  <bold>GPT-3.5</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>LI-RADS<sup>c</sup> CT<sup>d</sup>/MRI<sup>e</sup></td>
                <td colspan="2">2018</td>
                <td colspan="2">32/86/2</td>
                <td colspan="2">Reference</td>
                <td>22/98</td>
                <td colspan="2">Reference</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Lung-RADS<sup>f</sup></td>
                <td colspan="2">2022</td>
                <td colspan="2">38/78/4</td>
                <td colspan="2">.83</td>
                <td>14/106</td>
                <td colspan="2">.15</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>O-RADS<sup>g</sup> MRI</td>
                <td colspan="2">2022</td>
                <td colspan="2">35/84/1</td>
                <td colspan="2">.46</td>
                <td>24/96</td>
                <td colspan="2">.87</td>
              </tr>
              <tr valign="top">
                <td colspan="11">
                  <bold>GPT-4</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>LI-RADS CT/MRI</td>
                <td colspan="2">2018</td>
                <td colspan="2">104/74/2</td>
                <td colspan="2">Reference</td>
                <td>78/102</td>
                <td colspan="2">Reference</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Lung-RADS</td>
                <td colspan="2">2022</td>
                <td colspan="2">40/128/12</td>
                <td colspan="2">&lt;.001</td>
                <td>21/159</td>
                <td colspan="2">&lt;.001</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>O-RADS MRI</td>
                <td colspan="2">2022</td>
                <td colspan="2">67/110/3</td>
                <td colspan="2">&lt;.001</td>
                <td>40/140</td>
                <td colspan="2">&lt;.001</td>
              </tr>
              <tr valign="top">
                <td colspan="11">
                  <bold>Claude-2</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>LI-RADS CT/MRI</td>
                <td colspan="2">2018</td>
                <td colspan="2">93/86/1</td>
                <td colspan="2">Reference</td>
                <td>79/101</td>
                <td colspan="2">Reference</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Lung-RADS</td>
                <td colspan="2">2022</td>
                <td colspan="2">63/117/0</td>
                <td colspan="2">.001</td>
                <td>47/133</td>
                <td colspan="2">&lt;.001</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>O-RADS MRI</td>
                <td colspan="2">2022</td>
                <td colspan="2">113/67/0</td>
                <td colspan="2">.04</td>
                <td>43/137</td>
                <td colspan="2">&lt;.001</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table4fn1">
              <p><sup>a</sup>Data are aggregate numbers across 6 runs.</p>
            </fn>
            <fn id="table4fn2">
              <p><sup>b</sup>RADS: Reporting and Data Systems.</p>
            </fn>
            <fn id="table4fn3">
              <p><sup>c</sup>LI-RADS: Liver Imaging Reporting and Data System.</p>
            </fn>
            <fn id="table4fn4">
              <p><sup>d</sup>CT: computed tomography.</p>
            </fn>
            <fn id="table4fn5">
              <p><sup>e</sup>MRI: magnetic resonance imaging.</p>
            </fn>
            <fn id="table4fn6">
              <p><sup>f</sup>Lung-RADS: Lung CT Screening Reporting and Data System.</p>
            </fn>
            <fn id="table4fn7">
              <p><sup>g</sup>O-RADS: Ovarian-Adnexal Reporting and Data System.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>The performance of chatbots and prompts within different Reporting and Data Systems criteria. (A) Overall rating and (B) patient-level RADS categorization. LI-RADS: Liver Imaging Reporting and Data System; Lung-RADS: Lung CT (computed tomography) Screening Reporting and Data System; O-RADS: Ovarian-Adnexal Reporting and Data System.</p>
          </caption>
          <graphic xlink:href="medinform_v12i1e55799_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Analysis of Error Types</title>
        <p>A total of 1440 cases were analyzed for error types, with details provided in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>. The bar plot illustrating the distribution of errors across the 3 chatbots is shown in <xref rid="figure4" ref-type="fig">Figure 4</xref>. A typical example of factual extraction error (E1) occurred in response to the seventh Lung-RADS question. The statement “The 3mm solid nodule in the lateral basal segmental bronchus is subsegmental” is inaccurate, as the lateral basal segmental bronchus represents one of the 18 defined lung segments and not a subsegment [<xref ref-type="bibr" rid="ref22">22</xref>].</p>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>The number of error types for different chatbots. E1: Factual extraction error denotes the chatbots’ inability to paraphrase the radiological findings accurately, consequently misinterpreting the information. E2: Hallucination, encompassing the fabrication of nonexistent Reporting and Data Systems (RADS) categories (E2a) and RADS criteria (E2b). E3: Reasoning error, which includes the incapacity to logically interpret the imaging description (E3a) and the RADS category accurately (E3b). The subtype errors for reasoning imaging description include the inability to reason lesion signal (E3ai), lesion size (E3aii), and enhancement (E3aiii) accurately. E4: Explanatory error, encompassing inaccurate elucidation of RADS category meaning (E4a) and erroneous explanation of the recommended management and follow-up corresponding to the RADS category (E4b).</p>
          </caption>
          <graphic xlink:href="medinform_v12i1e55799_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>Hallucination of inappropriate RADS categories (E2a) occurred more frequently with prompt-0 across all 3 chatbots. However, this error rate decreased to zero for Claude-2 when using prompt-2, a trend not seen with GPT-3.5 or GPT-4. A recurrent E2a error in LI-RADS was the obsolete category LR-5V from the 2014 version, now superseded by LR-TIV in subsequent editions [<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref24">24</xref>]. Furthermore, hallucination of invalid RADS criteria (E2b) was more prevalent than that of E2a. For instance, the LI-RADS second question response stating “T2 marked hyperintensity is a feature commonly associated with hepatocellular carcinoma (HCC)” is inaccurate, as T2-marked hyperintensity is characteristic of hemangioma and not hepatocellular carcinoma. Despite initial higher E2b rates, Claude-2 demonstrated a substantial reduction with prompt-2 (105 to 38 instances), exceeding the decrement seen with GPT-4 (71 to 57 instances).</p>
        <p>Regarding reasoning error, incorrect RADS category reasoning (E3b) was the most frequent error but decreased for all chatbots with prompt-1 and prompt-2 versus prompt-0. Claude-2 reduced errors by almost half with prompt-2, while the GPT-4 decrease was less pronounced. Lesion signal interpretation errors (E3ai) included misinterpreting hypointensity on diffusion-weighted imaging as “restricted diffusion,” rather than facilitated diffusion. Lesion size reasoning errors (E3aii) occurred in 34 of 1440 cases, predominantly by Claude-2 (25/34, 73.5%), especially in systems such as Lung-RADS and LI-RADS where size is critical for categorization. Examples were attributing a 12-mm pulmonary nodule to the ≥6-mm but &lt;8-mm range, or assigning a hepatic lesion measuring 2.3 cm × 1.5 cm to the 10- to 19-mm category. Reasoning enhancement errors (E3aiii) were exclusive to Claude-2 in O-RADS, where enhancement significantly impacts categorization. Misclassifying images at 40 seconds postcontrast as early or delayed enhancement exemplifies this error.</p>
        <p>Explanatory errors (E4) including incorrect RADS category definitions (E4a) and inappropriate management recommendations (E4b) also substantially declined with prompt-1 and prompt-2. For instance, in the first Lung-RADS question response, the statement “The 4X designation indicates infectious/inflammatory etiology is suspected.” is incorrect. Lung-RADS 4X means category 3 or 4 nodules with additional features or imaging findings that increase suspicion of lung cancer [<xref ref-type="bibr" rid="ref18">18</xref>].</p>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>In this study, we evaluated the performance of 3 chatbots—GPT-3.5, GPT-4, and Claude-2—in categorizing radiological findings according to RADS criteria. Using 3 levels of prompts providing increasing structure, examples, and domain knowledge, the chatbots’ accuracies and consistencies were quantified across 30 cases. The best performance was achieved by Claude-2 when provided with few-shot prompting and the RADS criteria PDFs. Interestingly, the chatbots tended to categorize better for the relatively older LI-RADS version 2018 criteria in contrast to the more recent Lung-RADS version 2022 and O-RADS guidelines published after the chatbots’ training cutoff.</p>
        <p>The incorporation of RADS, which standardizes reporting in radiology, has been a significant advancement, although the multiplicity and complexity of these systems impose a steep learning curve for radiologists [<xref ref-type="bibr" rid="ref13">13</xref>]. Even for subspecialized radiologists at tertiary hospitals, mastering the numerous RADS guidelines poses challenges, requiring familiarity with the lexicons, regular application in daily practice, and ongoing learning to remain current with new versions. While previous studies have shown that LLMs could assist radiologists in various tasks [<xref ref-type="bibr" rid="ref2">2</xref>-<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref11">11</xref>], their performance at RADS categorization from imaging findings is untested. We therefore evaluated LLMs for focused RADS categorization of testing cases.</p>
        <p>Without prompt engineering (prompt-0), all chatbots performed poorly. However, accuracy improved for all chatbots when provided an exemplar prompt demonstrating the desired response structure (prompt-1). This underscores the use of prompt tuning for aligning LLMs to specific domains such as radiology. Further enriching prompt-1 with the RADS guideline PDFs as a relevant knowledge source (prompt-2) considerably enhanced Claude-2’s accuracy, a feat not mirrored by GPT-4. This discrepancy could stem from ChatGPT’s reliance on an external plug-in to access documents, while Claude-2’s architecture accommodates the direct assimilation of expansive texts, benefiting from its larger-context window and superior long document–processing capabilities.</p>
        <p>Notably, we discerned performance disparities across RADS criteria. When queried on older established guidelines such as LI-RADS version 2018 [<xref ref-type="bibr" rid="ref17">17</xref>], the chatbots demonstrated greater accuracy than when queried on more recent schemes such as Lung-RADS version 2022 and O-RADS [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref25">25</xref>]. Specifically, GPT-4 and Claude-2 had significantly higher total correct ratings for LI-RADS than for Lung-RADS and O-RADS (all <italic>P</italic>&lt;.05). This could be attributed to their extensive exposure to the voluminous data related to the mature LI-RADS during their pretraining phase. With prompt-2, Claude-2 achieved an overall rating accuracy of 75% (45/60) for LI-RADS categorization. The poorer performance on newer RADS criteria highlights the need for strategies to continually align LLMs with the most up-to-date knowledge.</p>
        <p>A deep dive into the error-type analysis revealed informative trends. Incorrect RADS category reasoning (E3b) constituted the most frequent error across chatbots, decreasing with prompt tuning. Targeted prompting also reduced critical errors such as hallucinations of RADS criteria (E2b) and categories (E2a), likely by constraining output to valid responses. During pretraining, GPT-like LLMs predict the next word in the unlabeled data set, risking learning fallacious relationships between RADS features. For instance, Lung-RADS version 2022 lacks categories 5 and 6 [<xref ref-type="bibr" rid="ref18">18</xref>], though some other RADS such as Breast Imaging Reporting and Data System include them [<xref ref-type="bibr" rid="ref26">26</xref>]. Using prompt-0, chatbots erroneously hallucinated Lung-RADS categories 5 and 6. Explanatory errors (E4) including inaccurate definition of the assigned RADS category (E4a) and inappropriate management recommendations (E4b) also substantially declined with prompt tuning. For instance, when queried on the novel O-RADS criteria with prompt-0, chatbots hallucinated follow-up recommendations from other RADS criteria and responded “O-RADS category 3 refers to an indeterminate adnexal mass and warrants short-interval follow-up.” Targeted prompting appears to mitigate these critical errors such as hallucination and incorrect reasoning. Careful prompt engineering is essential to properly shape LLM knowledge for radiology tasks.</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>There are also several limitations in this study. First, only the LI-RADS CT/MRI and O-RADS MRI were included, excluding LI-RADS ultrasound (US) and O-RADS US guidelines, which are often practiced in an independent US department [<xref ref-type="bibr" rid="ref27">27</xref>,<xref ref-type="bibr" rid="ref28">28</xref>]. Second, the chatbots’ performance was heavily dependent on prompt quality. We tested only 3 types of prompts, and further studies of prompt strategies are warranted to investigate the impact of more exhaustive prompt engineering on chatbots’ accuracy. Third, GPT-4-turbo was released on November 6, 2023, representing the latest GPT-4 model with improvements in instruction following, reproducible outputs, and more [<xref ref-type="bibr" rid="ref29">29</xref>]. Furthermore, its training data extend to April 2023 compared with September 2021 for the base GPT-4 model tested here. We are uncertain about this newest GPT-4-turbo model’s performance on the RADS categorization task. Evaluating GPT-4-turbo represents an important direction for future work. Fourth, our study focused on 3 of 9 RADS [<xref ref-type="bibr" rid="ref13">13</xref>], with only 10 cases for each RADS. Although our choice ensured a blend of old and new guidelines and tried to cover all the RADS scores as much as possible, extending evaluations to all the RADS guidelines and incorporating more radiology reports from real clinical scenarios could offer deeper insights into potential limitations. Nonetheless, this initial study highlights critical considerations of prompt design and knowledge calibration required for safely applying LLMs in radiology. Fifth, evaluating the performance of the LLMs in comparison with radiologists of varying expertise levels would be valuable for discerning their strengths and weaknesses in real-world applications. This comparative analysis will be undertaken in our forthcoming studies.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>When equipped with structured prompts and guideline PDFs, Claude-2 demonstrates potential in assigning RADS categories to radiology cases according to established criteria such as LI-RADS version 2018. However, the current generation of chatbots lags in accurately categorizing cases based on more recent RADS criteria. Our study highlights the potential of LLMs in streamlining radiological categorizations while also pinpointing the enhancements necessary for their dependable application in clinical practice for RADS categorization tasks.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>The characteristics of radiology reports for each of the Reporting and Data Systems (RADS) and the distribution of the number of the reports across the 3 RADS.</p>
        <media xlink:href="medinform_v12i1e55799_app1.docx" xlink:title="DOCX File, 107 KB"/>
      </supplementary-material>
      <supplementary-material id="app2">
        <label>Multimedia Appendix 2</label>
        <p>Representative radiology reports and prompts.</p>
        <media xlink:href="medinform_v12i1e55799_app2.docx" xlink:title="DOCX File, 18 KB"/>
      </supplementary-material>
      <supplementary-material id="app3">
        <label>Multimedia Appendix 3</label>
        <p>Links to prompts and guideline PDFs.</p>
        <media xlink:href="medinform_v12i1e55799_app3.docx" xlink:title="DOCX File, 12 KB"/>
      </supplementary-material>
      <supplementary-material id="app4">
        <label>Multimedia Appendix 4</label>
        <p>Links to prompt engineering results.</p>
        <media xlink:href="medinform_v12i1e55799_app4.docx" xlink:title="DOCX File, 11 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">CT</term>
          <def>
            <p>computed tomography</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">E</term>
          <def>
            <p>error</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">LI-RADS</term>
          <def>
            <p>Liver Imaging Reporting &amp; Data System</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">LLM</term>
          <def>
            <p>large language model</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">Lung-RADS</term>
          <def>
            <p>Lung CT Screening Reporting &amp; Data System</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">MRI</term>
          <def>
            <p>magnetic resonance imaging</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">O-RADS</term>
          <def>
            <p>Ovarian-Adnexal Reporting &amp; Data System</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">OR</term>
          <def>
            <p>odds ratio</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">RADS</term>
          <def>
            <p>Reporting and Data Systems</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">US</term>
          <def>
            <p>ultrasound</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This study has received funding from the National Natural Science Foundation of China (82371934 and 82001783) and Joint Fund of Henan Province Science and Technology R&amp;D Program (225200810062). The authors thank Chuanjian Lv, MD; Zejun Wen, MM; and Jianghua Lou, MM, for their help in drafting the radiology reports with regard to Lung CT Screening Reporting and Data System, Liver Imaging Reporting and Data System, and Ovarian-Adnexal Reporting and Data System, respectively.</p>
    </ack>
    <fn-group>
      <fn fn-type="con">
        <p>QW (Henan Provincial People’s Hospital &amp; People’s Hospital of Zhengzhou University), QW (Beijing United Imaging Research Institute of Intelligent Imaging), HL, Y Wang, YB, Y Wu, XY, and MW contributed to study design. QW (Henan Provincial People’s Hospital &amp; People’s Hospital of Zhengzhou University) and QW (Beijing United Imaging Research Institute of Intelligent Imaging) contributed to the statistical analysis. All authors contributed to the acquisition, analysis, or interpretation of the data; the drafting of the manuscript; and critical revision of the manuscript.</p>
      </fn>
      <fn fn-type="conflict">
        <p>QW and PD are senior engineers of Beijing United Imaging Research Institute of Intelligent Imaging and United Imaging Intelligence (Beijing) Co, Ltd. JX and DS are senior specialists of Shanghai United Imaging Intelligence Co, Ltd. The companies have no role in designing and performing the surveillance and analyzing and interpreting the data. All other authors report no conflicts of interest relevant to this article.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Li</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Kumar</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>JH</given-names>
            </name>
          </person-group>
          <article-title>How chatbots and large language model artificial intelligence systems will reshape modern medicine: fountain of creativity or Pandora's box?</article-title>
          <source>JAMA Intern Med</source>
          <year>2023</year>
          <month>06</month>
          <day>01</day>
          <volume>183</volume>
          <issue>6</issue>
          <fpage>596</fpage>
          <lpage>597</lpage>
          <pub-id pub-id-type="doi">10.1001/jamainternmed.2023.1835</pub-id>
          <pub-id pub-id-type="medline">37115531</pub-id>
          <pub-id pub-id-type="pii">2804310</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bhayana</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Krishna</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Bleakney</surname>
              <given-names>RR</given-names>
            </name>
          </person-group>
          <article-title>Performance of ChatGPT on a radiology board-style examination: insights into current strengths and limitations</article-title>
          <source>Radiology</source>
          <year>2023</year>
          <month>06</month>
          <volume>307</volume>
          <issue>5</issue>
          <fpage>e230582</fpage>
          <pub-id pub-id-type="doi">10.1148/radiol.230582</pub-id>
          <pub-id pub-id-type="medline">37191485</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rao</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kamineni</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Pang</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lie</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Dreyer</surname>
              <given-names>KJ</given-names>
            </name>
            <name name-style="western">
              <surname>Succi</surname>
              <given-names>MD</given-names>
            </name>
          </person-group>
          <article-title>Evaluating GPT as an adjunct for radiologic decision making: GPT-4 versus GPT-3.5 in a breast imaging pilot</article-title>
          <source>J Am Coll Radiol</source>
          <year>2023</year>
          <month>10</month>
          <volume>20</volume>
          <issue>10</issue>
          <fpage>990</fpage>
          <lpage>997</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37356806"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jacr.2023.05.003</pub-id>
          <pub-id pub-id-type="medline">37356806</pub-id>
          <pub-id pub-id-type="pii">S1546-1440(23)00394-0</pub-id>
          <pub-id pub-id-type="pmcid">PMC10733745</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ueda</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Mitsuyama</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Takita</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Horiuchi</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Walston</surname>
              <given-names>SL</given-names>
            </name>
            <name name-style="western">
              <surname>Tatekawa</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Miki</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT's diagnostic performance from patient history and imaging findings on the diagnosis please quizzes</article-title>
          <source>Radiology</source>
          <year>2023</year>
          <month>07</month>
          <volume>308</volume>
          <issue>1</issue>
          <fpage>e231040</fpage>
          <pub-id pub-id-type="doi">10.1148/radiol.231040</pub-id>
          <pub-id pub-id-type="medline">37462501</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kottlors</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Bratke</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Rauen</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Kabbasch</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Persigehl</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Schlamann</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lennartz</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Feasibility of differential diagnosis based on imaging patterns using a large language model</article-title>
          <source>Radiology</source>
          <year>2023</year>
          <month>07</month>
          <volume>308</volume>
          <issue>1</issue>
          <fpage>e231167</fpage>
          <pub-id pub-id-type="doi">10.1148/radiol.231167</pub-id>
          <pub-id pub-id-type="medline">37404149</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gertz</surname>
              <given-names>RJ</given-names>
            </name>
            <name name-style="western">
              <surname>Bunck</surname>
              <given-names>AC</given-names>
            </name>
            <name name-style="western">
              <surname>Lennartz</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Dratsch</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Iuga</surname>
              <given-names>AI</given-names>
            </name>
            <name name-style="western">
              <surname>Maintz</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Kottlors</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>GPT-4 for automated determination of radiological study and protocol based on radiology request forms: a feasibility study</article-title>
          <source>Radiology</source>
          <year>2023</year>
          <volume>307</volume>
          <issue>5</issue>
          <fpage>e230877</fpage>
          <pub-id pub-id-type="doi">10.1148/radiol.230877</pub-id>
          <pub-id pub-id-type="medline">37310247</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Adams</surname>
              <given-names>LC</given-names>
            </name>
            <name name-style="western">
              <surname>Truhn</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Busch</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Kader</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Niehues</surname>
              <given-names>SM</given-names>
            </name>
            <name name-style="western">
              <surname>Makowski</surname>
              <given-names>MR</given-names>
            </name>
            <name name-style="western">
              <surname>Bressem</surname>
              <given-names>KK</given-names>
            </name>
          </person-group>
          <article-title>Leveraging GPT-4 for post hoc transformation of free-text radiology reports into structured reporting: a multilingual feasibility study</article-title>
          <source>Radiology</source>
          <year>2023</year>
          <month>05</month>
          <volume>307</volume>
          <issue>4</issue>
          <fpage>e230725</fpage>
          <pub-id pub-id-type="doi">10.1148/radiol.230725</pub-id>
          <pub-id pub-id-type="medline">37014240</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wagner</surname>
              <given-names>MW</given-names>
            </name>
            <name name-style="western">
              <surname>Ertl-Wagner</surname>
              <given-names>BB</given-names>
            </name>
          </person-group>
          <article-title>Accuracy of information and references using ChatGPT-3 for retrieval of clinical radiological information</article-title>
          <source>Can Assoc Radiol J</source>
          <year>2024</year>
          <month>02</month>
          <volume>75</volume>
          <issue>1</issue>
          <fpage>69</fpage>
          <lpage>73</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://journals.sagepub.com/doi/abs/10.1177/08465371231171125?url_ver=Z39.88-2003&amp;rfr_id=ori:rid:crossref.org&amp;rfr_dat=cr_pub%20%200pubmed"/>
          </comment>
          <pub-id pub-id-type="doi">10.1177/08465371231171125</pub-id>
          <pub-id pub-id-type="medline">37078489</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ziegelmayer</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Marka</surname>
              <given-names>AW</given-names>
            </name>
            <name name-style="western">
              <surname>Lenhart</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Nehls</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Reischl</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Harder</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Sauter</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Makowski</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Graf</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Gawlitza</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Evaluation of GPT-4's chest x-ray impression generation: a reader study on performance and perception</article-title>
          <source>J Med Internet Res</source>
          <year>2023</year>
          <month>12</month>
          <day>22</day>
          <volume>25</volume>
          <fpage>e50865</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2023/1/e50865/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/50865</pub-id>
          <pub-id pub-id-type="medline">38133918</pub-id>
          <pub-id pub-id-type="pii">v25i1e50865</pub-id>
          <pub-id pub-id-type="pmcid">PMC10770784</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bhayana</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Bleakney</surname>
              <given-names>RR</given-names>
            </name>
            <name name-style="western">
              <surname>Krishna</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>GPT-4 in radiology: improvements in advanced reasoning</article-title>
          <source>Radiology</source>
          <year>2023</year>
          <month>06</month>
          <volume>307</volume>
          <issue>5</issue>
          <fpage>e230987</fpage>
          <pub-id pub-id-type="doi">10.1148/radiol.230987</pub-id>
          <pub-id pub-id-type="medline">37191491</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Utility of ChatGPT in clinical practice</article-title>
          <source>J Med Internet Res</source>
          <year>2023</year>
          <month>06</month>
          <day>28</day>
          <volume>25</volume>
          <fpage>e48568</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2023/1/e48568/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/48568</pub-id>
          <pub-id pub-id-type="medline">37379067</pub-id>
          <pub-id pub-id-type="pii">v25i1e48568</pub-id>
          <pub-id pub-id-type="pmcid">PMC10365580</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rau</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Rau</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Zoeller</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Fink</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Tran</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Wilpert</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Nattenmueller</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Neubauer</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Bamberg</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Reisert</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Russe</surname>
              <given-names>MF</given-names>
            </name>
          </person-group>
          <article-title>A context-based chatbot surpasses trained radiologists and generic ChatGPT in following the ACR appropriateness guidelines</article-title>
          <source>Radiology</source>
          <year>2023</year>
          <month>07</month>
          <volume>308</volume>
          <issue>1</issue>
          <fpage>e230970</fpage>
          <pub-id pub-id-type="doi">10.1148/radiol.230970</pub-id>
          <pub-id pub-id-type="medline">37489981</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <collab>American College of Radiology</collab>
          </person-group>
          <source>Reporting and Data Systems (RADS)</source>
          <access-date>2023-08-26</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.acr.org/Clinical-Resources/Reporting-and-Data-Systems">https://www.acr.org/Clinical-Resources/Reporting-and-Data-Systems</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Meskó</surname>
              <given-names>Bertalan</given-names>
            </name>
          </person-group>
          <article-title>Prompt engineering as an important emerging skill for medical professionals: tutorial</article-title>
          <source>J Med Internet Res</source>
          <year>2023</year>
          <month>10</month>
          <day>04</day>
          <volume>25</volume>
          <fpage>e50638</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2023/1/e50638/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/50638</pub-id>
          <pub-id pub-id-type="medline">37792434</pub-id>
          <pub-id pub-id-type="pii">v25i1e50638</pub-id>
          <pub-id pub-id-type="pmcid">PMC10585440</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="web">
          <source>OpenAI</source>
          <access-date>2023-11-08</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://openai.com">https://openai.com</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <collab>Anthropic</collab>
          </person-group>
          <source>Claude 2</source>
          <access-date>2023-11-08</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.anthropic.com/index/claude-2">https://www.anthropic.com/index/claude-2</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chernyak</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Fowler</surname>
              <given-names>KJ</given-names>
            </name>
            <name name-style="western">
              <surname>Kamaya</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Kielar</surname>
              <given-names>AZ</given-names>
            </name>
            <name name-style="western">
              <surname>Elsayes</surname>
              <given-names>KM</given-names>
            </name>
            <name name-style="western">
              <surname>Bashir</surname>
              <given-names>MR</given-names>
            </name>
            <name name-style="western">
              <surname>Kono</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Do</surname>
              <given-names>RK</given-names>
            </name>
            <name name-style="western">
              <surname>Mitchell</surname>
              <given-names>DG</given-names>
            </name>
            <name name-style="western">
              <surname>Singal</surname>
              <given-names>AG</given-names>
            </name>
            <name name-style="western">
              <surname>Tang</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sirlin</surname>
              <given-names>CB</given-names>
            </name>
          </person-group>
          <article-title>Liver Imaging Reporting and Data System (LI-RADS) Version 2018: imaging of hepatocellular carcinoma in at-risk patients</article-title>
          <source>Radiology</source>
          <year>2018</year>
          <month>12</month>
          <volume>289</volume>
          <issue>3</issue>
          <fpage>816</fpage>
          <lpage>830</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/30251931"/>
          </comment>
          <pub-id pub-id-type="doi">10.1148/radiol.2018181494</pub-id>
          <pub-id pub-id-type="medline">30251931</pub-id>
          <pub-id pub-id-type="pmcid">PMC6677371</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Martin</surname>
              <given-names>MD</given-names>
            </name>
            <name name-style="western">
              <surname>Kanne</surname>
              <given-names>JP</given-names>
            </name>
            <name name-style="western">
              <surname>Broderick</surname>
              <given-names>LS</given-names>
            </name>
            <name name-style="western">
              <surname>Kazerooni</surname>
              <given-names>EA</given-names>
            </name>
            <name name-style="western">
              <surname>Meyer</surname>
              <given-names>CA</given-names>
            </name>
          </person-group>
          <article-title>Update: Lung-RADS 2022</article-title>
          <source>Radiographics</source>
          <year>2023</year>
          <month>11</month>
          <volume>43</volume>
          <issue>11</issue>
          <fpage>e230037</fpage>
          <pub-id pub-id-type="doi">10.1148/rg.230037</pub-id>
          <pub-id pub-id-type="medline">37856315</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sadowski</surname>
              <given-names>EA</given-names>
            </name>
            <name name-style="western">
              <surname>Thomassin-Naggara</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Rockall</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Maturen</surname>
              <given-names>KE</given-names>
            </name>
            <name name-style="western">
              <surname>Forstner</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Jha</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Nougaret</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Siegelman</surname>
              <given-names>ES</given-names>
            </name>
            <name name-style="western">
              <surname>Reinhold</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>O-RADS MRI risk stratification system: guide for assessing adnexal lesions from the ACR O-RADS Committee</article-title>
          <source>Radiology</source>
          <year>2022</year>
          <month>04</month>
          <volume>303</volume>
          <issue>1</issue>
          <fpage>35</fpage>
          <lpage>47</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/35040672"/>
          </comment>
          <pub-id pub-id-type="doi">10.1148/radiol.204371</pub-id>
          <pub-id pub-id-type="medline">35040672</pub-id>
          <pub-id pub-id-type="pmcid">PMC8962917</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="web">
          <source>AskYourPDF</source>
          <access-date>2023-11-08</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://askyourpdf.com">https://askyourpdf.com</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="web">
          <source>ChatGPT plugins</source>
          <access-date>2023-11-08</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://openai.com/blog/chatgpt-plugins">https://openai.com/blog/chatgpt-plugins</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jones</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Rasuli</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Vadera</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <source>Bronchopulmonary segmental anatomy</source>
          <access-date>2023-11-08</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.53347/rID-13644">https://doi.org/10.53347/rID-13644</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mitchell</surname>
              <given-names>DG</given-names>
            </name>
            <name name-style="western">
              <surname>Bruix</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Sherman</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Sirlin</surname>
              <given-names>CB</given-names>
            </name>
          </person-group>
          <article-title>LI-RADS (Liver Imaging Reporting and Data System): summary, discussion, and consensus of the LI-RADS Management Working Group and future directions</article-title>
          <source>Hepatology</source>
          <year>2015</year>
          <month>03</month>
          <volume>61</volume>
          <issue>3</issue>
          <fpage>1056</fpage>
          <lpage>1065</lpage>
          <pub-id pub-id-type="doi">10.1002/hep.27304</pub-id>
          <pub-id pub-id-type="medline">25041904</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Elsayes</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Hooker</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Agrons</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Kielar</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Tang</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Fowler</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Chernyak</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Bashir</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Kono</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Do</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Mitchell</surname>
              <given-names>DG</given-names>
            </name>
            <name name-style="western">
              <surname>Kamaya</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Hecht</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Sirlin</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>2017 Version of LI-RADS for CT and MR imaging: an update</article-title>
          <source>Radiographics</source>
          <year>2017</year>
          <volume>37</volume>
          <issue>7</issue>
          <fpage>1994</fpage>
          <lpage>2017</lpage>
          <pub-id pub-id-type="doi">10.1148/rg.2017170098</pub-id>
          <pub-id pub-id-type="medline">29131761</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Suarez-Weiss</surname>
              <given-names>KE</given-names>
            </name>
            <name name-style="western">
              <surname>Sadowski</surname>
              <given-names>EA</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Burk</surname>
              <given-names>KS</given-names>
            </name>
            <name name-style="western">
              <surname>Tran</surname>
              <given-names>VT</given-names>
            </name>
            <name name-style="western">
              <surname>Shinagare</surname>
              <given-names>AB</given-names>
            </name>
          </person-group>
          <article-title>Practical tips for reporting adnexal lesions using O-RADS MRI</article-title>
          <source>Radiographics</source>
          <year>2023</year>
          <volume>43</volume>
          <issue>7</issue>
          <fpage>e220142</fpage>
          <pub-id pub-id-type="doi">10.1148/rg.220142</pub-id>
          <pub-id pub-id-type="medline">37319025</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <collab>American College of Radiology</collab>
          </person-group>
          <source>Breast Imaging Reporting &amp; Data System</source>
          <access-date>2023-11-03</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.acr.org/Clinical-Resources/Reporting-and-Data-Systems/Bi-Rads">https://www.acr.org/Clinical-Resources/Reporting-and-Data-Systems/Bi-Rads</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Strachowski</surname>
              <given-names>LM</given-names>
            </name>
            <name name-style="western">
              <surname>Jha</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Phillips</surname>
              <given-names>CH</given-names>
            </name>
            <name name-style="western">
              <surname>Blanchette Porter</surname>
              <given-names>MM</given-names>
            </name>
            <name name-style="western">
              <surname>Froyman</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Glanc</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Guo</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Patel</surname>
              <given-names>MD</given-names>
            </name>
            <name name-style="western">
              <surname>Reinhold</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Suh-Burgmann</surname>
              <given-names>EJ</given-names>
            </name>
            <name name-style="western">
              <surname>Timmerman</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Andreotti</surname>
              <given-names>RF</given-names>
            </name>
          </person-group>
          <article-title>O-RADS US v2022: an update from the American College of Radiology's Ovarian-Adnexal Reporting and Data System US Committee</article-title>
          <source>Radiology</source>
          <year>2023</year>
          <volume>308</volume>
          <issue>3</issue>
          <fpage>e230685</fpage>
          <pub-id pub-id-type="doi">10.1148/radiol.230685</pub-id>
          <pub-id pub-id-type="medline">37698472</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Quaia</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>State of the art: LI-RADS for contrast-enhanced US</article-title>
          <source>Radiology</source>
          <year>2019</year>
          <volume>293</volume>
          <issue>1</issue>
          <fpage>4</fpage>
          <lpage>14</lpage>
          <pub-id pub-id-type="doi">10.1148/radiol.2019190005</pub-id>
          <pub-id pub-id-type="medline">31453768</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="web">
          <source>New models and developer products announced at DevDay</source>
          <access-date>2023-11-09</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://openai.com/blog/new-models-and-developer-products-announced-at-devday">https://openai.com/blog/new-models-and-developer-products-announced-at-devday</ext-link>
          </comment>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
