<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMI</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id>
      <journal-title>JMIR Medical Informatics</journal-title>
      <issn pub-type="epub">2291-9694</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v14i1e68182</article-id>
      <article-id pub-id-type="pmid">41628437</article-id>
      <article-id pub-id-type="doi">10.2196/68182</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Evaluation of Large Language Models for Radiologists’ Support in Multidisciplinary Breast Cancer Teams: Comparative Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Coristine</surname>
            <given-names>Andrew</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Matsuda</surname>
            <given-names>Shinichi</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Wreyford</surname>
            <given-names>Leon</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Jiang</surname>
            <given-names>Hong</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0005-6550-9718</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Yang</surname>
            <given-names>Chun</given-names>
          </name>
          <degrees>BM</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0005-0636-9444</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Zhou</surname>
            <given-names>Wenbin</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-2127-1091</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Yin</surname>
            <given-names>Cheng-liang</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff5" ref-type="aff">5</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-8262-5749</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Zhou</surname>
            <given-names>Shan</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff6" ref-type="aff">6</xref>
          <xref rid="aff7" ref-type="aff">7</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-3605-7239</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author">
          <name name-style="western">
            <surname>He</surname>
            <given-names>Rui</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff8" ref-type="aff">8</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-6999-1920</ext-link>
        </contrib>
        <contrib id="contrib7" contrib-type="author">
          <name name-style="western">
            <surname>Ran</surname>
            <given-names>Guanghui</given-names>
          </name>
          <degrees>BM</degrees>
          <xref rid="aff9" ref-type="aff">9</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0001-0783-621X</ext-link>
        </contrib>
        <contrib id="contrib8" contrib-type="author">
          <name name-style="western">
            <surname>Wang</surname>
            <given-names>Wujie</given-names>
          </name>
          <degrees>BM</degrees>
          <xref rid="aff9" ref-type="aff">9</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0003-1320-7086</ext-link>
        </contrib>
        <contrib id="contrib9" contrib-type="author">
          <name name-style="western">
            <surname>Wu</surname>
            <given-names>Meixian</given-names>
          </name>
          <degrees>MS</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0003-3607-5039</ext-link>
        </contrib>
        <contrib id="contrib10" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Yu</surname>
            <given-names>Juan</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff9" ref-type="aff">9</xref>
          <address>
            <institution>Department of Radiology</institution>
            <institution>The First Affiliated Hospital of Shenzhen University</institution>
            <addr-line>Department of Radiology, Shenzhen Second People's Hospital, The First Affiliated Hospital of Shenzhen University, Sungang Road 3002</addr-line>
            <addr-line>Shenzhen, 518003</addr-line>
            <country>China</country>
            <phone>86 18576685849</phone>
            <email>yujuan0072@qq.com</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-7775-5909</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Faculty of Medicine</institution>
        <institution>Macau University of Science and Technology</institution>
        <addr-line>Macao</addr-line>
        <country>China</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Department of Statistics</institution>
        <institution>Zhuhai Clinical Medical College of Jinan University</institution>
        <addr-line>Zhuhai</addr-line>
        <country>China</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Qiandongnan Prefecture Hospital of Traditional Chinese Medicine</institution>
        <addr-line>Kai Li</addr-line>
        <country>China</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>Guangdong Provincial Key Laboratory of Tumor Interventional Diagnosis and Treatment</institution>
        <addr-line>Zhuhai</addr-line>
        <country>China</country>
      </aff>
      <aff id="aff5">
        <label>5</label>
        <institution>Department of Medical Innovation Research</institution>
        <institution>Chinese PLA General Hospital</institution>
        <addr-line>Beijing</addr-line>
        <country>China</country>
      </aff>
      <aff id="aff6">
        <label>6</label>
        <institution>Cancer Virology Program</institution>
        <institution>UPMC Hillman Cancer Center</institution>
        <institution>University of Pittsburgh School of Medicine</institution>
        <addr-line>Pittsburgh, PA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff7">
        <label>7</label>
        <institution>Department of Microbiology and Molecular Genetics</institution>
        <institution>University of Pittsburgh School of Medicine</institution>
        <addr-line>Pittsburgh, PA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff8">
        <label>8</label>
        <institution>Grammar and Cognition Lab (GraC)</institution>
        <institution>Department of Translation &#38; Language Sciences</institution>
        <institution>Universitat Pompeu Fabra</institution>
        <addr-line>Barcelona</addr-line>
        <country>Spain</country>
      </aff>
      <aff id="aff9">
        <label>9</label>
        <institution>Department of Radiology</institution>
        <institution>The First Affiliated Hospital of Shenzhen University</institution>
        <addr-line>Shenzhen</addr-line>
        <country>China</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Juan Yu <email>yujuan0072@qq.com</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2026</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>2</day>
        <month>2</month>
        <year>2026</year>
      </pub-date>
      <volume>14</volume>
      <elocation-id>e68182</elocation-id>
      <history>
        <date date-type="received">
          <day>16</day>
          <month>11</month>
          <year>2024</year>
        </date>
        <date date-type="rev-request">
          <day>11</day>
          <month>2</month>
          <year>2025</year>
        </date>
        <date date-type="rev-recd">
          <day>2</day>
          <month>12</month>
          <year>2025</year>
        </date>
        <date date-type="accepted">
          <day>2</day>
          <month>12</month>
          <year>2025</year>
        </date>
      </history>
      <copyright-statement>©Hong Jiang, Chun Yang, Wenbin Zhou, Cheng-liang Yin, Shan Zhou, Rui He, Guanghui Ran, Wujie Wang, Meixian Wu, Juan Yu. Originally published in JMIR Medical Informatics (https://medinform.jmir.org), 02.02.2026.</copyright-statement>
      <copyright-year>2026</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on https://medinform.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://medinform.jmir.org/2026/1/e68182" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Artificial intelligence tools, particularly large language models (LLMs), have shown considerable potential across various domains. However, their performance in the diagnosis and treatment of breast cancer remains unknown.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study aimed to evaluate the performance of LLMs in supporting radiologists within multidisciplinary breast cancer teams, with a focus on their roles in facilitating informed clinical decisions and enhancing patient care.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>A set of 50 questions covering radiological and breast cancer guidelines was developed to assess breast cancer. These questions were posed to 9 popular LLMs and clinical physicians, with the expectation of receiving direct “Yes” or “No” answers along with supporting analysis. The performances of the 9 models, including ChatGPT-4.0, ChatGPT-4o, ChatGPT-4o mini, Claude 3 Opus, Claude 3.5 Sonnet, Gemini 1.5 Pro, Tongyi Qianwen 2.5, ChatGLM, and Ernie Bot 3.5, were evaluated against that of radiologists with varying experience levels (resident physicians, fellow physicians, and attending physicians). Responses were assessed for accuracy, confidence, and consistency based on alignment with the 2024 National Comprehensive Cancer Network Breast Cancer Guidelines and the 2013 American College of Radiology Breast Imaging-Reporting and Data System recommendations.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>Claude 3 Opus and ChatGPT-4 achieved the highest confidence scores of 2.78 and 2.74, respectively, while ChatGPT-4o led in accuracy with a score of 2.92. In terms of response consistency, Claude 3 Opus and Claude 3.5 Sonnet led the pack with scores of 3.0, closely followed by ChatGPT-4o, Gemini 1.5 Pro, and ChatGPT-4o mini, all recording impressive scores exceeding 2.9. ChatGPT-4o mini excelled in clinical diagnostics with a top score of 3.0 among all LLMs, and this score was also higher than all physician groups; however, no statistically significant differences were observed between it and any physician group (all <italic>P</italic>&#62;.05). ChatGPT-4 also had a higher score than the physician groups but showed comparable statistical performance to them (<italic>P</italic>&#62;.05). Across radiological diagnostics, clinical diagnosis, and overall performance, ChatGPT-4o mini and the Claude models achieved higher mean scores than all physician groups. However, these differences were statistically significant only when compared to fellow physicians (<italic>P</italic>&#60;.05). However, ChatGLM and Ernie Bot 3.5 underperformed across diagnostic areas, with lower scores than all physician groups but no statistically significant differences (all <italic>P</italic>&#62;.05). Among physician groups, attending physicians and resident physicians exhibited comparable high scores in radiological diagnostic performance, whereas fellow physicians scored somewhat lower, though the difference was not statistically significant (<italic>P</italic>&#62;.05).</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>LLMs such as ChatGPT-4o and Claude 3 Opus showed potential in supporting multidisciplinary teams for breast cancer diagnostics and therapy. However, they cannot fully replicate the intricate decision-making processes honed through clinical experience, particularly in complex cases. This highlights the need for ongoing artificial intelligence refinement to ensure robust clinical applicability.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>breast cancer</kwd>
        <kwd>large language models</kwd>
        <kwd>LLMs</kwd>
        <kwd>radiology assistance</kwd>
        <kwd>clinical decision-making</kwd>
        <kwd>ACR BI-RADS</kwd>
        <kwd>NCCN guidelines</kwd>
        <kwd>radiologist</kwd>
        <kwd>National Comprehensive Cancer Network</kwd>
        <kwd>American College of Radiology Breast Imaging-Reporting and Data System</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>Breast cancer is one of the most common malignancies in women worldwide, with over 2 million diagnoses each year, and it remains the leading cause of cancer-related deaths in women [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. According to a World Health Organization departmental news report in 2021, breast cancer mortality and incidence rates are higher in low- and middle-income countries than in high-income nations, largely due to disparities in early detection and treatment. However, the biological characteristics of breast cancer vary significantly, including imaging features, pathological traits, and lymph node assessment, making early screening and personalized treatment challenging yet crucial.</p>
      <p>A mature early screening system relies on well-established professional teams and standardized, specialized guidelines. Although there have been recent advancements, developing countries still face significant challenges in breast cancer care in terms of multidisciplinary coordination among specialist teams (surgeons, oncologists, pathologists, and radiologists), well-balanced medical resources, and adequately trained physicians [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>]. Furthermore, many physicians rely on empiricism and practice medicine rather than evidence-based guidelines [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref6">6</xref>]. Therefore, there is an urgent need for effective artificial intelligence (AI) tools to support health care professionals in resource-limited settings, improve diagnostic accuracy, and enhance patient outcomes [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref8">8</xref>].</p>
      <p>In recent years, several prominent large language models (LLMs) have emerged, including ChatGPT (OpenAI), Gemini Advanced (Google LLC), Claude (Anthropic), Tongyi Qianwen (Alibaba Corporation), ChatGLM (Zhipu AI), and Ernie Bot (Baidu). These models, trained on extensive datasets, offer significant potential in health care by generating complex text through deep learning. ChatGPT, for instance, gained rapid popularity, attracting over 1 million users within days of its release (knowledge base updated as of October 2023). Studies have shown its potential in medicine, performing well on the United States Medical Licensing Examination and the Board of Radiology-style examination [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref10">10</xref>]. ChatGPT can help radiologists simplify cumbersome diagnostic imaging descriptions [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref12">12</xref>]. Notably, the recommendations provided by ChatGPT for the clinical management of early-diagnosed breast cancer closely align with decisions made by multidisciplinary teams (MDTs) [<xref ref-type="bibr" rid="ref13">13</xref>]. ChatGPT has indicated a level of congruence with the National Comprehensive Cancer Network (NCCN) guidelines in identifying a wide range of therapeutic agents for the treatment of advanced metastases or advanced primary tumors [<xref ref-type="bibr" rid="ref14">14</xref>]. However, recent studies highlighted significant limitations in ChatGPT-4.0, revealing a “curse reversal,” where a model trained on “A is B” may struggle to infer “B is A” [<xref ref-type="bibr" rid="ref15">15</xref>].</p>
      <p>The rapid advancement of AI has ushered in a new era of innovation, with the emergence and continuous refinement of various LLMs profoundly transforming the health care sector. This study evaluates the performance of various LLMs in addressing breast cancer guideline-related questions, comparing them with radiologists of varying expertise levels to assess their potential in enhancing radiologists’ diagnostic capabilities.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Study Design</title>
        <p>The study aimed to evaluate the performance of 9 LLMs in breast cancer imaging diagnosis and to compare their performance with that of breast imaging specialists. We developed a set of 50 questions focused on breast cancer diagnosis and treatment based on the 2013 version of the American College of Radiology Breast Imaging-Reporting and Data System (ACR BI-RADS) classification and the 2024 NCCN Breast Cancer Guidelines. The first 24 questions focused on radiological diagnostics, while the remaining 26 questions pertained to clinical diagnosis and treatment. Questions were then presented to breast radiologists and the 9 LLMs for diagnostic responses. The accuracy, confidence, and consistency of each model in responding to the diagnostic queries were assessed. Furthermore, we conducted a comparative analysis of each model’s performance against radiologists possessing varying levels of clinical experience. The study design is provided in <xref rid="figure1" ref-type="fig">Figure 1</xref>.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Graph in the full-text abstract: the study design, including question categorization, experiment, and evaluation workflow. ACR BI-RADS: American College of Radiology Breast Imaging-Reporting and Data System; LLM: Large Language Model; NCCN: National Comprehensive Cancer Network.</p>
          </caption>
          <graphic xlink:href="medinform_v14i1e68182_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Participants and Models</title>
        <p>The 9 LLMs included ChatGPT-4.0, ChatGPT-4o, ChatGPT-4o mini, Claude 3 Opus, Claude 3.5 Sonnet, Gemini 1.5 Pro, Tongyi Qianwen 2.5, ChatGLM, and Ernie Bot 3.5. In addition, the study recruited a total of 9 board-certified radiologists, stratified into 3 distinct experience levels: (1) resident physicians (n=3; 1-5 years of postlicensure experience), undergoing subspecialty training to develop foundational clinical expertise; (2) fellow physicians (n=3; 5-10 years of experience), having completed residency and pursuing advanced fellowship training (1-3 years) to specialize in breast imaging diagnostics; and (3) attending physicians (n=3; &#62;10 years of experience), senior specialists with independent diagnostic authority, responsible for clinical decision-making and trainee supervision. This hierarchical cohort enabled comparative benchmarking of LLM performance against progressive stages of radiologist expertise. Each physician received the same set of 50 questions.</p>
      </sec>
      <sec>
        <title>Questions Design</title>
        <p>The set of 50 questions was developed based on guidelines extracted from the 2024 NCCN Breast Cancer Guidelines and the 2013 ACR BI-RADS diagnostic criteria, which are widely regarded as the gold standards in clinical breast cancer diagnosis. These criteria are commonly adhered to by physicians in clinical breast cancer diagnosis. In this study, we used these same standards to evaluate the responses provided by the LLMs and physicians to ensure validity and fair comparison. The questions covered a range of common clinical diagnostic decision-making scenarios, providing radiological evidence, such as mammography, ultrasound, and magnetic resonance imaging, as well as clinical information indicators such as medical history, physical examination findings, and laboratory test results, across preoperative, intraoperative, and postoperative stages.</p>
      </sec>
      <sec>
        <title>Procedure</title>
        <p>Nine physicians were divided into 3 groups based on their experience levels, with each physician answering all 50 questions once. They were prohibited from consulting guidelines or reference materials and were required to complete the questions within 30 minutes. Each physician independently completed the assessment to minimize recall bias. The study aimed to simulate real-world conditions by controlling time and eliminating external aids.</p>
        <p>All LLMs were queried via the Google application programming interface on a single date (August 21, 2024) without any additional context (such as task background or medical knowledge priming) and generated responses accordingly. Each LLM independently answered every question 3 times. Standard responses were defined as “yes” or “no.” A response was classified as “yes” if it demonstrated a strong affirmative inclination, supported by compelling theoretical evidence and logical reasoning. Conversely, a response was classified as “no” if it showed a clear negative inclination, backed by equally robust theoretical evidence and sound logic. If an LLM produced an ambiguous response, lacked sufficient evidence, or contained flawed reasoning, it was prompted to reanswer. If the reanswered response remained unsatisfactory, 2 senior physicians with over 10 years of experience were recruited to classify the response based on their professional judgment, thereby evaluating the AI model’s accuracy. In cases of disagreement, the evaluators reevaluated the response based on the 2024 NCCN Guidelines and the 2013 ACR BI-RADS criteria to reach a consensus through discussion.</p>
        <p>All LLM-generated responses were anonymized, randomized, and then assessed by evaluators who were blinded to the source (LLM identity) of each response. The accuracy of each response was assessed against the diagnostic NCCN guidelines. Responses consistent with the guidelines were assigned a score of 1, while inconsistent responses received a score of 0. The accuracy score for each question was calculated by summing the scores of the 3 responses. The overall accuracy performance of both the LLMs and physician groups was determined by averaging the accuracy scores across the 50 questions, resulting in a score ranging from 0 to 3. Subsequently, the accuracy performance of the LLMs was compared to that of the physician groups through rigorous statistical analysis.</p>
      </sec>
      <sec>
        <title>Confidence and Consistency</title>
        <p>Confidence refers to the clarity of responses, categorized as either “confident” or “nonconfident.” A confident answer provides an explicit “yes” or “no” to questions, while a nonconfident answer lacks a direct “yes or no” statement, despite potentially containing detailed explanations, requiring additional prompting to elicit a definitive response. Confident answers are scored 1, while nonconfident answers are scored 0. The confidence score for each question is calculated by summing the scores from 3 independent response attempts. Each LLM’s overall confidence performance is assessed by averaging these scores across 50 questions, resulting in a score ranging from 0 to 3. The confidence ratio represents each LLM’s average confidence score divided by the maximum possible score (ie, 3).</p>
        <p>Consistency evaluates whether an LLM provides uniform responses across 3 independent attempts to answer the same question. If all 3 answers are identical, the responses are considered consistent, regardless of their alignment with guideline criteria. For consistency assessment, an LLM receives 3 points if all 3 answers to a question are identical, and 2 points if one answer differs from the others. Each LLM’s consistency performance is evaluated by averaging these scores across 50 questions, with possible scores ranging from 2 to 3. The consistency ratio is defined as each LLM’s average consistency score divided by the maximum possible score (ie, 3).</p>
      </sec>
      <sec>
        <title>Statistical Analysis</title>
        <p>R software (version 4.4.2; R Foundation for Statistical Computing) was used for data management and statistical analysis. Continuous variables were described using means, and specific group instances were expressed as absolute values. The Wilcoxon rank-sum test was used for score comparisons between 2 different groups. We opted not to perform a Kruskal-Wallis test (for overall multigroup comparison) before pairwise analyses because our research focus was on hypothesis-driven, targeted comparisons between 2 specific groups: each LLM vs each physician subgroup (attendings, fellows, and residents). This design directly aligned with our aim—to evaluate whether single LLMs differ in performance from radiologists at distinct experience levels—rather than testing for overall differences across all groups. Differences between groups were considered significant at <italic>P</italic>&#60;.05. <italic>P</italic> value correction for multiple comparisons was applied using the Benjamini-Hochberg method. Adjusted <italic>P</italic> values are provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, <xref ref-type="table" rid="table2">Tables 2</xref>-<xref ref-type="table" rid="table4">4</xref>, and corresponding <xref rid="figure2" ref-type="fig">Figures 2</xref>-<xref rid="figure4" ref-type="fig">4</xref> to present direct pairwise comparison results; raw <italic>P</italic> values for multiple comparisons are provided exclusively in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>.</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>Accuracy scores of large language models (LLMs) compared to physicians in clinical diagnosis and treatment questions of breast cancer. The 3 star or dot symbols on each violin from top to bottom represent <italic>P</italic> values comparing each LLM to attending, fellow, and resident physicians, using the Wilcoxon rank-sum test. Symbols are as follows: “***” represents <italic>P</italic>&#60;.001, “**” represents .001≤<italic>P</italic>&#60;.01, “*” represents .01≤<italic>P</italic>&#60;.05, and “black dots” represent <italic>P</italic>≥.05.</p>
          </caption>
          <graphic xlink:href="medinform_v14i1e68182_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Accuracy scores of large language models (LLMs) compared to those of physicians in radiological diagnosis questions of breast cancer. The 3 star or dot symbols on each violin from top to bottom represent <italic>P</italic> values comparing each LLM to attending, fellow, and resident physicians, using the Wilcoxon rank-sum test. Symbols are as follows: “***” represents <italic>P</italic>&#60;.001, “**” represents .001≤<italic>P</italic>&#60;.01, “*” represents .01≤<italic>P</italic>&#60;.05, and “black dots” represent <italic>P</italic>≥.05.</p>
          </caption>
          <graphic xlink:href="medinform_v14i1e68182_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>Accuracy scores of large language models (LLMs) compared to those of physicians in total for all 50 breast cancer questions. The 3 star or dot symbols on each violin from top to bottom represent <italic>P</italic> values comparing each LLM to attending, fellow, and resident physicians, using the Wilcoxon rank-sum test. Symbols are as follows: “***” represents <italic>P</italic>&#60;.001, “**” represents .001≤<italic>P</italic>&#60;.01, “*” represents .01≤<italic>P</italic>&#60;.05, and “black dots” represent <italic>P</italic>≥.05.</p>
          </caption>
          <graphic xlink:href="medinform_v14i1e68182_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Ethical Considerations</title>
        <p>Upon review by the Ethics Committee of Zhuhai People’s Hospital, the information and content provided by the researcher did not involve the collection, storage, or use of human samples, and, therefore, met the requirements for exemption from ethical review.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Representative Examples</title>
        <p><xref ref-type="boxed-text" rid="box1">Textbox 1</xref> provides a representative selection of question-answer examples from the 9 LLMs, while a more comprehensive collection of cases is provided in the <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>.</p>
        <boxed-text id="box1" position="float">
          <title>Representative examples of questions and answers from the 9 large language models (LLMs).</title>
          <p>
            <bold>Sample question:</bold>
          </p>
          <list list-type="bullet">
            <list-item>
              <p>Can an oval, hypoechoic lesion, growing nonparallel or vertically, with a heterogeneous echo pattern and obscured margins on breast ultrasound, be classified as Breast Imaging Reporting and Data System (BI-RADS 4b)?</p>
            </list-item>
          </list>
          <p>
            <bold>ChatGPT-4 response:</bold>
          </p>
          <list list-type="bullet">
            <list-item>
              <p>Yes, an oval, hypoechoic lesion growing nonparallel (vertically) with a heterogeneous echo pattern and obscured margins observed on breast ultrasound could be classified as BI-RADS 4b. This classification indicates a moderate suspicion of malignancy, meaning the lesion exhibits characteristics that are more suspicious than those typically found in BI-RADS 4a but not as definitively malignant as those in BI-RADS 4c.</p>
            </list-item>
            <list-item>
              <p>The characteristics you described—nonparallel orientation (often termed “taller than wide”), heterogeneous echo pattern, and obscured margins—suggest a higher level of concern. These features are often associated with a greater likelihood of malignancy:</p>
              <list list-type="bullet">
                <list-item>
                  <p>Nonparallel or vertical orientation: this orientation is more typical of malignant lesions, as benign masses often grow in parallel with the skin.</p>
                </list-item>
                <list-item>
                  <p>Heterogeneous echo pattern: this can indicate variability within the lesion, which is more common in malignant tumors than in benign growths.</p>
                </list-item>
                <list-item>
                  <p>Obscured margins: this suggests that the lesion’s borders are not clearly defined, which can be an indication of invasive growth, a characteristic more typical of malignant processes.</p>
                </list-item>
              </list>
            </list-item>
            <list-item>
              <p>Conclusion: a lesion under 5 mm without a lipid core showing a washout pattern on magnetic resonance imaging should be approached with a high degree of suspicion for malignancy. This situation requires proactive management, including possible biopsy and close imaging follow-up, to ensure timely diagnosis and treatment.</p>
            </list-item>
          </list>
          <p>
            <bold>ChatGPT-4o response:</bold>
          </p>
          <list list-type="bullet">
            <list-item>
              <p>Yes, an oval, hypoechoic lesion that is growing nonparallel (vertically oriented), with a heterogeneous echo pattern and obscured margins on breast ultrasound can be classified as BI-RADS 4b. BI-RADS 4b is used to describe lesions with a moderate suspicion of malignancy.</p>
            </list-item>
            <list-item>
              <p>Key features of BI-RADS 4b:</p>
              <list list-type="bullet">
                <list-item>
                  <p>Oval shape: while an oval shape can suggest a benign lesion, other features, such as orientation, margin appearance, and internal echo pattern, significantly influence the level of suspicion.</p>
                </list-item>
                <list-item>
                  <p>Nonparallel growth (vertically oriented): a lesion that grows nonparallel, or “taller than wide,” raises concern for malignancy. This orientation is often associated with invasive tumors because malignant lesions tend to invade tissue in a vertical direction.</p>
                </list-item>
                <list-item>
                  <p>Heterogeneous echo pattern: a heterogeneous echo pattern, where the lesion appears uneven or varied in echogenicity, is suspicious because it can indicate the presence of both solid and cystic components, or areas of differing cellular density, which is common in malignant tumors.</p>
                </list-item>
                <list-item>
                  <p>Obscured margins: obscured or indistinct margins are another red flag for malignancy. Clear, well-defined margins are typically seen in benign lesions, while indistinct margins suggest invasion into surrounding tissues.</p>
                </list-item>
              </list>
            </list-item>
            <list-item>
              <p>In conclusion, the combination of nonparallel growth, heterogeneous echo pattern, and obscured margins justifies a BI-RADS 4b classification, meaning the lesion has a moderate risk of malignancy and requires biopsy for a definitive diagnosis.</p>
            </list-item>
          </list>
          <p>
            <bold>ChatGPT-4o mini response:</bold>
          </p>
          <list list-type="bullet">
            <list-item>
              <p>Yes, an oval, hypoechoic lesion with a nonparallel (vertically oriented) growth pattern, heterogeneous echo pattern, and obscured margins on breast ultrasound can be classified as BI-RADS 4b.</p>
            </list-item>
            <list-item>
              <p>Here’s why this classification might be appropriate:</p>
              <list list-type="bullet">
                <list-item>
                  <p>Oval shape: while typically less suspicious than irregular shapes, an oval shape alone does not rule out malignancy.</p>
                </list-item>
                <list-item>
                  <p>Hypoechoic: a darker appearance on ultrasound is often associated with both benign and malignant lesions.</p>
                </list-item>
                <list-item>
                  <p>Nonparallel growth pattern: a vertical or irregular growth pattern can be more concerning, as it may suggest infiltration into surrounding tissues.</p>
                </list-item>
                <list-item>
                  <p>Heterogeneous echo pattern: variability in echo patterns within a lesion can be indicative of more complex internal structures, which may be suggestive of malignancy.</p>
                </list-item>
                <list-item>
                  <p>Obscured margins: margins that are not well-defined or are irregularly shaped can be a sign of malignancy.</p>
                </list-item>
              </list>
            </list-item>
            <list-item>
              <p>BI-RADS 4b indicates a moderate suspicion of malignancy, with a likelihood of 10%-50%. This classification suggests that while there is a notable suspicion of cancer, the probability is not as high as in BI-RADS 4c. A biopsy is recommended to obtain a definitive diagnosis.</p>
            </list-item>
          </list>
        </boxed-text>
      </sec>
      <sec>
        <title>Confidence Level</title>
        <p>Claude 3 Opus achieved the highest average score of 2.78, followed by ChatGPT-4 with a score of 2.74. The remaining models scored as follows: ChatGPT-4o 2.52, ChatGPT-4o mini 2.58, Claude 3.5 Sonnet 2.34, Gemini 1.5 Pro 2.28, Tongyi Qianwen 2.5 1.62, ChatGLM 1.98, and Ernie Bot 3.5 1.50. Claude 3 Opus and ChatGPT-4 demonstrated significantly higher confidence compared to the other models (<xref ref-type="table" rid="table1">Table 1</xref> and <xref rid="figure5" ref-type="fig">Figure 5</xref>).</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Confidence and consistency levels of the large language models (LLMs).</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="340"/>
            <col width="330"/>
            <col width="330"/>
            <thead>
              <tr valign="top">
                <td>Methods</td>
                <td>Confidence level</td>
                <td>Consistency level</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>ChatGPT-4</td>
                <td>2.74</td>
                <td>2.90</td>
              </tr>
              <tr valign="top">
                <td>ChatGPT-4o</td>
                <td>2.52</td>
                <td>2.96</td>
              </tr>
              <tr valign="top">
                <td>ChatGPT-4o mini</td>
                <td>2.58</td>
                <td>2.92</td>
              </tr>
              <tr valign="top">
                <td>Claude 3 Opus</td>
                <td>2.78</td>
                <td>3.00</td>
              </tr>
              <tr valign="top">
                <td>Claude 3.5 Sonnet</td>
                <td>2.34</td>
                <td>3.00</td>
              </tr>
              <tr valign="top">
                <td>Gemini 1.5 Pro</td>
                <td>2.28</td>
                <td>2.96</td>
              </tr>
              <tr valign="top">
                <td>Tongyi Qianwen 2.5</td>
                <td>1.62</td>
                <td>2.86</td>
              </tr>
              <tr valign="top">
                <td>ChatGLM</td>
                <td>1.98</td>
                <td>2.72</td>
              </tr>
              <tr valign="top">
                <td>Ernie Bot 3.5</td>
                <td>1.50</td>
                <td>2.82</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
        <fig id="figure5" position="float">
          <label>Figure 5</label>
          <caption>
            <p>Percentage of accuracy, confidence, and consistency levels of large language models (LLMs) across all 50 questions.</p>
          </caption>
          <graphic xlink:href="medinform_v14i1e68182_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Consistency</title>
        <p>Claude 3 Opus and Claude 3.5 Sonnet achieved the highest score of 3.0, demonstrating their superiority. The remaining models achieved the following consistency scores: ChatGPT-4o: 2.96, Gemini 1.5 Pro: 2.96, ChatGPT-4o mini: 2.92, ChatGPT-4: 2.90, Tongyi Qianwen 2.5: 2.86, Ernie Bot 3.5: 2.82, and ChatGLM: 2.72 (<xref ref-type="table" rid="table1">Table 1</xref> and <xref rid="figure5" ref-type="fig">Figure 5</xref>).</p>
      </sec>
      <sec>
        <title>Performances on Clinical Diagnostics and Treatment Questions</title>
        <p>In the comparison of clinical diagnostic problems, the mean accuracy scores are 3 (SD 0), 2.92 (SD 0.39), 2.81 (SD 0.69), 2.81 (SD 0.49), 2.81 (SD 0.63), 2.77 (SD 0.65), 2.77 (SD 0.82), 2.77 (SD 0.82), 2.65 (SD 0.98), 2.38 (SD 1.02), and 2.35 (SD 1.13) for ChatGPT-4o mini, ChatGPT-4o, Tongyi Qianwen 2.5, attending physicians, fellow physicians, ChatGPT-4, Claude 3 Opus, Claude 3.5 Sonnet, Gemini 1.5 Pro, Ernie Bot 3.5, and ChatGLM, respectively. No significant differences were observed between any language modeling group and the various physician groups. Regarding comparisons among physician groups, no significant differences were detected between different physician subgroups (<xref ref-type="table" rid="table2">Table 2</xref>, <xref rid="figure2" ref-type="fig">Figure 2</xref>, and <xref ref-type="supplementary-material" rid="app1">Multimedia Appendices 1</xref> and <xref ref-type="supplementary-material" rid="app4">4</xref>).</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Accuracy score comparisons of large language models (LLMs) with physician groups for clinical diagnostic and treatment questions.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="220"/>
            <col width="190"/>
            <col width="210"/>
            <col width="180"/>
            <col width="200"/>
            <thead>
              <tr valign="bottom">
                <td>Methods</td>
                <td>Value, mean (SD)</td>
                <td colspan="3">FDR<sup>a</sup>-adjusted <italic>P</italic> value</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>Attending physician</td>
                <td>Fellow physician</td>
                <td>Resident physician</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>ChatGPT-4</td>
                <td>2.77 (0.65)</td>
                <td>.84</td>
                <td>.97</td>
                <td>.66</td>
              </tr>
              <tr valign="top">
                <td>ChatGPT-4o</td>
                <td>2.92 (0.39)</td>
                <td>.44</td>
                <td>.81</td>
                <td>.49</td>
              </tr>
              <tr valign="top">
                <td>ChatGPT-4o mini</td>
                <td>3 (0)</td>
                <td>.37</td>
                <td>.42</td>
                <td>.23</td>
              </tr>
              <tr valign="top">
                <td>Claude 3 Opus</td>
                <td>2.77 (0.82)</td>
                <td>.62</td>
                <td>.90</td>
                <td>.49</td>
              </tr>
              <tr valign="top">
                <td>Claude 3.5 Sonnet</td>
                <td>2.77 (0.82)</td>
                <td>.62</td>
                <td>.90</td>
                <td>.49</td>
              </tr>
              <tr valign="top">
                <td>Gemini 1.5 Pro</td>
                <td>2.65 (0.98)</td>
                <td>.84</td>
                <td>.97</td>
                <td>.66</td>
              </tr>
              <tr valign="top">
                <td>Tongyi Qianwen 2.5</td>
                <td>2.81 (0.69)</td>
                <td>.62</td>
                <td>.90</td>
                <td>.49</td>
              </tr>
              <tr valign="top">
                <td>ChatGLM</td>
                <td>2.35 (1.13)</td>
                <td>.44</td>
                <td>.44</td>
                <td>.49</td>
              </tr>
              <tr valign="top">
                <td>Ernie Bot 3.5</td>
                <td>2.38 (1.02)</td>
                <td>.44</td>
                <td>.42</td>
                <td>.49</td>
              </tr>
              <tr valign="top">
                <td>Attending physician</td>
                <td>2.81 (0.49)</td>
                <td>—<sup>b</sup></td>
                <td>.90</td>
                <td>.74</td>
              </tr>
              <tr valign="top">
                <td>Fellow physician</td>
                <td>2.81 (0.63)</td>
                <td>—</td>
                <td>—</td>
                <td>.66</td>
              </tr>
              <tr valign="top">
                <td>Resident physician</td>
                <td>2.77 (0.51)</td>
                <td>—</td>
                <td>—</td>
                <td> —</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>FDR: false discovery rate.</p>
            </fn>
            <fn id="table2fn2">
              <p><sup>b</sup>Not applicable.</p>
            </fn>
            <fn id="table2fn3">
              <p>In <xref rid="figure2" ref-type="fig">Figure 2</xref>, 3-star or dot symbols on each violin from top to bottom represent <italic>P</italic> values compared to each LLM to attending, fellow, and resident physicians, using the Wilcoxon rank-sum test. Symbols are as follows: “***” represents <italic>P</italic>&#60;.001, “**” represents .001≤<italic>P</italic>&#60;.01, “*” represents .01≤<italic>P</italic>&#60;.05, and “.” represents <italic>P</italic>≥.05.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Performances on Radiological Diagnostics Problems</title>
        <p>In the comparison of radiological diagnostic problems, the scores were as follows: ChatGPT-4 2.79, ChatGPT-4o 2.92, ChatGPT-4o mini 2.54, Claude 3 Opus 2.75, Claude 3.5 Sonnet 2.75, Gemini 1.5 Pro 2.5, Tongyi Qianwen 2.5 2.29, ChatGLM 2.0, and Ernie Bot 3.5 1.58. The attending physicians scored 2.46, the fellow physicians scored 2.04, and the resident physicians scored 2.5. The differences between ChatGPT-4, ChatGPT-4o, ChatGPT-4o mini, Claude 3 Opus, Claude 3.5 Sonnet, and Gemini 1.5 Pro and the fellow physicians were statistically significant (<italic>P</italic>&#60;.05). There were statistically significant differences between ChatGPT-4o and the resident physicians (<italic>P</italic>=.04). In contrast, the differences between Tongyi Qianwen 2.5, ChatGLM, and the physician groups were not statistically significant (<italic>P</italic>&#62;.05). In comparisons among physician groups, the differences were not statistically significant (<italic>P</italic>&#62;.05; <xref ref-type="table" rid="table3">Table 3</xref>, <xref ref-type="supplementary-material" rid="app1">Multimedia Appendices 1</xref> and <xref ref-type="supplementary-material" rid="app4">4</xref>, and <xref rid="figure3" ref-type="fig">Figure 3</xref>).</p>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Accuracy score comparisons of large language models (LLMs) with physician groups for radiological diagnostic questions.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="220"/>
            <col width="190"/>
            <col width="210"/>
            <col width="180"/>
            <col width="200"/>
            <thead>
              <tr valign="bottom">
                <td>Methods</td>
                <td>Value, mean (SD)</td>
                <td colspan="3">FDR<sup>a</sup>-adjusted <italic>P</italic> value</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>Attending physician</td>
                <td>Fellow physician</td>
                <td>Resident physician</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>ChatGPT-4</td>
                <td>2.79 (0.66)</td>
                <td>.10</td>
                <td>&#60;.001</td>
                <td>.07</td>
              </tr>
              <tr valign="top">
                <td>ChatGPT-4o</td>
                <td>2.92 (0.41)</td>
                <td>.06</td>
                <td>&#60;.001</td>
                <td>.04</td>
              </tr>
              <tr valign="top">
                <td>ChatGPT-4o mini</td>
                <td>2.54 (0.93)</td>
                <td>.51</td>
                <td>.01</td>
                <td>.45</td>
              </tr>
              <tr valign="top">
                <td>Claude 3 Opus</td>
                <td>2.75 (0.85)</td>
                <td>.09</td>
                <td>&#60;.001</td>
                <td>.07</td>
              </tr>
              <tr valign="top">
                <td>Claude 3.5 Sonnet</td>
                <td>2.75 (0.85)</td>
                <td>.09</td>
                <td>&#60;.001</td>
                <td>.07</td>
              </tr>
              <tr valign="top">
                <td>Gemini 1.5 Pro</td>
                <td>2.5 (1.06)</td>
                <td>.50</td>
                <td>.01</td>
                <td>.41</td>
              </tr>
              <tr valign="top">
                <td>Tongyi Qianwen 2.5</td>
                <td>2.29 (1.08)</td>
                <td>.81</td>
                <td>.15</td>
                <td>.97</td>
              </tr>
              <tr valign="top">
                <td>ChatGLM</td>
                <td>2 (1.1)</td>
                <td>.22</td>
                <td>.84</td>
                <td>.22</td>
              </tr>
              <tr valign="top">
                <td>Ernie Bot 3.5</td>
                <td>1.58 (1.44)</td>
                <td>.09</td>
                <td>.49</td>
                <td>.09</td>
              </tr>
              <tr valign="top">
                <td>Attending physician</td>
                <td>2.46 (0.78)</td>
                <td>—<sup>b</sup></td>
                <td>.07</td>
                <td>.99</td>
              </tr>
              <tr valign="top">
                <td>Fellow physician</td>
                <td>2.04 (0.75)</td>
                <td>—</td>
                <td>—</td>
                <td>.07</td>
              </tr>
              <tr valign="top">
                <td>Resident physician</td>
                <td>2.5 (0.66)</td>
                <td>—</td>
                <td>—</td>
                <td>—</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table3fn1">
              <p><sup>a</sup>FDR: false discovery rate.</p>
            </fn>
            <fn id="table3fn2">
              <p><sup>b</sup>Not applicable.</p>
            </fn>
            <fn id="table3fn3">
              <p>In <xref rid="figure3" ref-type="fig">Figure 3</xref>, 3-star or dot symbols on each violin from top to bottom represent <italic>P</italic> values compared to each LLM to attending, fellow, and resident physicians, using the Wilcoxon rank-sum test. Symbols are as follows: “***” represents <italic>P</italic>&#60;.001, “**” represents .001≤<italic>P</italic>&#60;.01, “*” represents .01≤<italic>P</italic>&#60;.05, and “.” represents <italic>P</italic>≥.05.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Performances on Full Set Questions</title>
        <p>In the comparison of overall performance, the scores were as follows: ChatGPT-4o 2.92, ChatGPT-4 2.78, ChatGPT-4o mini 2.78, Claude 3 Opus 2.76, Claude 3.5 Sonnet 2.76, Gemini 1.5 Pro 2.58, Tongyi Qianwen 2.5 2.56, ChatGLM 2.18, and Ernie Bot 3.5 2.0. The attending physicians scored 2.64, the fellow physicians scored 2.44, and the resident physicians scored 2.64. The differences between ChatGPT-4, ChatGPT-4o, ChatGPT-4o mini, Claude 3 Opus, Claude 3.5 Sonnet, and the fellow physicians were statistically significant (<italic>P</italic>&#60;.05). Significant differences were observed between ChatGPT-4o and resident physicians (<italic>P</italic>=.01) and between ChatGPT-4o and attending physicians (<italic>P</italic>=.03). In the comparisons among physician groups, none of the differences were statistically significant (<italic>P</italic>&#62;.05; <xref ref-type="table" rid="table4">Table 4</xref>, <xref ref-type="supplementary-material" rid="app1">Multimedia Appendices 1</xref> and <xref ref-type="supplementary-material" rid="app4">4</xref>, and <xref rid="figure4" ref-type="fig">Figure 4</xref>).</p>
        <table-wrap position="float" id="table4">
          <label>Table 4</label>
          <caption>
            <p>Accuracy score comparisons of large language model (LLM) models with physician groups for all 50 questions.</p>
          </caption>
          <table border="1" rules="groups" cellpadding="5" frame="hsides" width="1000" cellspacing="0">
            <col width="220"/>
            <col width="190"/>
            <col width="210"/>
            <col width="180"/>
            <col width="200"/>
            <thead>
              <tr valign="bottom">
                <td>Methods</td>
                <td>Value, mean (SD)</td>
                <td colspan="3">FDR<sup>a</sup>-adjusted <italic>P</italic> value</td>
              </tr>
              <tr valign="top">
                <td/>
                <td/>
                <td>Attending physician</td>
                <td>Fellow physician</td>
                <td>Resident physician</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>ChatGPT-4</td>
                <td>2.78 (0.65)</td>
                <td>.13</td>
                <td>.006</td>
                <td>.09</td>
              </tr>
              <tr valign="top">
                <td>ChatGPT-4o</td>
                <td>2.92 (0.4)</td>
                <td>.03</td>
                <td>&#60;.001</td>
                <td>.01</td>
              </tr>
              <tr valign="top">
                <td>ChatGPT-4o mini</td>
                <td>2.78 (0.68)</td>
                <td>.13</td>
                <td>.006</td>
                <td>.09</td>
              </tr>
              <tr valign="top">
                <td>Claude 3 Opus</td>
                <td>2.76 (0.82)</td>
                <td>.08</td>
                <td>.003</td>
                <td>.05</td>
              </tr>
              <tr valign="top">
                <td>Claude 3.5 Sonnet</td>
                <td>2.76 (0.82)</td>
                <td>.08</td>
                <td>.003</td>
                <td>.05</td>
              </tr>
              <tr valign="top">
                <td>Gemini 1.5 Pro</td>
                <td>2.58 (1.01)</td>
                <td>.47</td>
                <td>.06</td>
                <td>.30</td>
              </tr>
              <tr valign="top">
                <td>Tongyi Qianwen 2.5</td>
                <td>2.56 (0.93)</td>
                <td>.84</td>
                <td>.18</td>
                <td>.67</td>
              </tr>
              <tr valign="top">
                <td>ChatGLM</td>
                <td>2.18 (1.12)</td>
                <td>.09</td>
                <td>.47</td>
                <td>.12</td>
              </tr>
              <tr valign="top">
                <td>Ernie Bot 3.5</td>
                <td>2 (1.29)</td>
                <td>.08</td>
                <td>.26</td>
                <td>.09</td>
              </tr>
              <tr valign="top">
                <td>Attending physician</td>
                <td>2.64 (0.66)</td>
                <td>—<sup>b</sup></td>
                <td>.19</td>
                <td>.78</td>
              </tr>
              <tr valign="top">
                <td>Fellow physician</td>
                <td>2.44 (0.79)</td>
                <td>—</td>
                <td>—</td>
                <td>.30</td>
              </tr>
              <tr valign="top">
                <td>Resident physician</td>
                <td>2.64 (0.6)</td>
                <td>—</td>
                <td>—</td>
                <td>—</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table4fn1">
              <p><sup>a</sup>FDR: false discovery rate.</p>
            </fn>
            <fn id="table4fn2">
              <p><sup>b</sup>Not applicable.</p>
            </fn>
            <fn id="table4fn3">
              <p>In <xref rid="figure4" ref-type="fig">Figure 4</xref>, 3-star or dot symbols on each violin from top to bottom represented <italic>P</italic> values compared to each LLM model to attending, fellow, and resident physicians, with the Wilcoxon rank-sum test. Symbols are as follows: “***” represents <italic>P</italic>&#60;.001, “**” represents .001≤<italic>P</italic>&#60;.01, “*” represents .01≤<italic>P</italic>&#60;.05, and “.” represents <italic>P</italic>≥.05.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>With the global population aging, the rising incidence of cancer, and the increasing complexity of treatment options, there is a growing demand for MDT discussions. However, implementing high-level MDTs in underdeveloped countries remains a significant challenge [<xref ref-type="bibr" rid="ref16">16</xref>], largely because of the knowledge barriers between specialist doctors. An important question is how specialist doctors in less developed regions can achieve the same medical standards as those in developed regions, easily access authoritative and professional medical knowledge, and overcome barriers between specialties. The recently developed chatbot can theoretically provide instant, evidence-based responses, demonstrating significant potential as an ideal tool for enhancing high-quality health care in underserved regions. In our comparison of 9 LLMs, Claude 3 Opus demonstrated the highest confidence score at 2.78, followed by ChatGPT-4 (2.74) and ChatGPT-4o mini (2.58). These scores suggest that these models exhibit a higher degree of certainty when managing complex medical issues. In contrast, Ernie Bot 3.5’s confidence score of 1.50 was significantly lower, indicating a reduced ability to comprehend and process medical problems. These findings underscore the varying levels of proficiency among language models in accurately addressing the same problem. Interestingly, unlike human practitioners, not all chatbots confidently respond to closed-ended questions with a definitive “yes” or “no,” suggesting that training data diversity and model architecture significantly impact diagnostic accuracy. This uncertainty may lead to reduced consistency and accuracy of their outputs, thus limiting their independent use in practical clinical applications, making them inseparable from close collaboration with physicians during decision-making.</p>
        <p>In addition, in the consistency assessment, Claude 3 Opus and Claude 3.5 Sonnet achieved the highest scores of 3, followed by ChatGPT-4o and Gemini 1.5 Pro, both scoring 2.96, with ChatGPT-4o mini at 2.92. In contrast, ChatGLM and Ernie Bot 3.5 exhibited lower consistency scores of 2.72 and 2.82, respectively. These results highlight the differences in response stability across the LLMs, a factor that is especially critical for clinical decision support. In clinical applications, particularly in dynamic and complex environments, models lacking sufficient consistency may undermine confidence in their outputs. Although Claude 3 Opus and Claude 3.5 Sonnet surpassed the ChatGPT models in response consistency (scoring 3), the ChatGPT models demonstrated superior performance in diagnostic accuracy. However, none of the models provided correct answers to all question sets. Therefore, while AI introduces new possibilities in fields such as medicine, it also presents challenges that require careful expert scrutiny to avoid imposing additional burdens on patients and health care professionals [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref18">18</xref>].</p>
        <p>By comparing the 9 LLMs with the responses of 9 radiologists on the overall 50 guideline questions, ChatGPT-4o achieved the highest score of 2.92, significantly outperforming all physician groups. The difference between ChatGPT-4o and all physician groups (<italic>P</italic>&#60;.05) was statistically significant. ChatGPT-4, ChatGPT-4o mini, Claude 3 Opus, and Claude 3.5 Sonnet all scored higher on average than the physician groups. However, the difference was statistically significant only for the comparison with fellow physicians (<italic>P</italic>&#60;.05). In contrast, Gemini 1.5 Pro and Tongyi Qianwen 2.5 had lower scores, with no statistically significant differences compared to each group of doctors (<italic>P</italic>&#62;.05). ChatGLM and Ernie Bot 3.5 showed lower scores than all physician groups, but the differences were not statistically significant. No statistically significant differences (<italic>P</italic>&#62;.05) were found in the comparisons between each group of doctors, suggesting that attending physicians, fellow physicians, and resident physicians performed comparably in answering the entire question set. Attending physicians and resident physicians both scored 2.64, which may reflect a more consistent ability to answer certain routine questions across all years of experience. Fellow physicians, on the other hand, scored slightly lower, but the difference was not significant, suggesting that their performance in overall diagnostic and therapeutic tasks remains competitive.</p>
        <p>In the evaluation of radiological diagnostic performance, ChatGPT-4o scored 2.92. A significant difference was observed between ChatGPT-4o and resident physicians (<italic>P</italic>=.04), and a significant difference was also detected between ChatGPT-4o and fellow physicians (<italic>P</italic>&#60;.05). ChatGPT-4 achieved a score of 2.79, surpassing all physician groups on average. However, this difference reached statistical significance solely in the comparison with fellow physicians (<italic>P</italic>&#60;.05). The difference between ChatGPT-4o mini and the fellow physicians was statistically significant (<italic>P</italic>=.01). Claude 3 Opus and Claude 3.5 Sonnet both scored 2.75, higher than all physician groups; in particular, the difference between these LLMs and fellow physicians was statistically significant (<italic>P</italic>&#60;.05), suggesting superior performance in radiological diagnostics. Gemini 1.5 Pro scored 2.50, showing small differences compared with attending physicians and resident physicians but significantly higher than fellow physicians (<italic>P</italic>=.01). Tongyi Qianwen 2.5 and ChatGLM scored lower, with no statistically significant differences compared to physician groups (<italic>P</italic>&#62;.05). Ernie Bot 3.5 had the lowest score (1.58) among all LLMs, with no statistical significance compared to any physician group, indicating relatively poorer performance. We conclude that the ChatGPT and Claude series outperform junior medical doctors in addressing complex medical issues, highlighting their potential clinical utility. However, ChatGLM and Ernie Bot 3.5 may require further optimization to improve their performance in medical diagnostics. In the diagnostic radiology domain, the average performance of attending physicians and resident physicians was comparable, whereas the small group of fellow physicians scored lower on average. 
It is noteworthy that resident physicians, who have recently completed standardized residency training, may be more familiar with the specific guidelines on which the questions were based. Attending physicians, benefiting from extensive clinical experience, would be expected to exhibit an advantage in diagnosing complex imaging findings. The performance variation across the small physician subgroups underscores the preliminary nature of these comparisons.</p>
        <p>In the assessment of clinical diagnosis and treatment performance, ChatGPT-4o mini achieved the highest score of 3 among all LLMs, although no statistically significant differences were found when compared with any physician group. ChatGPT-4o scored 2.92 and showed no significant difference compared with physician groups (<italic>P</italic>&#62;.05); however, it outperformed all physician groups on average. Other LLMs, including ChatGPT-4, Claude 3 Opus, and Claude 3.5 Sonnet, exhibited similar performance to physician groups. ChatGLM and Ernie Bot 3.5 showed weaker performance. Overall, the ChatGPT and Claude series of LLMs outperformed some physician groups with lower seniority in complex health care challenges, indicating potential for clinical applications. Further optimization may be necessary for ChatGLM and Ernie Bot 3.5 to enhance their diagnostic performance. Scores between attending physicians and fellow physicians were identical, while resident physicians scored slightly lower; however, these differences did not reach statistical significance. Radiologists of varying experience levels may possess adequate knowledge of clinical guidelines in diagnosis and treatment, particularly when dealing with standardized procedures or common diseases, resulting in minimal differences. Despite differences in experience, attending physicians and fellow physicians demonstrated similar proficiency in applying clinical guidelines.</p>
        <p>It is noteworthy that Tongyi Qianwen 2.5 was the only model unable to generate a response to a specific question, instead providing the feedback, “I’m very sorry, but I don’t think I fully understand what you mean. Let’s change the subject first, shall we?” This issue may be attributable to the complexity and specialized nature of the question, or potential biases and gaps in the model’s training data, leading to insufficient knowledge in specific domains. Our data indicate that when Tongyi Qianwen 2.5 fails to respond, the model’s accuracy may be overstated, and its actual performance is significantly below expectations.</p>
      </sec>
      <sec>
        <title>Comparison With Previous Work</title>
        <p>In our analysis of the 9 language models, Ernie Bot 3.5, Tongyi Qianwen 2.5, and ChatGLM were more likely to recommend referring the questions to medical experts when faced with specialized imaging terminology and clinical scenarios, rather than providing a definitive answer. In contrast, Claude 3 Opus and the ChatGPT series provided more detailed diagnostic imaging feedback, including recommendations for additional biopsies to support diagnosis and an emphasis on multidisciplinary collaboration. Regarding clinical considerations for surgery, the ChatGPT series excelled by offering surgical recommendations for breast cancer patients, while also demonstrating conventional surgical approaches and discussing recent therapeutic advancements. Furthermore, the ChatGPT family outperformed other language models in breast cancer diagnosis and treatment, with each model excelling in its respective area of expertise. This finding highlights the critical need to select the most appropriate language model based on task-specific performance, thereby enhancing problem-solving across various scenarios. These interactive language models provided feedback significantly faster than breast cancer radiologists, reducing time costs and increasing productivity. With continued iteration and improvement, ChatGPT has demonstrated considerable potential in radiology, including concise report generation, support for medical education, clinical decision aids, patient communication optimization, and data analysis [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref20">20</xref>]. 
Analyzing the reasons for comparable performance between leading language models reveals several key advantages: (1) large-scale, high-quality training data: both the developers of Claude 3 Opus and ChatGPT have invested substantial resources in acquiring and curating authoritative medical literature, textbooks, and specialized professional databases; (2) comparable model scale and architecture: as state-of-the-art LLMs, both likely use similar parameter scales and analogous underlying architectural designs; (3) equivalent alignment methodologies: both organizations have likely implemented similar reinforcement learning from human feedback techniques, using extensive human evaluator input to enhance the accuracy of model outputs; (4) competition-driven enhancement: market competition has compelled both companies to continuously improve model performance, particularly in high-value vertical domains such as health care. This competitive environment has resulted in both products demonstrating enhanced logical reasoning capabilities; (5) domain-specific knowledge processing: both models have likely incorporated specialized training methodologies for processing technical terminology and complex semantic relationships within specialized conceptual frameworks; (6) parallel safety control mechanisms: in high-risk domains such as medicine, both models have presumably implemented stringent output controls and prudence measures to ensure responsible performance. Given the continuous evolution of LLMs, their capabilities in complex domains, particularly medicine, are rapidly advancing. Future research should further investigate the potential of these models to maximize their value in clinical diagnosis and treatment [<xref ref-type="bibr" rid="ref21">21</xref>].</p>
        <p>However, LLMs were not specifically trained for the medical field and rely on large datasets collected from public sources, such as web pages, books, and websites. Chinese LLMs have been reported to achieve average accuracy of around 82% in general medical question-answering; yet, performance drops significantly in complex clinical reasoning tasks, such as breast cancer diagnosis (below 50%) [<xref ref-type="bibr" rid="ref22">22</xref>]. Advanced models, such as ChatGPT-3.5, ChatGPT-4.0, and Claude-2, achieve slightly higher performance (approximately 60%) in breast cancer clinical assessments, though this still highlights critical limitations in high-stakes medical applications [<xref ref-type="bibr" rid="ref23">23</xref>]. Despite the vast amount of training data, these models lack comprehensive common sense, which limits their reasoning ability. The literature suggests that LLM performance can be significantly enhanced by guiding reasoning steps incrementally, particularly in tasks involving longer reasoning chains [<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref25">25</xref>]. In our study, however, the 9 LLMs were unable to accurately diagnose all rigorous and logical medical problems, and a “reversal curse” phenomenon was observed. Furthermore, unlike real-time search engines such as Google and Bing, LLMs can only generate responses based on their training data and do not have real-time access to current information [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref27">27</xref>]. In addition, LLMs cannot reliably differentiate between factual and fictional statements, posing a potential risk as sources of misinformation in certain cases [<xref ref-type="bibr" rid="ref28">28</xref>-<xref ref-type="bibr" rid="ref30">30</xref>]. 
Moreover, the training data for these models are not publicly available, and the sources of their information are not disclosed, making it difficult to verify the accuracy of their outputs [<xref ref-type="bibr" rid="ref31">31</xref>]. Therefore, radiologists must continue to rely on traditional evidence-based education and approach LLMs critically, cross-referencing their outputs with credible medical sources [<xref ref-type="bibr" rid="ref32">32</xref>].</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>This study has several limitations. First, our findings are subject to the inherent limitations of the author-crafted, closed-ended question format. Although designed for objective assessment, this format may not fully capture the reasoning capabilities of LLMs and could potentially underestimate their performance in real-world, open-ended clinical scenarios. Second, the binary definition of a “confident” response as a direct “yes” or “no” may not adequately reflect clinical decision-making, where acknowledging uncertainty is a sign of expertise. This could have introduced bias by favoring LLMs that generate definitive answers over physicians who appropriately consider diagnostic nuances. Third, the physician sample size, while sufficient for initial comparisons, remains relatively small and may not be fully representative of the broader population of radiologists, which could affect the generalizability of our pairwise comparisons. Fourth, the study’s generalizability is constrained by its specific design: the assessment is limited to 50 selected questions that do not encompass all aspects of the guidelines, and the performance of the LLMs was evaluated on a single, specific task (answering structured concordance questions). Consequently, the results may not be directly extrapolated to other clinical datasets or different natural language processing tasks. Fifth, the study conditions, including a strict 30-minute time limit and a “no-consultation” rule for the participating physicians, may not reflect real-world clinical practice, where complex cases often benefit from more time and collaborative discussion. These methodological choices, while necessary for standardization, may have disadvantaged the physicians and thus the conclusions of this study must be heavily qualified by this context. 
In addition, it has been reported that LLMs may generate different responses to identical prompts at different times, due to model updates [<xref ref-type="bibr" rid="ref31">31</xref>,<xref ref-type="bibr" rid="ref33">33</xref>]. Future studies should use larger and more diverse physician cohorts, incorporate more nuanced assessments of confidence, explore the impact of different time constraints and collaborative settings, and use a broader range of question types and clinical tasks to validate and extend our findings.</p>
      </sec>
      <sec>
        <title>Future Directions</title>
        <p>In medical AI applications, LLMs participating in clinical decision-making face critical responsibility challenges, including decision output liability, misinformation risks, and privacy breach accountability [<xref ref-type="bibr" rid="ref30">30</xref>]. Typically, LLMs do not independently generate decision outputs but function as tools for physicians. Clinicians must integrate guidelines, relevant literature, and their clinical expertise to formulate final clinical decisions. When using LLMs, it is essential to conceal patients’ identifying information to safeguard privacy.</p>
        <p>This necessitates establishing explicit legal frameworks that delineate responsibility boundaries among developers, health care institutions, and physicians, while constructing comprehensive accountability mechanisms [<xref ref-type="bibr" rid="ref34">34</xref>,<xref ref-type="bibr" rid="ref35">35</xref>]. Concurrently, LLMs present a risk of generating inaccurate medical information that could potentially mislead diagnostic and therapeutic decisions. To address this concern, model developers must implement rigorous content review protocols and ensure professional, authoritative verification of training corpora.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>In conclusion, within the context of our study, Claude 3 Opus and the ChatGPT series, particularly ChatGPT-4o, excelled in addressing breast cancer–related guideline questions and demonstrated significant potential for clinical application. In contrast, the less consistent performance of other LLMs, especially Ernie Bot 3.5 and ChatGLM, suggests that both confidence and consistency should be carefully considered when selecting and applying LLMs to ensure efficacy and stability in clinical practice. This preliminary evaluation indicates that LLMs can provide radiologists with extensive cross-disciplinary knowledge, potentially enhancing contributions within MDTs. However, they cannot fully replace human expertise, particularly in complex diagnostic scenarios that require nuanced decision-making honed by clinical experience.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Scores of large language models (LLMs) and physician groups on 50 questions, with comparison analysis.</p>
        <media xlink:href="medinform_v14i1e68182_app1.xlsx" xlink:title="XLSX File  (Microsoft Excel File), 14 KB"/>
      </supplementary-material>
      <supplementary-material id="app2">
        <label>Multimedia Appendix 2</label>
        <p>Comparison of accuracy scores between large language models (LLMs) and physicians across 2 domains.</p>
        <media xlink:href="medinform_v14i1e68182_app2.docx" xlink:title="DOCX File , 23 KB"/>
      </supplementary-material>
      <supplementary-material id="app3">
        <label>Multimedia Appendix 3</label>
        <p>Scores of large language models (LLMs) and physician groups on 50 questions, including raw <italic>P</italic> values for comparison.</p>
        <media xlink:href="medinform_v14i1e68182_app3.docx" xlink:title="DOCX File , 20 KB"/>
      </supplementary-material>
      <supplementary-material id="app4">
        <label>Multimedia Appendix 4</label>
        <p>Specific responses from 9 large language models (LLMs) to the 50 questions (each question asked 3 times) and the corresponding physicians’ responses.</p>
        <media xlink:href="medinform_v14i1e68182_app4.xlsx" xlink:title="XLSX File  (Microsoft Excel File), 854 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AI</term>
          <def>
            <p>artificial intelligence</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">ACR BI-RADS</term>
          <def>
            <p>American College of Radiology Breast Imaging-Reporting and Data System</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">LLM</term>
          <def>
            <p>large language model</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">MDT</term>
          <def>
            <p>multidisciplinary team</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">NCCN</term>
          <def>
            <p>National Comprehensive Cancer Network</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>We sincerely thank Dr Siping Luo, Dr Xiao Luo, Huimei Lü, Zhichunmei Yuan, and Qiuxinmei Chen for their invaluable suggestions in designing the 50 breast cancer questions.</p>
    </ack>
    <notes>
      <title>Data Availability</title>
      <p>The datasets generated or analyzed during this study are available from the corresponding author upon reasonable request.</p>
    </notes>
    <notes>
      <title>Funding</title>
      <p>This research was funded by the Department of Science and Technology of Guangdong Province (grant no 112175605105) and the research project “Preoperative Localization and Navigation of Non-Mass Breast Cancer with DCE-MRI and Ultrasound Fusion Imaging” (grant no 2023yjlcyj019), and the Shenzhen Basic Research Project (Natural Science Foundation; grant no JCYJ20220530150416036). The funding period was 3 years, starting in June 2023, with a total budget of CNY $35,615 (US $5000).</p>
    </notes>
    <fn-group>
      <fn fn-type="con">
        <p>CY was responsible for conceptualization, developing the conceptual framework, and formulating the ideas for this study. HJ and CY prepared the original draft, including substantive translation, and contributed to manuscript creation. HJ, CY, JY, and CY participated in the review and editing of the manuscript, covering both prepublication and postpublication stages. Supervision of the study was provided by HJ, CY, and JY, who oversaw and took leadership responsibility for the planning and execution of research activities. Resource support, including the provision of breast cancer–related guideline materials, was provided by JY, HJ, and CY. Formal analysis was conducted by WZ, who applied statistical techniques to analyze the research data, while data curation, including management of LLM models and essential software code, was performed by WZ and SZ. Investigation, including data collection during the study, was carried out by GR, WW, and MW. Project administration was coordinated by JY, who ensured the smooth planning and execution of research activities. Funding acquisition was secured by HJ and JY.</p>
      </fn>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bray</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Laversanne</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Sung</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Ferlay</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Siegel</surname>
              <given-names>RL</given-names>
            </name>
            <name name-style="western">
              <surname>Soerjomataram</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Jemal</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Global cancer statistics 2022: GLOBOCAN estimates of incidence and mortality worldwide for 36 cancers in 185 countries</article-title>
          <source>CA Cancer J Clin</source>
          <year>2024</year>
          <volume>74</volume>
          <issue>3</issue>
          <fpage>229</fpage>
          <lpage>263</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://onlinelibrary.wiley.com/doi/10.3322/caac.21834"/>
          </comment>
          <pub-id pub-id-type="doi">10.3322/caac.21834</pub-id>
          <pub-id pub-id-type="medline">38572751</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="web">
          <article-title>New global breast cancer initiative highlights renewed commitment to improve survival</article-title>
          <source>World Health Organization (WHO)</source>
          <year>2021</year>
          <month>3</month>
          <day>8</day>
          <access-date>2025-01-01</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.who.int/news/item/08-03-2021-new-global-breast-cancer-initiative-highlights-renewed-commitment-to-improve-survival">https://www.who.int/news/item/08-03-2021-new-global-breast-cancer-initiative-highlights-renewed-commitment-to-improve-survival</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Trapani</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Ginsburg</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Fadelu</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>NU</given-names>
            </name>
            <name name-style="western">
              <surname>Hassett</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ilbawi</surname>
              <given-names>AM</given-names>
            </name>
            <name name-style="western">
              <surname>Anderson</surname>
              <given-names>BO</given-names>
            </name>
            <name name-style="western">
              <surname>Curigliano</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Global challenges and policy solutions in breast cancer control</article-title>
          <source>Cancer Treat Rev</source>
          <year>2022</year>
          <volume>104</volume>
          <fpage>102339</fpage>
          <pub-id pub-id-type="doi">10.1016/j.ctrv.2022.102339</pub-id>
          <pub-id pub-id-type="medline">35074727</pub-id>
          <pub-id pub-id-type="pii">S0305-7372(22)00002-0</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Azadnajafabad</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Saeedi Moghaddam</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Mohammadi</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Delazar</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Rashedi</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Baradaran</surname>
              <given-names>HR</given-names>
            </name>
            <name name-style="western">
              <surname>Mansourian</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Patterns of better breast cancer care in countries with higher human development index and healthcare expenditure: insights from GLOBOCAN 2020</article-title>
          <source>Front Public Health</source>
          <year>2023</year>
          <volume>11</volume>
          <fpage>1137286</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37124828"/>
          </comment>
          <pub-id pub-id-type="doi">10.3389/fpubh.2023.1137286</pub-id>
          <pub-id pub-id-type="medline">37124828</pub-id>
          <pub-id pub-id-type="pmcid">PMC10130425</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hisham</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Ng</surname>
              <given-names>CJ</given-names>
            </name>
            <name name-style="western">
              <surname>Liew</surname>
              <given-names>SM</given-names>
            </name>
            <name name-style="western">
              <surname>Hamzah</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Ho</surname>
              <given-names>GJ</given-names>
            </name>
          </person-group>
          <article-title>Why is there variation in the practice of evidence-based medicine in primary care? A qualitative study</article-title>
          <source>BMJ Open</source>
          <year>2016</year>
          <volume>6</volume>
          <issue>3</issue>
          <fpage>e010565</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmjopen.bmj.com/lookup/pmidlookup?view=long&#38;pmid=26962037"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/bmjopen-2015-010565</pub-id>
          <pub-id pub-id-type="medline">26962037</pub-id>
          <pub-id pub-id-type="pii">bmjopen-2015-010565</pub-id>
          <pub-id pub-id-type="pmcid">PMC4785283</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yuan</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Cao</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Qian</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>Regional disparities and influencing factors of high quality medical resources distribution in China</article-title>
          <source>Int J Equity Health</source>
          <year>2023</year>
          <volume>22</volume>
          <issue>1</issue>
          <fpage>8</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://equityhealthj.biomedcentral.com/articles/10.1186/s12939-023-01825-6"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12939-023-01825-6</pub-id>
          <pub-id pub-id-type="medline">36627636</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12939-023-01825-6</pub-id>
          <pub-id pub-id-type="pmcid">PMC9832614</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zuhair</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Babar</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Ali</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Oduoye</surname>
              <given-names>MO</given-names>
            </name>
            <name name-style="western">
              <surname>Noor</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Chris</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Okon</surname>
              <given-names>II</given-names>
            </name>
            <name name-style="western">
              <surname>Rehman</surname>
              <given-names>LU</given-names>
            </name>
          </person-group>
          <article-title>Exploring the impact of artificial intelligence on global health and enhancing healthcare in developing nations</article-title>
          <source>J Prim Care Community Health</source>
          <year>2024</year>
          <volume>15</volume>
          <fpage>21501319241245847</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://journals.sagepub.com/doi/10.1177/21501319241245847?url_ver=Z39.88-2003&#38;rfr_id=ori:rid:crossref.org&#38;rfr_dat=cr_pub  0pubmed"/>
          </comment>
          <pub-id pub-id-type="doi">10.1177/21501319241245847</pub-id>
          <pub-id pub-id-type="medline">38605668</pub-id>
          <pub-id pub-id-type="pmcid">PMC11010755</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Geras</surname>
              <given-names>KJ</given-names>
            </name>
            <name name-style="western">
              <surname>Mann</surname>
              <given-names>RM</given-names>
            </name>
            <name name-style="western">
              <surname>Moy</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Artificial intelligence for mammography and digital breast tomosynthesis: current concepts and future perspectives</article-title>
          <source>Radiology</source>
          <year>2019</year>
          <volume>293</volume>
          <issue>2</issue>
          <fpage>246</fpage>
          <lpage>259</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/31549948"/>
          </comment>
          <pub-id pub-id-type="doi">10.1148/radiol.2019182627</pub-id>
          <pub-id pub-id-type="medline">31549948</pub-id>
          <pub-id pub-id-type="pmcid">PMC6822772</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kung</surname>
              <given-names>TH</given-names>
            </name>
            <name name-style="western">
              <surname>Cheatham</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Medenilla</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sillos</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>De Leon</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Elepaño</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Madriaga</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Aggabao</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Diaz-Candido</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Maningo</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Tseng</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Performance of ChatGPT on USMLE: potential for AI-assisted medical education using large language models</article-title>
          <source>PLOS Digit Health</source>
          <year>2023</year>
          <volume>2</volume>
          <issue>2</issue>
          <fpage>e0000198</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/36812645"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pdig.0000198</pub-id>
          <pub-id pub-id-type="medline">36812645</pub-id>
          <pub-id pub-id-type="pii">PDIG-D-22-00371</pub-id>
          <pub-id pub-id-type="pmcid">PMC9931230</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bhayana</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Krishna</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Bleakney</surname>
              <given-names>RR</given-names>
            </name>
          </person-group>
          <article-title>Performance of ChatGPT on a radiology board-style examination: insights into current strengths and limitations</article-title>
          <source>Radiology</source>
          <year>2023</year>
          <volume>307</volume>
          <issue>5</issue>
          <fpage>e230582</fpage>
          <pub-id pub-id-type="doi">10.1148/radiol.230582</pub-id>
          <pub-id pub-id-type="medline">37191485</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Maroncelli</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Rizzo</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Pasculli</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Cicciarelli</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Macera</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Galati</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Catalano</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Pediconi</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>Probing clarity: AI-generated simplified breast imaging reports for enhanced patient comprehension powered by ChatGPT-4o</article-title>
          <source>Eur Radiol Exp</source>
          <year>2024</year>
          <volume>8</volume>
          <issue>1</issue>
          <fpage>124</fpage>
          <pub-id pub-id-type="doi">10.1186/s41747-024-00526-1</pub-id>
          <pub-id pub-id-type="medline">39477904</pub-id>
          <pub-id pub-id-type="pii">10.1186/s41747-024-00526-1</pub-id>
          <pub-id pub-id-type="pmcid">PMC11525358</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jeblick</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Schachtner</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Dexl</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Mittermeier</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Stüber</surname>
              <given-names>AT</given-names>
            </name>
            <name name-style="western">
              <surname>Topalis</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Weber</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Wesp</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Sabel</surname>
              <given-names>BO</given-names>
            </name>
            <name name-style="western">
              <surname>Ricke</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ingrisch</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT makes medicine easy to swallow: an exploratory case study on simplified radiology reports</article-title>
          <source>Eur Radiol</source>
          <year>2024</year>
          <volume>34</volume>
          <issue>5</issue>
          <fpage>2817</fpage>
          <lpage>2825</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37794249"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/s00330-023-10213-1</pub-id>
          <pub-id pub-id-type="medline">37794249</pub-id>
          <pub-id pub-id-type="pii">10.1007/s00330-023-10213-1</pub-id>
          <pub-id pub-id-type="pmcid">PMC11126432</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lukac</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Dayan</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Fink</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Leinert</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Hartkopf</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Veselinovic</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Janni</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Rack</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Pfister</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Heitmeir</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Ebner</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>Evaluating ChatGPT as an adjunct for the multidisciplinary tumor board decision-making in primary breast cancer cases</article-title>
          <source>Arch Gynecol Obstet</source>
          <year>2023</year>
          <volume>308</volume>
          <issue>6</issue>
          <fpage>1831</fpage>
          <lpage>1844</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37458761"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/s00404-023-07130-5</pub-id>
          <pub-id pub-id-type="medline">37458761</pub-id>
          <pub-id pub-id-type="pii">10.1007/s00404-023-07130-5</pub-id>
          <pub-id pub-id-type="pmcid">PMC10579162</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Schulte</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Capacity of ChatGPT to identify guideline-based treatments for advanced solid tumors</article-title>
          <source>Cureus</source>
          <year>2023</year>
          <volume>15</volume>
          <issue>4</issue>
          <fpage>e37938</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37220429"/>
          </comment>
          <pub-id pub-id-type="doi">10.7759/cureus.37938</pub-id>
          <pub-id pub-id-type="medline">37220429</pub-id>
          <pub-id pub-id-type="pmcid">PMC10200252</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Berglund</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Tong</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Kaufmann</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Balesni</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Stickland</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Korbak</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>The reversal curse: LLMs trained on "A is B" fail to learn "B is A"</article-title>
          <source>arXiv. Preprint posted online on Sep 21, 2023</source>
          <year>2023</year>
          <pub-id pub-id-type="doi">10.48550/arXiv.2309.12288</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Eaton</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Zambrano</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sanabria</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Lopez</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Kyei</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Mra</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Sarchet</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Kremzier</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Borras</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Aung</surname>
              <given-names>TK</given-names>
            </name>
            <name name-style="western">
              <surname>Morton Doherty</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Henshall</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Camacho</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Innovative methodology for strengthening a multidisciplinary team approach in cities in low- and middle-income countries</article-title>
          <source>JCO Glob Oncol</source>
          <year>2022</year>
          <volume>8</volume>
          <fpage>e2200149</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/36252159"/>
          </comment>
          <pub-id pub-id-type="doi">10.1200/GO.22.00149</pub-id>
          <pub-id pub-id-type="medline">36252159</pub-id>
          <pub-id pub-id-type="pmcid">PMC9812446</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bi</surname>
              <given-names>WL</given-names>
            </name>
            <name name-style="western">
              <surname>Hosny</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Schabath</surname>
              <given-names>MB</given-names>
            </name>
            <name name-style="western">
              <surname>Giger</surname>
              <given-names>ML</given-names>
            </name>
            <name name-style="western">
              <surname>Birkbak</surname>
              <given-names>NJ</given-names>
            </name>
            <name name-style="western">
              <surname>Mehrtash</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Allison</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Arnaout</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Abbosh</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Dunn</surname>
              <given-names>IF</given-names>
            </name>
            <name name-style="western">
              <surname>Mak</surname>
              <given-names>RH</given-names>
            </name>
            <name name-style="western">
              <surname>Tamimi</surname>
              <given-names>RM</given-names>
            </name>
            <name name-style="western">
              <surname>Tempany</surname>
              <given-names>CM</given-names>
            </name>
            <name name-style="western">
              <surname>Swanton</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Hoffmann</surname>
              <given-names>U</given-names>
            </name>
            <name name-style="western">
              <surname>Schwartz</surname>
              <given-names>LH</given-names>
            </name>
            <name name-style="western">
              <surname>Gillies</surname>
              <given-names>RJ</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>RY</given-names>
            </name>
            <name name-style="western">
              <surname>Aerts</surname>
              <given-names>HJWL</given-names>
            </name>
          </person-group>
          <article-title>Artificial intelligence in cancer imaging: clinical challenges and applications</article-title>
          <source>CA Cancer J Clin</source>
          <year>2019</year>
          <volume>69</volume>
          <issue>2</issue>
          <fpage>127</fpage>
          <lpage>157</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/30720861"/>
          </comment>
          <pub-id pub-id-type="doi">10.3322/caac.21552</pub-id>
          <pub-id pub-id-type="medline">30720861</pub-id>
          <pub-id pub-id-type="pmcid">PMC6403009</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Thirunavukarasu</surname>
              <given-names>AJ</given-names>
            </name>
            <name name-style="western">
              <surname>Mahmood</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Malem</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Foster</surname>
              <given-names>WP</given-names>
            </name>
            <name name-style="western">
              <surname>Sanghera</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Hassan</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Wong</surname>
              <given-names>SW</given-names>
            </name>
            <name name-style="western">
              <surname>Wong</surname>
              <given-names>YL</given-names>
            </name>
            <name name-style="western">
              <surname>Chong</surname>
              <given-names>YJ</given-names>
            </name>
            <name name-style="western">
              <surname>Shakeel</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Tan</surname>
              <given-names>BKJ</given-names>
            </name>
            <name name-style="western">
              <surname>Jain</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Tan</surname>
              <given-names>TF</given-names>
            </name>
            <name name-style="western">
              <surname>Rauz</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Ting</surname>
              <given-names>DSW</given-names>
            </name>
            <name name-style="western">
              <surname>Ting</surname>
              <given-names>DSJ</given-names>
            </name>
          </person-group>
          <article-title>Large language models approach expert-level clinical knowledge and reasoning in ophthalmology: a head-to-head cross-sectional study</article-title>
          <source>PLOS Digit Health</source>
          <year>2024</year>
          <volume>3</volume>
          <issue>4</issue>
          <fpage>e0000341</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/38630683"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pdig.0000341</pub-id>
          <pub-id pub-id-type="medline">38630683</pub-id>
          <pub-id pub-id-type="pii">PDIG-D-23-00293</pub-id>
          <pub-id pub-id-type="pmcid">PMC11023493</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Heacock</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Elias</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Hentel</surname>
              <given-names>KD</given-names>
            </name>
            <name name-style="western">
              <surname>Reig</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Shih</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Moy</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT and other large language models are double-edged swords</article-title>
          <source>Radiology</source>
          <year>2023</year>
          <volume>307</volume>
          <issue>2</issue>
          <fpage>e230163</fpage>
          <pub-id pub-id-type="doi">10.1148/radiol.230163</pub-id>
          <pub-id pub-id-type="medline">36700838</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lecler</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Duron</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Soyer</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Revolutionizing radiology with GPT-based models: current applications, future possibilities and limitations of ChatGPT</article-title>
          <source>Diagn Interv Imaging</source>
          <year>2023</year>
          <volume>104</volume>
          <issue>6</issue>
          <fpage>269</fpage>
          <lpage>274</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S2211-5684(23)00027-X"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.diii.2023.02.003</pub-id>
          <pub-id pub-id-type="medline">36858933</pub-id>
          <pub-id pub-id-type="pii">S2211-5684(23)00027-X</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hadi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Qureshi</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Shah</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Irfan</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Zafar</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Shaikh</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>A survey on large language models: applications, challenges, limitations, and practical usage</article-title>
          <source>arXiv. Preprint posted online on Jul 19, 2023</source>
          <year>2023</year>
          <comment>Unreviewed arXiv preprint (arXiv:2307.10169); not formally peer-reviewed or published<ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2307.10169"/></comment>
          <pub-id pub-id-type="doi">10.36227/techrxiv.23589741.v1</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <collab>Shanghai Artificial Intelligence Laboratory</collab>
            <collab>China Software Testing Center (CSTC)</collab>
          </person-group>
          <article-title>Analysis Report on the Development of Large Models in the Healthcare Field (2024)</article-title>
          <source>WeChat Official Account (Shanghai Artificial Intelligence Laboratory)</source>
          <year>2024</year>
          <publisher-name>Shanghai Artificial Intelligence Laboratory</publisher-name>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mp.weixin.qq.com/s/XCd5FiW2DacnfYXn1XdkSQ">https://mp.weixin.qq.com/s/XCd5FiW2DacnfYXn1XdkSQ</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Deng</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>T</given-names>
            </name>
            <collab>Yangzhang</collab>
            <name name-style="western">
              <surname>Zhai</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Tao</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Luo</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Evaluation of large language models in breast cancer clinical scenarios: a comparative analysis based on ChatGPT-3.5, ChatGPT-4.0, and Claude2</article-title>
          <source>Int J Surg</source>
          <year>2024</year>
          <volume>110</volume>
          <issue>4</issue>
          <fpage>1941</fpage>
          <lpage>1950</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/38668655"/>
          </comment>
          <pub-id pub-id-type="doi">10.1097/JS9.0000000000001066</pub-id>
          <pub-id pub-id-type="medline">38668655</pub-id>
          <pub-id pub-id-type="pii">01279778-202404000-00008</pub-id>
          <pub-id pub-id-type="pmcid">PMC11019981</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jin</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Hua</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Meng</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>The impact of reasoning step length on large language models</article-title>
          <source>arXiv. Preprint posted online on Jan 10, 2024</source>
          <year>2024</year>
          <pub-id pub-id-type="doi">10.48550/arXiv.2401.04925</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gou</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Shao</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Gong</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Duan</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>CRITIC: large language models can self-correct with tool-interactive critiquing</article-title>
          <source>Proceedings of the International Conference on Learning Representations (ICLR)</source>
          <year>2024</year>
          <conf-name>12th International Conference on Learning Representations (ICLR) 2024</conf-name>
          <conf-date>May 7–11, 2024</conf-date>
          <conf-loc>Vienna, Austria</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://openreview.net/forum?id=Sx038qxjek"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rao</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Pang</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kamineni</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lie</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Prasad</surname>
              <given-names>AK</given-names>
            </name>
            <name name-style="western">
              <surname>Landman</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Dreyer</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Succi</surname>
              <given-names>MD</given-names>
            </name>
          </person-group>
          <article-title>Assessing the utility of ChatGPT throughout the entire clinical workflow: development and usability study</article-title>
          <source>J Med Internet Res</source>
          <year>2023</year>
          <volume>25</volume>
          <fpage>e48659</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2023/1/e48659/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/48659</pub-id>
          <pub-id pub-id-type="medline">37606976</pub-id>
          <pub-id pub-id-type="pii">v25i1e48659</pub-id>
          <pub-id pub-id-type="pmcid">PMC10481210</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Wright</surname>
              <given-names>AP</given-names>
            </name>
            <name name-style="western">
              <surname>Patterson</surname>
              <given-names>BL</given-names>
            </name>
            <name name-style="western">
              <surname>Wanderer</surname>
              <given-names>JP</given-names>
            </name>
            <name name-style="western">
              <surname>Turer</surname>
              <given-names>RW</given-names>
            </name>
            <name name-style="western">
              <surname>Nelson</surname>
              <given-names>SD</given-names>
            </name>
            <name name-style="western">
              <surname>McCoy</surname>
              <given-names>AB</given-names>
            </name>
            <name name-style="western">
              <surname>Sittig</surname>
              <given-names>DF</given-names>
            </name>
            <name name-style="western">
              <surname>Wright</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Using AI-generated suggestions from ChatGPT to optimize clinical decision support</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2023</year>
          <volume>30</volume>
          <issue>7</issue>
          <fpage>1237</fpage>
          <lpage>1245</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37087108"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamia/ocad072</pub-id>
          <pub-id pub-id-type="medline">37087108</pub-id>
          <pub-id pub-id-type="pii">7136722</pub-id>
          <pub-id pub-id-type="pmcid">PMC10280357</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Simon</surname>
              <given-names>FM</given-names>
            </name>
            <name name-style="western">
              <surname>Altay</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Mercier</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Misinformation reloaded? Fears about the impact of generative AI on misinformation are overblown</article-title>
          <source>HKS Misinfo Review</source>
          <year>2023</year>
          <day>5</day>
          <volume>4</volume>
          <issue>5</issue>
          <fpage>N/A</fpage>
          <lpage>N/A</lpage>
          <pub-id pub-id-type="doi">10.37016/mr-2023-054</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Augenstein</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Baldwin</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Cha</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Chakraborty</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Ciampaglia</surname>
              <given-names>GL</given-names>
            </name>
            <name name-style="western">
              <surname>Corney</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>DiResta</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Ferrara</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Hale</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Halevy</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Hovy</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Ji</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Menczer</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Miguez</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Nakov</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Scheufele</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Sharma</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Zagni</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Factuality challenges in the era of large language models and opportunities for fact-checking</article-title>
          <source>Nat Mach Intell</source>
          <year>2024</year>
          <volume>6</volume>
          <issue>8</issue>
          <fpage>852</fpage>
          <lpage>863</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.nature.com/articles/s42256-024-00881-z"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s42256-024-00881-z</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Haltaufderheide</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ranisch</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>The ethics of ChatGPT in medicine and healthcare: a systematic review on large language models (LLMs)</article-title>
          <source>NPJ Digit Med</source>
          <year>2024</year>
          <volume>7</volume>
          <issue>1</issue>
          <fpage>183</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41746-024-01157-x"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41746-024-01157-x</pub-id>
          <pub-id pub-id-type="medline">38977771</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41746-024-01157-x</pub-id>
          <pub-id pub-id-type="pmcid">PMC11231310</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bernal</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Mazo</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Transparency of artificial intelligence in healthcare: insights from professionals in computing and healthcare worldwide</article-title>
          <source>Appl Sci</source>
          <year>2022</year>
          <month>10</month>
          <day>11</day>
          <volume>12</volume>
          <issue>20</issue>
          <fpage>10228</fpage>
          <pub-id pub-id-type="doi">10.3390/app122010228</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mello</surname>
              <given-names>MM</given-names>
            </name>
            <name name-style="western">
              <surname>Guha</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT and physicians' malpractice risk</article-title>
          <source>JAMA Health Forum</source>
          <year>2023</year>
          <volume>4</volume>
          <issue>5</issue>
          <fpage>e231938</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://jamanetwork.com/article.aspx?doi=10.1001/jamahealthforum.2023.1938"/>
          </comment>
          <pub-id pub-id-type="doi">10.1001/jamahealthforum.2023.1938</pub-id>
          <pub-id pub-id-type="medline">37200013</pub-id>
          <pub-id pub-id-type="pii">2805334</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kahng</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Tenney</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Pushkarna</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Wexler</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Reif</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Kallaracka</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Terry</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>LLM comparator: visual analytics for side-by-side evaluation of large language models</article-title>
          <source>Extended Abstracts of the CHI Conference on Human Factors in Computing Systems (CHI EA '24)</source>
          <year>2024</year>
          <conf-name>Extended Abstracts of the CHI Conference on Human Factors in Computing Systems (CHI EA '24)</conf-name>
          <conf-date>May 11-16, 2024</conf-date>
          <conf-loc>Honolulu, HI</conf-loc>
          <fpage>1</fpage>
          <lpage>7</lpage>
          <pub-id pub-id-type="doi">10.1145/3613905.3650755</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ferdaus</surname>
              <given-names>MM</given-names>
            </name>
            <name name-style="western">
              <surname>Abdelguerfi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ioup</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Niles</surname>
              <given-names>KN</given-names>
            </name>
            <name name-style="western">
              <surname>Pathak</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Sloan</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Towards trustworthy AI: a review of ethical and robust large language models</article-title>
          <source>arXiv. Preprint posted online on Jun 1, 2024</source>
          <year>2024</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2407.13934"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bouhouita-Guermech</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Haidar</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Scoping review shows the dynamics and complexities inherent to the notion of "Responsibility" in artificial intelligence within the healthcare context</article-title>
          <source>Asian Bioeth Rev</source>
          <year>2024</year>
          <volume>16</volume>
          <issue>3</issue>
          <fpage>315</fpage>
          <lpage>344</lpage>
          <pub-id pub-id-type="doi">10.1007/s41649-024-00292-7</pub-id>
          <pub-id pub-id-type="medline">39022380</pub-id>
          <pub-id pub-id-type="pii">292</pub-id>
          <pub-id pub-id-type="pmcid">PMC11250714</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
