<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id><journal-id journal-id-type="publisher-id">medinform</journal-id><journal-id journal-id-type="index">7</journal-id><journal-title>JMIR Medical Informatics</journal-title><abbrev-journal-title>JMIR Med Inform</abbrev-journal-title><issn pub-type="epub">2291-9694</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v13i1e66917</article-id><article-id pub-id-type="doi">10.2196/66917</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Benchmarking the Confidence of Large Language Models in Answering Clinical Questions: Cross-Sectional Evaluation Study</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes" equal-contrib="yes"><name name-style="western"><surname>Omar</surname><given-names>Mahmud</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Agbareia</surname><given-names>Reem</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Glicksberg</surname><given-names>Benjamin S</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name 
name-style="western"><surname>Nadkarni</surname><given-names>Girish N</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Klang</surname><given-names>Eyal</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib></contrib-group><aff id="aff1"><institution>Division of Data-Driven and Digital Medicine (D3M), Department of Medicine, Icahn School of Medicine at Mount Sinai</institution><addr-line>Gustave L. Levy Place New York</addr-line><addr-line>New York</addr-line><addr-line>NY</addr-line><country>United States</country></aff><aff id="aff2"><institution>Ophthalmology Department, Hadassah Medical Center</institution><addr-line>Jerusalem</addr-line><country>Israel</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Castonguay</surname><given-names>Alexandre</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Sharma</surname><given-names>Deepika</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Arasteh</surname><given-names>Soroosh Tayebi</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Mahmud Omar, MD, Division of Data-Driven and Digital Medicine (D3M), Department of Medicine, Icahn School of Medicine at Mount Sinai, Gustave L. 
Levy Place New York, New York, NY, 10029, United States, 1 212 241 6500; <email>mahmudomar70@gmail.com</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>these authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>16</day><month>5</month><year>2025</year></pub-date><volume>13</volume><elocation-id>e66917</elocation-id><history><date date-type="received"><day>26</day><month>09</month><year>2024</year></date><date date-type="rev-recd"><day>31</day><month>01</month><year>2025</year></date><date date-type="accepted"><day>31</day><month>01</month><year>2025</year></date></history><copyright-statement>&#x00A9; Mahmud Omar, Reem Agbareia, Benjamin S Glicksberg, Girish N Nadkarni, Eyal Klang. Originally published in JMIR Medical Informatics (<ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org">https://medinform.jmir.org</ext-link>), 16.5.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org/">https://medinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://medinform.jmir.org/2025/1/e66917"/><abstract><sec><title>Background</title><p>The capabilities of large language models (LLMs) to self-assess their own confidence in answering questions within the biomedical realm remain underexplored.</p></sec><sec><title>Objective</title><p>This study evaluates the confidence levels of 12 LLMs across 5 medical specialties to assess LLMs&#x2019; ability to accurately judge their own responses.</p></sec><sec sec-type="methods"><title>Methods</title><p>We used 1965 multiple-choice questions that assessed clinical knowledge in the following areas: internal medicine, obstetrics and gynecology, psychiatry, pediatrics, and general surgery. Models were prompted to provide answers and to also provide their confidence for the correct answers (score: range 0%&#x2010;100%). We calculated the correlation between each model&#x2019;s mean confidence score for correct answers and the overall accuracy of each model across all questions. The confidence scores for correct and incorrect answers were also analyzed to determine the mean difference in confidence, using 2-sample, 2-tailed <italic>t</italic> tests.</p></sec><sec sec-type="results"><title>Results</title><p>The correlation between the mean confidence scores for correct answers and model accuracy was inverse and statistically significant (<italic>r</italic>=&#x2212;0.40; <italic>P</italic>=.001), indicating that worse-performing models exhibited paradoxically higher confidence. 
For instance, a top-performing model&#x2014;GPT-4o&#x2014;had a mean accuracy of 74% (SD 9.4%), with a mean confidence of 63% (SD 8.3%), whereas a low-performing model&#x2014;Qwen2-7B&#x2014;showed a mean accuracy of 46% (SD 10.5%) but a mean confidence of 76% (SD 11.7%). The mean difference in confidence between correct and incorrect responses was low for all models, ranging from 0.6% to 5.4%, with GPT-4o having the highest mean difference (5.4%, SD 2.3%; <italic>P</italic>=.003).</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Better-performing LLMs show more aligned overall confidence levels. However, even the most accurate models still show minimal variation in confidence between right and wrong answers. This may limit their safe use in clinical settings. Addressing overconfidence could involve refining calibration methods, performing domain-specific fine-tuning, and involving human oversight when decisions carry high risks. Further research is needed to improve these strategies before broader clinical adoption of LLMs.</p></sec></abstract><kwd-group><kwd>safe AI</kwd><kwd>artificial intelligence</kwd><kwd>AI</kwd><kwd>algorithm</kwd><kwd>large language model</kwd><kwd>LLM</kwd><kwd>natural language processing</kwd><kwd>NLP</kwd><kwd>deep learning</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>With their capacity to understand and generate human-like text, large language models (LLMs) are poised to support health care professionals in complex clinical decisions [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref3">3</xref>]. 
A wide array of LLMs is now accessible, including open-source models, offering solutions that cater to both the public and medical professionals [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref4">4</xref>].</p><p>The efficacy of these models has been demonstrated in a variety of tasks, albeit with some limitations [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref6">6</xref>]. For instance, LLMs, such as GPT, have shown promise in providing diagnostic assistance and answering medical queries [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref7">7</xref>-<xref ref-type="bibr" rid="ref9">9</xref>]. Katz et al [<xref ref-type="bibr" rid="ref10">10</xref>] demonstrated that GPT-4 not only improved clinically when compared to its predecessor, GPT-3.5, but also matched physician performance in certain areas. However, there is evidence of hallucinations and inaccuracies in model outputs, which could lead to harm in clinical decision-making [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref12">12</xref>]. Specifically, LLMs have occasionally generated completely fabricated evidence (eg, information and references) and have presented such evidence as factual [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref12">12</xref>].</p><p>One way of building confidence in applying models within health care is the use of explainable artificial intelligence (AI) [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref14">14</xref>]. However, easily explainable outputs are difficult to evaluate due to the complexity of how LLMs process and output data [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>]. Recent work revealed that these models often exhibit high confidence even when presenting incorrect information [<xref ref-type="bibr" rid="ref17">17</xref>]. 
This raises questions about the underlying mechanisms that prompt an LLM to label certain statements as &#x201C;more factual.&#x201D; For example, one possible explanation could be that data-rich or frequently discussed topics in training sets may be perceived as more certain [<xref ref-type="bibr" rid="ref18">18</xref>], even if this does not translate into clinical accuracy. Additionally, retrieval-augmented generation (RAG) has been proposed to ground LLM outputs in external data, which potentially mitigates hallucinations [<xref ref-type="bibr" rid="ref19">19</xref>]. Nevertheless, these approaches do not fully resolve whether models can reliably judge their own correctness. Accurate and well-calibrated confidence scores may be vital for establishing trust in these systems, as such scores can alert users to approach certain responses with caution. If a model consistently shows undue confidence in wrong answers, it poses a subtle but potentially dangerous form of hallucination. Clinicians might adopt decisions based on erroneous advice that is delivered with overt certainty. By investigating how these models generate and express their confidence, we aimed to illuminate whether LLMs can reliably self-assess correctness.</p><p>The goal of this study was to benchmark LLMs (both proprietary LLMs, like GPT-4o and Claude 3.5 Sonnet, and open-source LLMs, like Qwen) in terms of accuracy and associated confidence in answering clinical questions. 
Our aim was to determine if these models can accurately judge when to be confident in their responses and, in doing so, allow for better explainability in their application.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Study Design and Data Source</title><p>This study used a public compiled dataset from a previous study by Katz et al [<xref ref-type="bibr" rid="ref10">10</xref>], which includes 655 questions for the following five medical specialties: internal medicine, obstetrics and gynecology (OBGYN), psychiatry, pediatrics, and general surgery. These questions were sourced from official 2023 licensing examinations for each field and were crafted from internationally recognized textbooks and guidelines. This dataset serves as a standardized framework for assessment [<xref ref-type="bibr" rid="ref20">20</xref>-<xref ref-type="bibr" rid="ref24">24</xref>].</p><p>To enhance benchmarking reliability, each original question was rephrased twice by using the GPT-4 application programming interface (API) in Python (Python Software Foundation), yielding 1965 questions (we include the full prompt in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). The prompts were carefully designed to modify only the writing style, without altering any clinical details, such as medical terms, laboratory values, or answer choices [<xref ref-type="bibr" rid="ref25">25</xref>]. This approach aimed to preserve all clinical details, ensuring that rephrased questions stayed faithful to the original intent and information. To confirm this, 2 board-certified physicians separately reviewed a 20% random sample of questions from each specialty. They compared the rephrased and original questions side by side, focusing on consistency in medical terminology, laboratory values, and answer choices. 
Both reviewers concluded that the paraphrased items remained unchanged in terms of clinical meaning and required no further edits, thereby confirming overall integrity and accuracy.</p></sec><sec id="s2-2"><title>Model Setup and Configuration</title><p>The LLMs used in this study were prompted (using 1 structured prompt) to return the correct answer, along with a confidence score for each choice (&#x201C;A,&#x201D; &#x201C;B,&#x201D; &#x201C;C,&#x201D; and &#x201C;D&#x201D;), in JSON format. These confidence scores were expressed as percentages between 0% and 100% for each option, resulting in a total confidence score of 100% for all options combined. The open access models were executed by using API codes in a dedicated server with 4 H100 80-GB graphics processing units; the corresponding codebase is accessible on GitHub for the original database by Katz et al [<xref ref-type="bibr" rid="ref10">10</xref>], and we provide the full prompts, which can be used locally, in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. We used Python 3.10 for data analyses. The commercial models were used via the corresponding companies&#x2019; API interfaces. We used several Python libraries to facilitate data processing, model interaction, and analysis&#x2014;NumPy 1.26.4, Pandas 2.1.4, Scikit-Learn 1.3.0, Hugging Face&#x2019;s Transformers 4.37.2, and torch 2.2.2+cu121&#x2014;as well as JSON module 2.0.9. We used the default hyperparameters for each model to reflect typical user settings and provide a balanced baseline [<xref ref-type="bibr" rid="ref26">26</xref>]. For the open access models, we used the &#x201C;instruct&#x201D; versions, which perform better on zero-shot questioning.</p></sec><sec id="s2-3"><title>Benchmarked LLMs</title><p>We selected 12 LLMs that varied in terms of size, architectures, and intended domains (<xref ref-type="fig" rid="figure1">Figure 1</xref>). 
This set included established &#x201C;household&#x201D; models and newly introduced or domain-focused alternatives, ensuring diverse coverage (<xref ref-type="fig" rid="figure1">Figure 1</xref>). The benchmarked models are shown in Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>A flowchart representing the evaluation methodology. The 655 questions were sourced from a study by Katz et al [<xref ref-type="bibr" rid="ref10">10</xref>]. MCQ: multiple-choice question.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e66917_fig01.png"/></fig></sec><sec id="s2-4"><title>Statistical Analysis</title><p>The Pearson correlation coefficient was used to correlate models&#x2019; mean confidence scores for correct answers and accuracies across models and medical fields. Chi-square tests assessed overall performance differences within each field, using proportions of correct responses. Post hoc pairwise comparisons with Bonferroni correction identified specific intermodel differences. Confidence levels were compared between correct and incorrect responses for each model, using 2-sample, 2-tailed <italic>t</italic> tests. Mean confidence scores were calculated for higher-tier and lower-tier models, as well as across all models. Performance consistency was evaluated by comparing confidence gaps between correct and incorrect responses. All statistical tests used a significance level of &#x03B1;=.05. 
Analyses were performed using R version 4.1.2 (R Foundation for Statistical Computing).</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Confidence Analysis</title><p><xref ref-type="table" rid="table1">Table 1</xref> summarizes accuracies and confidence levels across the models, and Table S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> presents the data across all inspected fields and all models. An inverse correlation between the mean confidence scores for correct answers and the overall accuracy of the models is demonstrated (<italic>r</italic>=&#x2013;0.40; <italic>P</italic>=.001); better-performing models generally showed lower confidence.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Accuracies and confidence levels across the models.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Accuracy, %</td><td align="left" valign="bottom">Total confidence, %</td><td align="left" valign="bottom">Confidence for correct answer, %</td><td align="left" valign="bottom">Confidence for incorrect answer, %</td></tr></thead><tbody><tr><td align="left" valign="top">Claude 3.5 Sonnet</td><td align="left" valign="top">74</td><td align="left" valign="top">69.7</td><td align="left" valign="top">70.5</td><td align="left" valign="top">67.4</td></tr><tr><td align="left" valign="top">GPT-4o</td><td align="left" valign="top">73.8</td><td align="left" valign="top">63</td><td align="left" valign="top">64.4</td><td align="left" valign="top">59</td></tr><tr><td align="left" valign="top">Claude 3 Opus</td><td align="left" valign="top">71.7</td><td align="left" valign="top">68.5</td><td align="left" valign="top">68.9</td><td align="left" valign="top">67.3</td></tr><tr><td align="left" valign="top">GPT-4</td><td align="left" valign="top">66</td><td align="left" valign="top">84.1</td><td 
align="left" valign="top">84.5</td><td align="left" valign="top">83.3</td></tr><tr><td align="left" valign="top">Llama-3-70B</td><td align="left" valign="top">63.4</td><td align="left" valign="top">57.3</td><td align="left" valign="top">59.5</td><td align="left" valign="top">53.6</td></tr><tr><td align="left" valign="top">Llama OpenBio</td><td align="left" valign="top">59.2</td><td align="left" valign="top">77.9</td><td align="left" valign="top">77.7</td><td align="left" valign="top">78.1</td></tr><tr><td align="left" valign="top">Gemini</td><td align="left" valign="top">59.1</td><td align="left" valign="top">86.5</td><td align="left" valign="top">87.2</td><td align="left" valign="top">85.5</td></tr><tr><td align="left" valign="top">Qwen2-72B</td><td align="left" valign="top">57.8</td><td align="left" valign="top">57.7</td><td align="left" valign="top">58.6</td><td align="left" valign="top">56.5</td></tr><tr><td align="left" valign="top">Mixtral-8&#x00D7;7B</td><td align="left" valign="top">50.6</td><td align="left" valign="top">84.3</td><td align="left" valign="top">85.5</td><td align="left" valign="top">83</td></tr><tr><td align="left" valign="top">GPT-3.5</td><td align="left" valign="top">49</td><td align="left" valign="top">82.3</td><td align="left" valign="top">81.6</td><td align="left" valign="top">82.9</td></tr><tr><td align="left" valign="top">Llama-3-8B</td><td align="left" valign="top">48.4</td><td align="left" valign="top">80</td><td align="left" valign="top">79.7</td><td align="left" valign="top">80.3</td></tr><tr><td align="left" valign="top">Qwen2-7B</td><td align="left" valign="top">46</td><td align="left" valign="top">75.5</td><td align="left" valign="top">74.4</td><td align="left" valign="top">76.4</td></tr></tbody></table></table-wrap><p>The mean confidence score for all 12 models was 76.1% when they were correct and 74.4% when they were incorrect. 
The 6 top-performing models showed a mean confidence score of 72.5% when they were correct and a mean confidence score of 69.4% when incorrect, while the 6 lowest-performing models displayed 79.6% confidence when they were correct and 79.5% confidence when they were incorrect (<xref ref-type="table" rid="table2">Table 2</xref>).</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Large language models&#x2019; mean confidence scores for correct and incorrect answers.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Confidence when incorrect (%), mean (SD)</td><td align="left" valign="bottom">Confidence when correct (%), mean (SD)</td><td align="left" valign="bottom"><italic>P</italic> value</td></tr></thead><tbody><tr><td align="left" valign="top">GPT-4o</td><td align="left" valign="top">58.99 (14.31)</td><td align="left" valign="top">64.38 (16.11)</td><td align="left" valign="top">.006</td></tr><tr><td align="left" valign="top">Llama-3-70B</td><td align="left" valign="top">53.59 (22.38)</td><td align="left" valign="top">59.50 (23.54)</td><td align="left" valign="top">.006</td></tr><tr><td align="left" valign="top">Claude 3.5 Sonnet</td><td align="left" valign="top">67.37 (9.08)</td><td align="left" valign="top">70.52 (11.07)</td><td align="left" valign="top">.003</td></tr><tr><td align="left" valign="top">Gemini</td><td align="left" valign="top">85.55 (16.23)</td><td align="left" valign="top">87.17 (16.58)</td><td align="left" valign="top">.35</td></tr><tr><td align="left" valign="top">Claude 3 Opus</td><td align="left" valign="top">67.32 (13.06)</td><td align="left" valign="top">68.90 (15.65)</td><td align="left" valign="top">.61</td></tr><tr><td align="left" valign="top">GPT-4</td><td align="left" valign="top">83.34 (23.30)</td><td align="left" valign="top">84.52 (22.43)</td><td align="left" valign="top">.07</td></tr><tr><td align="left" 
valign="top">Qwen2-72B</td><td align="left" valign="top">56.49 (18.55)</td><td align="left" valign="top">58.59 (20.03)</td><td align="left" valign="top">.004</td></tr><tr><td align="left" valign="top">Qwen2-7B</td><td align="left" valign="top">76.37 (17.11)</td><td align="left" valign="top">74.45 (20.30)</td><td align="left" valign="top">.01</td></tr><tr><td align="left" valign="top">Mixtral-8&#x00D7;7B</td><td align="left" valign="top">82.99 (16.52)</td><td align="left" valign="top">85.49 (14.62)</td><td align="left" valign="top">.04</td></tr><tr><td align="left" valign="top">Llama-3-8B</td><td align="left" valign="top">80.25 (17.40)</td><td align="left" valign="top">79.67 (21.59)</td><td align="left" valign="top">.31</td></tr><tr><td align="left" valign="top">Llama OpenBio</td><td align="left" valign="top">78.14 (27.59)</td><td align="left" valign="top">77.73 (28.78)</td><td align="left" valign="top">.83</td></tr><tr><td align="left" valign="top">GPT-3.5</td><td align="left" valign="top">82.85 (27.17)</td><td align="left" valign="top">81.63 (28.66)</td><td align="left" valign="top">.81</td></tr></tbody></table></table-wrap><p>Four models (GPT-4o, Llama-3-70B, Claude 3.5 Sonnet, and Qwen2-72B) demonstrated significantly higher confidence when they were correct (all <italic>P</italic> values were &#x003C;.01) across the different fields and subsets. Gemini exhibited the highest overall confidence levels (when incorrect: mean 85.6%, SD 16.2%; when correct: mean 87.2%, SD 16.6%). Qwen2-7B was unique in that it displayed higher confidence when incorrect (mean 76.4%, SD 17.1% vs mean 74.5%, SD 20.3% when correct; <italic>P</italic>=.01).</p><p>GPT-3.5 and Llama-OpenBio-70B revealed minimal differences in confidence between correct and incorrect answers (<italic>P</italic>=.80). 
The largest confidence gap was observed in GPT-4o (5.4%, SD 2.3%; <italic>P</italic>=.003), while Llama-3-8B had the smallest gap (0.6%; <xref ref-type="fig" rid="figure2">Figure 2</xref>).</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Large language models&#x2019; confidence results for correct and incorrect answers. The left graph displays the average confidence and 95% CIs for each model, categorized by correct answers (green) and incorrect answers (red). The right graph shows the differences in average confidence for each model, where green indicates higher confidence in correct answers, and red indicates higher confidence in incorrect answers.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e66917_fig02.png"/></fig></sec><sec id="s3-2"><title>Models&#x2019; Performances Across Fields</title><p>Significant differences were seen in model performance across all 5 medical specialties (at the <italic>P</italic>&#x003C;.01 level). GPT-4o and Claude 3.5 Sonnet consistently outperformed other models. For internal medicine, GPT-4o (accuracy: 70.9%) and Claude 3.5 Sonnet (accuracy: 73.5%) showed no significant difference (<italic>P</italic>&#x003E;.99) but outperformed lower-tier models, such as Qwen2-7B (accuracy: 43.7%; <italic>P</italic>&#x003C;.001). For OBGYN, Claude 3.5 Sonnet (accuracy: 71.0%) significantly outperformed most models, including GPT-4 (accuracy: 54.0%; <italic>P</italic>&#x003C;.001). For pediatrics, the top 5 models (GPT-4o, Llama-3-70B, Claude 3.5 Sonnet, Claude 3 Opus, and GPT-4) showed no significant differences among themselves (all <italic>P</italic> values were &#x003E;.05) but outperformed lower-tier models. Psychiatry results mirrored this pattern, with GPT-4o (accuracy: 84.4%) and Claude 3.5 Sonnet (accuracy: 82.4%) showing the best performance. 
For surgery, GPT-4o (accuracy: 70.9%) and Claude 3.5 Sonnet (accuracy: 70.5%) again showed no significant difference (<italic>P</italic>&#x003E;.99) but outperformed lower-performing models, such as Qwen2-7B (accuracy: 45.6%; <italic>P</italic>&#x003C;.01; Tables S3 and S4 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><p>In our evaluation, accuracy and confidence were inversely correlated for LLMs. Some lower-complexity models were notably more confident in incorrect answers. Despite GPT-4o showing the best performance, its largest observed gap between confidence scores for correct and incorrect answers was only 5.4%. This indicates that it may be insufficient for reliably guiding clinical choices, although the difference was statistically significant, and the model&#x2019;s confidence levels for correct and incorrect responses were generally high. Consequently, this gap does not provide a meaningful threshold for differentiating safe decision-making from potentially harmful decision-making in real-world practice. These results highlight potential risks in clinical applications, where model confidence, regardless of answer correctness, could lead to misinformed decisions.</p><p>We think that the observed miscalibration between correctness and confidence may pose risks in daily clinical practice if it remains unresolved. Overconfident models may recommend unsafe dosages or overlook key signs in a patient&#x2019;s presentation, especially under the fast-paced pressures of modern practice. This could lead to incorrect prescriptions or treatments. For example, the model might prescribe an incorrect antibiotic for a resistant infection, thereby delaying proper care. In other cases, a model&#x2019;s unwarranted confidence in a wrong triage decision could divert urgent attention from a critical patient. 
Such errors can increase morbidity and may undermine trust in AI-assisted clinical tools.</p><p>A brief comparison across models of various sizes did not reveal a consistent relationship between model size and confidence gaps. For instance, Qwen2-72B showed about a 2% difference in confidence between correct and incorrect responses, while Qwen2-7B exhibited a similarly small difference. This pattern was noted across multiple specialties, suggesting that architecture or domain-specific factors may play a more pivotal role than sheer model size in determining confidence behaviors.</p><p>Katz et al [<xref ref-type="bibr" rid="ref10">10</xref>] reported that GPT-4 outperformed physicians in psychiatry and performed comparably to physicians in general surgery and internal medicine. Our study corroborates GPT-4&#x2019;s strong performance, particularly in psychiatry, where GPT-4o achieved 84.4% accuracy. However, our findings suggest that more cautious interpretation is needed, given the high confidence levels observed for incorrect answers. Xiong et al&#x2019;s [<xref ref-type="bibr" rid="ref17">17</xref>] work on LLM confidence elicitation aligns with our observations of overconfidence. They noted improved calibration and failure prediction as model capability increased, which parallels our finding of better confidence calibration in more complex models.</p><p>If prompted confidence scores are truly driven by a model&#x2019;s internal representations and are not random or uncontextualized outputs, then consistently arbitrary numbers would suggest a disconnect between the model&#x2019;s knowledge state and its confidence estimates. Such misalignment can arise if the model&#x2019;s architecture, training data, or prompting strategies do not calibrate confidence with genuine certainty [<xref ref-type="bibr" rid="ref17">17</xref>]. 
In other words, a system might systematically generate high confidence, regardless of accuracy, if it lacks mechanisms or fine-tuning for self-regulating uncertainty [<xref ref-type="bibr" rid="ref27">27</xref>]. Even larger models sometimes yield small or inconsistent confidence gaps, indicating that domain-specific refinements or improved calibration may be required. Without such refinements, confidence levels may remain weakly tied to actual reasoning processes, meaning that they would not reflect well-grounded internal assessments.</p><p>The implications for clinical practice warrant careful consideration. Although the performance leap of newer models is promising, their inability to accurately self-assess confidence across wrong answers poses risks. Two possible strategies for addressing these challenges can be the use of human-in-the-loop protocols and the implementation of ensemble methods [<xref ref-type="bibr" rid="ref28">28</xref>].</p><p>Human-AI collaboration may offer a balanced approach to leveraging AI strengths while maintaining necessary human oversight in health care [<xref ref-type="bibr" rid="ref29">29</xref>]. Sezgin [<xref ref-type="bibr" rid="ref29">29</xref>] suggested a human-in-the-loop approach for ensuring that AI systems are supervised via human expertise. However, the effective implementation of this approach faces challenges. The careful design of user interfaces is important for preventing automation bias [<xref ref-type="bibr" rid="ref29">29</xref>,<xref ref-type="bibr" rid="ref30">30</xref>]. There are also concerns about the potential erosion of clinical skills as a result of overreliance on AI [<xref ref-type="bibr" rid="ref31">31</xref>].</p><p>Emerging evidence also suggests that some prompt engineering techniques can reduce but not completely eliminate sociodemographic bias in model outputs [<xref ref-type="bibr" rid="ref32">32</xref>]. 
However, studies continue to reveal significant sociodemographic biases in LLMs, as shown by a large-scale study by Omar et al [<xref ref-type="bibr" rid="ref33">33</xref>]. These biases may affect patient prioritization, treatment recommendations, and mental health screening across different groups, potentially driving disparities in care [<xref ref-type="bibr" rid="ref33">33</xref>]. Simply removing demographic variables (eg, gender and race) may also risk overlooking clinically relevant distinctions. In the context of our study, better-calibrated confidence outputs may help to mitigate such biases by allowing models to reliably signal uncertainty, which is especially important for sensitive medical decisions. Nonetheless, the comprehensive evaluation of these strategies requires longitudinal studies that monitor the evolution of biases and large-scale, globally diverse datasets, which can be used to refine mitigation approaches.</p><p>Ensemble methods, which aggregate multiple models, present another possible strategy [<xref ref-type="bibr" rid="ref34">34</xref>]. Mahajan et al [<xref ref-type="bibr" rid="ref35">35</xref>] conducted a review of ensemble learning techniques for disease prediction. They found that stacking&#x2014;an ensemble method that combines multiple classifiers&#x2014;showed the most accurate performance in 19 out of 23 cases. The voting approach was identified as the second-best ensemble method. However, ensemble methods are computationally intensive and may introduce latency in real-time clinical applications [<xref ref-type="bibr" rid="ref36">36</xref>]. In some scenarios, a slight increase in overall accuracy might justify extra processing time, yet in urgent applications (eg, emergency triage), even brief delays can be problematic. Ensemble methods aggregate outputs from multiple models, distributing the &#x201C;confidence load&#x201D; so that individual sources of skewed certainty are less influential. 
However, our findings suggest that many current models show miscalibrated confidence levels. If all component models in an ensemble are prone to the same calibration issues, combining them may amplify rather than correct erroneous certainty.</p><p>Both strategies&#x2014;human-in-the-loop protocols and the implementation of ensemble methods&#x2014;would require extensive clinical trials for validation and the development of model-specific calibration curves for each medical specialty.</p><p>Our study has several limitations. The dataset was limited to 1965 multiple-choice questions for 5 medical specialties; therefore, the dataset may not fully represent the breadth of clinical scenarios. Further, the combination of automatic rephrasing and manual validation could have introduced bias [<xref ref-type="bibr" rid="ref25">25</xref>]. We also used default model hyperparameters, which potentially limited performance optimization. To address these constraints, future work could expand the question sets (eg, by including a broader array of medical domains) and adopt real-world clinical data rather than purely examination-style questions. Additionally, custom hyperparameter tuning or advanced methods, such as RAG and fine-tuning, could be used to further refine model accuracy and confidence calibration [<xref ref-type="bibr" rid="ref37">37</xref>], as the use of default hyperparameters, which may have varied across the evaluated LLMs, could have influenced their reported confidence levels. Finally, investigating computational cost and the time efficiency of deploying these models in clinical workflows would help to clarify practical feasibility.</p><p>In conclusion, better-performing LLMs show more aligned overall confidence levels, yet even the most accurate models still display minimal variation between right and wrong answers. This highlights a limitation in current self-assessment mechanisms and calls for further research. 
Future efforts could include larger and more diverse clinical datasets, domain-specific calibration strategies, and real-world testing to refine confidence estimates. Such work is critical before broader implementation of LLMs in clinical settings.</p></sec></body><back><ack><p>We thank Dr Uriel Katz and the coauthors of the paper <italic>GPT versus Resident Physicians &#x2014; A Benchmark Based on Official Board Scores</italic> [<xref ref-type="bibr" rid="ref10">10</xref>] for sharing the multiple-choice question dataset.</p></ack><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">API</term><def><p>application programming interface</p></def></def-item><def-item><term id="abb3">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb4">OBGYN</term><def><p>obstetrics and gynecology</p></def></def-item><def-item><term id="abb5">RAG</term><def><p>retrieval-augmented generation</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Thirunavukarasu</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Ting</surname><given-names>DSJ</given-names> </name><name name-style="western"><surname>Elangovan</surname><given-names>K</given-names> </name><name name-style="western"><surname>Gutierrez</surname><given-names>L</given-names> </name><name name-style="western"><surname>Tan</surname><given-names>TF</given-names> </name><name name-style="western"><surname>Ting</surname><given-names>DSW</given-names> </name></person-group><article-title>Large language models in medicine</article-title><source>Nat 
Med</source><year>2023</year><month>08</month><volume>29</volume><issue>8</issue><fpage>1930</fpage><lpage>1940</lpage><pub-id pub-id-type="doi">10.1038/s41591-023-02448-8</pub-id><pub-id pub-id-type="medline">37460753</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Clusmann</surname><given-names>J</given-names> </name><name name-style="western"><surname>Kolbinger</surname><given-names>FR</given-names> </name><name name-style="western"><surname>Muti</surname><given-names>HS</given-names> </name><etal/></person-group><article-title>The future landscape of large language models in medicine</article-title><source>Commun Med (Lond)</source><year>2023</year><month>10</month><day>10</day><volume>3</volume><issue>1</issue><fpage>141</fpage><pub-id pub-id-type="doi">10.1038/s43856-023-00370-1</pub-id><pub-id pub-id-type="medline">37816837</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tayebi Arasteh</surname><given-names>S</given-names> </name><name name-style="western"><surname>Han</surname><given-names>T</given-names> </name><name name-style="western"><surname>Lotfinia</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Large language models streamline automated machine learning for clinical studies</article-title><source>Nat Commun</source><year>2024</year><month>02</month><day>21</day><volume>15</volume><issue>1</issue><fpage>1603</fpage><pub-id pub-id-type="doi">10.1038/s41467-024-45879-8</pub-id><pub-id pub-id-type="medline">38383555</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shah</surname><given-names>NH</given-names> </name><name 
name-style="western"><surname>Entwistle</surname><given-names>D</given-names> </name><name name-style="western"><surname>Pfeffer</surname><given-names>MA</given-names> </name></person-group><article-title>Creation and adoption of large language models in medicine</article-title><source>JAMA</source><year>2023</year><month>09</month><day>5</day><volume>330</volume><issue>9</issue><fpage>866</fpage><lpage>869</lpage><pub-id pub-id-type="doi">10.1001/jama.2023.14217</pub-id><pub-id pub-id-type="medline">37548965</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kanjee</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Crowe</surname><given-names>B</given-names> </name><name name-style="western"><surname>Rodman</surname><given-names>A</given-names> </name></person-group><article-title>Accuracy of a generative artificial intelligence model in a complex diagnostic challenge</article-title><source>JAMA</source><year>2023</year><month>07</month><day>3</day><volume>330</volume><issue>1</issue><fpage>78</fpage><lpage>80</lpage><pub-id pub-id-type="doi">10.1001/jama.2023.8288</pub-id><pub-id pub-id-type="medline">37318797</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Singhal</surname><given-names>K</given-names> </name><name name-style="western"><surname>Azizi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Tu</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Large language models encode clinical knowledge</article-title><source>Nature</source><year>2023</year><month>08</month><volume>620</volume><issue>7972</issue><fpage>172</fpage><lpage>180</lpage><pub-id pub-id-type="doi">10.1038/s41586-023-06291-2</pub-id><pub-id 
pub-id-type="medline">37438534</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Johnson</surname><given-names>D</given-names> </name><name name-style="western"><surname>Goodman</surname><given-names>R</given-names> </name><name name-style="western"><surname>Patrinely</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Assessing the accuracy and reliability of AI-generated medical responses: an evaluation of the Chat-GPT model</article-title><source>Res Sq</source><comment>Preprint posted online on  Feb 28, 2023</comment><pub-id pub-id-type="doi">10.21203/rs.3.rs-2566942/v1</pub-id><pub-id pub-id-type="medline">36909565</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Giannos</surname><given-names>P</given-names> </name></person-group><article-title>Evaluating the limits of AI in medical specialisation: ChatGPT&#x2019;s performance on the UK Neurology Specialty Certificate Examination</article-title><source>BMJ Neurol Open</source><year>2023</year><month>06</month><day>15</day><volume>5</volume><issue>1</issue><fpage>e000451</fpage><pub-id pub-id-type="doi">10.1136/bmjno-2023-000451</pub-id><pub-id pub-id-type="medline">37337531</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hoch</surname><given-names>CC</given-names> </name><name name-style="western"><surname>Wollenberg</surname><given-names>B</given-names> </name><name name-style="western"><surname>L&#x00FC;ers</surname><given-names>JC</given-names> </name><etal/></person-group><article-title>ChatGPT&#x2019;s quiz skills in different otolaryngology subspecialties: an analysis of 2576 single-choice and 
multiple-choice board certification preparation questions</article-title><source>Eur Arch Otorhinolaryngol</source><year>2023</year><month>09</month><volume>280</volume><issue>9</issue><fpage>4271</fpage><lpage>4278</lpage><pub-id pub-id-type="doi">10.1007/s00405-023-08051-4</pub-id><pub-id pub-id-type="medline">37285018</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Katz</surname><given-names>U</given-names> </name><name name-style="western"><surname>Cohen</surname><given-names>E</given-names> </name><name name-style="western"><surname>Shachar</surname><given-names>E</given-names> </name><etal/></person-group><article-title>GPT versus resident physicians &#x2014; a benchmark based on official board scores</article-title><source>NEJM AI</source><year>2024</year><month>04</month><day>12</day><volume>1</volume><issue>5</issue><fpage>AIdbp2300192</fpage><pub-id pub-id-type="doi">10.1056/AIdbp2300192</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Omar</surname><given-names>M</given-names> </name><name name-style="western"><surname>Nassar</surname><given-names>S</given-names> </name><name name-style="western"><surname>Hijaze</surname><given-names>K</given-names> </name><name name-style="western"><surname>Glicksberg</surname><given-names>BS</given-names> </name><name name-style="western"><surname>Nadkarni</surname><given-names>GN</given-names> </name><name name-style="western"><surname>Klang</surname><given-names>E</given-names> </name></person-group><article-title>Generating credible referenced medical research: a comparative study of OpenAI&#x2019;s Gpt-4 and Google&#x2019;s Gemini</article-title><source>SSRN</source><comment>Preprint posted online on  Apr 2, 2024</comment><pub-id 
pub-id-type="doi">10.2139/ssrn.4780940</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Azamfirei</surname><given-names>R</given-names> </name><name name-style="western"><surname>Kudchadkar</surname><given-names>SR</given-names> </name><name name-style="western"><surname>Fackler</surname><given-names>J</given-names> </name></person-group><article-title>Large language models and the perils of their hallucinations</article-title><source>Crit Care</source><year>2023</year><month>03</month><day>21</day><volume>27</volume><issue>1</issue><fpage>120</fpage><pub-id pub-id-type="doi">10.1186/s13054-023-04393-x</pub-id><pub-id pub-id-type="medline">36945051</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>G</given-names> </name><name name-style="western"><surname>Ye</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Xia</surname><given-names>J</given-names> </name></person-group><article-title>Unbox the black-box for the medical explainable AI via multi-modal and multi-centre data fusion: a mini-review, two showcases and beyond</article-title><source>Inf Fusion</source><year>2022</year><month>01</month><volume>77</volume><fpage>29</fpage><lpage>52</lpage><pub-id pub-id-type="doi">10.1016/j.inffus.2021.07.016</pub-id><pub-id pub-id-type="medline">34980946</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Soroush</surname><given-names>A</given-names> </name><name name-style="western"><surname>Glicksberg</surname><given-names>BS</given-names> </name><name name-style="western"><surname>Zimlichman</surname><given-names>E</given-names> 
</name><etal/></person-group><article-title>Large language models are poor medical coders &#x2014; benchmarking of medical code querying</article-title><source>NEJM AI</source><year>2024</year><month>04</month><day>19</day><volume>1</volume><issue>5</issue><fpage>AIdbp2300040</fpage><pub-id pub-id-type="doi">10.1056/AIdbp2300040</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Schwartz</surname><given-names>IS</given-names> </name><name name-style="western"><surname>Link</surname><given-names>KE</given-names> </name><name name-style="western"><surname>Daneshjou</surname><given-names>R</given-names> </name><name name-style="western"><surname>Cort&#x00E9;s-Penfield</surname><given-names>N</given-names> </name></person-group><article-title>Black box warning: large language models and the future of infectious diseases consultation</article-title><source>Clin Infect Dis</source><year>2024</year><month>04</month><day>10</day><volume>78</volume><issue>4</issue><fpage>860</fpage><lpage>866</lpage><pub-id pub-id-type="doi">10.1093/cid/ciad633</pub-id><pub-id pub-id-type="medline">37971399</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Poon</surname><given-names>AIF</given-names> </name><name name-style="western"><surname>Sung</surname><given-names>JJY</given-names> </name></person-group><article-title>Opening the black box of AI-medicine</article-title><source>J Gastroenterol Hepatol</source><year>2021</year><month>03</month><volume>36</volume><issue>3</issue><fpage>581</fpage><lpage>584</lpage><pub-id pub-id-type="doi">10.1111/jgh.15384</pub-id><pub-id pub-id-type="medline">33709609</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="preprint"><person-group 
person-group-type="author"><name name-style="western"><surname>Xiong</surname><given-names>M</given-names> </name><name name-style="western"><surname>Hu</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Lu</surname><given-names>X</given-names> </name><etal/></person-group><article-title>Can LLMs express their uncertainty? An empirical evaluation of confidence elicitation in LLMs</article-title><source>arXiv</source><comment>Preprint posted online on  Mar 17, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2306.13063</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sarker</surname><given-names>IH</given-names> </name></person-group><article-title>Machine learning: algorithms, real-world applications and research directions</article-title><source>SN Comput Sci</source><year>2021</year><volume>2</volume><issue>3</issue><fpage>160</fpage><pub-id pub-id-type="doi">10.1007/s42979-021-00592-x</pub-id><pub-id pub-id-type="medline">33778771</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Xiong</surname><given-names>G</given-names> </name><name name-style="western"><surname>Jin</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>M</given-names> </name><name name-style="western"><surname>Lu</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>A</given-names> </name></person-group><article-title>Improving retrieval-augmented generation in medicine with iterative follow-up questions</article-title><source>Pac Symp 
Biocomput</source><year>2025</year><volume>30</volume><fpage>199</fpage><lpage>214</lpage><pub-id pub-id-type="medline">39670371</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Townsend</surname><given-names>CM</given-names> </name><name name-style="western"><surname>Beauchamp</surname><given-names>RD</given-names> </name><name name-style="western"><surname>Evers</surname><given-names>BM</given-names> </name><name name-style="western"><surname>Mattox</surname><given-names>KL</given-names> </name></person-group><source>Sabiston Textbook of Surgery: The Biological Basis of Modern Surgical Practice</source><year>2016</year><publisher-name>Elsevier Health Sciences</publisher-name><pub-id pub-id-type="other">9780323401630</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Loscalzo</surname><given-names>J</given-names> </name><name name-style="western"><surname>Fauci</surname><given-names>AS</given-names> </name><name name-style="western"><surname>Kasper</surname><given-names>DL</given-names> </name><name name-style="western"><surname>Hauser</surname><given-names>SL</given-names> </name><name name-style="western"><surname>Longo</surname><given-names>DL</given-names> </name><name name-style="western"><surname>Jameson</surname><given-names>JL</given-names> </name></person-group><source>Harrison&#x2019;s Principles of Internal Medicine</source><year>2022</year><access-date>2025-05-02</access-date><publisher-name>McGraw Hill</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://cir.nii.ac.jp/crid/1130573781693502243">https://cir.nii.ac.jp/crid/1130573781693502243</ext-link></comment></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="book"><person-group 
person-group-type="author"><name name-style="western"><surname>Kliegman</surname><given-names>RM</given-names> </name><name name-style="western"><surname>Behrman</surname><given-names>RE</given-names> </name><name name-style="western"><surname>Jenson</surname><given-names>HB</given-names> </name><name name-style="western"><surname>Stanton</surname><given-names>BMD</given-names> </name></person-group><source>Nelson Textbook of Pediatrics E-Book</source><year>2007</year><publisher-name>Elsevier Health Sciences</publisher-name></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="book"><person-group person-group-type="author"><collab>American Psychiatric Association</collab></person-group><source>Diagnostic and Statistical Manual of Mental Disorders</source><year>2000</year><access-date>2025-05-02</access-date><publisher-name>American Psychiatric Publishing</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://cir.nii.ac.jp/crid/1573950399819987840">https://cir.nii.ac.jp/crid/1573950399819987840</ext-link></comment></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Gabbe</surname><given-names>SG</given-names> </name><name name-style="western"><surname>Niebyl</surname><given-names>JR</given-names> </name><name name-style="western"><surname>Simpson</surname><given-names>JL</given-names> </name><etal/></person-group><source>Obstetrics: Normal and Problem Pregnancies E-Book</source><year>2016</year><publisher-name>Elsevier Health Sciences</publisher-name><pub-id pub-id-type="other">9781455733958</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Soni</surname><given-names>S</given-names> </name><name name-style="western"><surname>Roberts</surname><given-names>K</given-names> 
</name></person-group><article-title>Paraphrasing to improve the performance of electronic health records question answering</article-title><source>AMIA Jt Summits Transl Sci Proc</source><year>2020</year><month>05</month><day>30</day><volume>2020</volume><fpage>626</fpage><lpage>635</lpage><pub-id pub-id-type="medline">32477685</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><collab>OpenAI</collab><name name-style="western"><surname>Achiam</surname><given-names>J</given-names> </name><name name-style="western"><surname>Adler</surname><given-names>S</given-names> </name><etal/></person-group><article-title>GPT-4 technical report</article-title><source>arXiv</source><comment>Preprint posted online on  Mar 4, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2303.08774</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>L</given-names> </name><name name-style="western"><surname>Pan</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Li</surname><given-names>X</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>G</given-names> </name></person-group><article-title>Uncertainty estimation and quantification for llms: a simple supervised approach</article-title><source>arXiv</source><comment>Preprint posted online on  Oct 23, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2404.15993</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Longhurst</surname><given-names>CA</given-names> </name><name name-style="western"><surname>Singh</surname><given-names>K</given-names> </name><name 
name-style="western"><surname>Chopra</surname><given-names>A</given-names> </name><name name-style="western"><surname>Atreja</surname><given-names>A</given-names> </name><name name-style="western"><surname>Brownstein</surname><given-names>JS</given-names> </name></person-group><article-title>A call for artificial intelligence implementation science centers to evaluate clinical effectiveness</article-title><source>NEJM AI</source><year>2024</year><month>07</month><day>10</day><volume>1</volume><issue>8</issue><fpage>AIp2400223</fpage><pub-id pub-id-type="doi">10.1056/AIp2400223</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sezgin</surname><given-names>E</given-names> </name></person-group><article-title>Artificial intelligence in healthcare: complementing, not replacing, doctors and healthcare providers</article-title><source>Digit Health</source><year>2023</year><month>07</month><volume>9</volume><fpage>20552076231186520</fpage><pub-id pub-id-type="doi">10.1177/20552076231186520</pub-id><pub-id pub-id-type="medline">37426593</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Straw</surname><given-names>I</given-names> </name></person-group><article-title>The automation of bias in medical artificial intelligence (AI): decoding the past to create a better future</article-title><source>Artif Intell Med</source><year>2020</year><month>11</month><volume>110</volume><fpage>101965</fpage><pub-id pub-id-type="doi">10.1016/j.artmed.2020.101965</pub-id><pub-id pub-id-type="medline">33250145</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>&#x010C;artolovni</surname><given-names>A</given-names> 
</name><name name-style="western"><surname>Male&#x0161;evi&#x0107;</surname><given-names>A</given-names> </name><name name-style="western"><surname>Poslon</surname><given-names>L</given-names> </name></person-group><article-title>Critical analysis of the AI impact on the patient-physician relationship: a multi-stakeholder qualitative study</article-title><source>Digit Health</source><year>2023</year><month>12</month><day>19</day><volume>9</volume><fpage>20552076231220833</fpage><pub-id pub-id-type="doi">10.1177/20552076231220833</pub-id><pub-id pub-id-type="medline">38130798</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Omar</surname><given-names>M</given-names> </name><name name-style="western"><surname>Sorin</surname><given-names>V</given-names> </name><name name-style="western"><surname>Agbareia</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Evaluating and addressing demographic disparities in medical large language models: a systematic review</article-title><source>medRxiv</source><comment>Preprint posted online on  Sep 9, 2024</comment><pub-id pub-id-type="doi">10.1101/2024.09.09.24313295</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Omar</surname><given-names>M</given-names> </name><name name-style="western"><surname>Soffer</surname><given-names>S</given-names> </name><name name-style="western"><surname>Agbareia</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Socio-demographic biases in medical decision-making by large language models: a large-scale multi-model analysis</article-title><source>medRxiv</source><comment>Preprint posted online on  Oct 30, 2024</comment><pub-id 
pub-id-type="doi">10.1101/2024.10.29.24316368</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Li</surname><given-names>M</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>H</given-names> </name><name name-style="western"><surname>Xiao</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Fang</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>R</given-names> </name></person-group><article-title>One LLM is not enough: harnessing the power of ensemble learning for medical question answering</article-title><source>medRxiv</source><comment>Preprint posted online on  Dec 24, 2023</comment><pub-id pub-id-type="doi">10.1101/2023.12.21.23300380</pub-id><pub-id pub-id-type="medline">38196648</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mahajan</surname><given-names>P</given-names> </name><name name-style="western"><surname>Uddin</surname><given-names>S</given-names> </name><name name-style="western"><surname>Hajati</surname><given-names>F</given-names> </name><name name-style="western"><surname>Moni</surname><given-names>MA</given-names> </name></person-group><article-title>Ensemble learning for disease prediction: a review</article-title><source>Healthcare (Basel)</source><year>2023</year><month>06</month><day>20</day><volume>11</volume><issue>12</issue><fpage>1808</fpage><pub-id pub-id-type="doi">10.3390/healthcare11121808</pub-id><pub-id pub-id-type="medline">37372925</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Edeh</surname><given-names>MO</given-names> </name><name name-style="western"><surname>Dalal</surname><given-names>S</given-names> </name><name name-style="western"><surname>Dhaou</surname><given-names>IB</given-names> </name><etal/></person-group><article-title>Artificial intelligence-based ensemble learning model for prediction of hepatitis C disease</article-title><source>Front Public Health</source><year>2022</year><month>04</month><day>27</day><volume>10</volume><fpage>892371</fpage><pub-id pub-id-type="doi">10.3389/fpubh.2022.892371</pub-id><pub-id pub-id-type="medline">35570979</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Glicksberg</surname><given-names>BS</given-names> </name><name name-style="western"><surname>Timsina</surname><given-names>P</given-names> </name><name name-style="western"><surname>Patel</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Evaluating the accuracy of a state-of-the-art large language model for prediction of admissions from the emergency room</article-title><source>J Am Med Inform Assoc</source><year>2024</year><month>09</month><day>1</day><volume>31</volume><issue>9</issue><fpage>1921</fpage><lpage>1928</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocae103</pub-id><pub-id pub-id-type="medline">38771093</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Supplementary materials with further information on the benchmarked large language models, their performance across different fields and specialties, and the prompt used for rephrasing the questions.</p><media xlink:href="medinform_v13i1e66917_app1.docx" xlink:title="DOCX File, 36 KB"/></supplementary-material></app-group></back></article>