<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id><journal-id journal-id-type="publisher-id">medinform</journal-id><journal-id journal-id-type="index">7</journal-id><journal-title>JMIR Medical Informatics</journal-title><abbrev-journal-title>JMIR Med Inform</abbrev-journal-title><issn pub-type="epub">2291-9694</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v14i1e87368</article-id><article-id pub-id-type="doi">10.2196/87368</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Improving Radiology Report Error Detection Using a Multipass Large Language Model: Framework Development and Validation</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Kim</surname><given-names>Songsoo</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Lee</surname><given-names>Seungtae</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Lee</surname><given-names>See Young</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Kim</surname><given-names>Joonho</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" 
rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Kan</surname><given-names>Keechan</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Lee</surname><given-names>Hyunji</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff6">6</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Yoon</surname><given-names>Dukyong</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff7">7</xref><xref ref-type="aff" rid="aff8">8</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Radiology, Seoul National University Hospital</institution><addr-line>Seoul</addr-line><country>Republic of Korea</country></aff><aff id="aff2"><institution>Department of Radiology, Gangnam Severance Hospital</institution><addr-line>Seoul</addr-line><country>Republic of Korea</country></aff><aff id="aff3"><institution>Department of Internal Medicine, Gangnam Severance Hospital</institution><addr-line>Seoul</addr-line><country>Republic of Korea</country></aff><aff id="aff4"><institution>Department of Neurology, Severance Hospital</institution><addr-line>Seoul</addr-line><country>Republic of Korea</country></aff><aff id="aff5"><institution>Department of Surgery, Samsung Medical Center</institution><addr-line>Seoul</addr-line><country>Republic of Korea</country></aff><aff id="aff6"><institution>Department of Obstetrics and Gynecology, Kangbuk Samsung Hospital</institution><addr-line>Seoul</addr-line><country>Republic of Korea</country></aff><aff id="aff7"><institution>Department of Biomedical Systems Informatics, College of Medicine, Yonsei University</institution><addr-line>101-604</addr-line><addr-line>Seoul</addr-line><country>Republic of Korea</country></aff><aff id="aff8"><institution>Institute for Innovation in Digital Healthcare, Severance 
Hospital</institution><addr-line>Seoul</addr-line><country>Republic of Korea</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Benis</surname><given-names>Arriel</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Lei</surname><given-names>Jianbo</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Guo</surname><given-names>Jinyu</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Dukyong Yoon, MD, PhD, Department of Biomedical Systems Informatics, College of Medicine, Yonsei University, 101-604, Seoul, 03687, Republic of Korea, 82 31-5189-8450, 82 31-5189-8450; <email>dukyong.yoon@yonsei.ac.kr</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>4</day><month>6</month><year>2026</year></pub-date><volume>14</volume><elocation-id>e87368</elocation-id><history><date date-type="received"><day>10</day><month>11</month><year>2025</year></date><date date-type="rev-recd"><day>10</day><month>03</month><year>2026</year></date><date date-type="accepted"><day>21</day><month>04</month><year>2026</year></date></history><copyright-statement>&#x00A9; Songsoo Kim, Seungtae Lee, See Young Lee, Joonho Kim, Keechan Kan, Hyunji Lee, Dukyong Yoon. Originally published in JMIR Medical Informatics (<ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org">https://medinform.jmir.org</ext-link>), 4.6.2026. 
</copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org/">https://medinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://medinform.jmir.org/2026/1/e87368"/><abstract><sec><title>Background</title><p>Large language model (LLM) proofreaders for radiology reports generate many false positives (FPs) due to the low prevalence of errors.</p></sec><sec><title>Objective</title><p>This study aimed to determine whether an optimized LLM framework could improve both precision and cost-efficiency without compromising error detection capability.</p></sec><sec sec-type="methods"><title>Methods</title><p>In this retrospective study, 1000 radiology reports (radiography, ultrasonography, computed tomography, and magnetic resonance imaging; 250 each) were sampled from the Medical Information Mart for Intensive Care III database. Two public chest radiography corpora (CheXpert and Open-i) served as external test sets. Three LLM frameworks were evaluated: single-prompt detector (framework 1); report extractor plus single-prompt detector (framework 2); and extractor, detector, and FP verifier (framework 3). Precision for each framework was assessed using positive predictive value (PPV) and detected errors per 1000 reports. 
Overall efficiency was estimated using model inference costs and reviewer labor costs.</p></sec><sec sec-type="results"><title>Results</title><p>PPV increased from 0.063 (95% CI 0.036&#x2010;0.101) in framework 1 to 0.079 (95% CI 0.049&#x2010;0.118) in framework 2 and 0.159 (95% CI 0.090&#x2010;0.252) in framework 3 (<italic>P</italic>&#x003C;.001). Despite improved PPV, detected errors remained stable (detected errors per 1000 reports: 12&#x2010;14). Human review burden decreased from 192 to 88 reports. Framework 3 also reduced model inference costs to US $5.57 per 1000 reports (vs US $9.72 and US $6.85 for frameworks 1 and 2; 42.6% and 18.5% reductions, respectively). External validation confirmed similar improvements. Qualitative analysis revealed that remaining FPs in framework 3 were largely confined to cases requiring deep clinical context (clinically equivalent rephrasing: 53%; unsupported discrepancy assertions: 43%). By eliminating structural FPs (eg, section mismatches and lexical errors: 0%), the framework effectively shifted the quality assurance burden to a smaller set of ambiguous cases, enabling a targeted human-in-the-loop workflow.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>The multipass LLM improved the precision and cost-efficiency of radiology report error detection in real-world, low-error prevalence settings. 
The framework demonstrates the feasibility of synergistic artificial intelligence&#x2013;radiologist collaboration and provides a cost-effective and scalable approach to artificial intelligence&#x2013;assisted quality assurance in both radiological practice and research.</p></sec></abstract><kwd-group><kwd>large language models</kwd><kwd>radiology report</kwd><kwd>quality assurance</kwd><kwd>error detection</kwd><kwd>human-in-the-loop</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Large language models (LLMs) are increasingly being explored as an additional set of eyes for proofreading radiology reports [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. However, when applied to real-world data, this extra &#x201C;eye&#x201D; often results in frequent false alarms. The precision of these models&#x2014;also referred to as positive predictive value (PPV)&#x2014;remains low because, despite &#x201C;good&#x201D; model specificity, the underlying error rate in clinical practice is extremely low. For example, in a setting with a 1% error prevalence, even a highly sensitive model with 90% specificity would still generate approximately 10 false alarms for every true error detected. In one experiment involving 10,000 real reports, GPT-4 achieved a PPV of only 6% despite good specificity, producing roughly 15 false alerts for each true error [<xref ref-type="bibr" rid="ref3">3</xref>]. 
These excessive notifications contribute to alert fatigue among radiologists, prompting them to ignore subsequent warnings, hindering effective human-artificial intelligence (AI) collaboration, and&#x2014;ironically&#x2014;increasing the real-world workload [<xref ref-type="bibr" rid="ref4">4</xref>].</p><p>Although continued advances in LLMs are expected to address these shortcomings, the anticipated gains present a double-edged sword in terms of overall utility [<xref ref-type="bibr" rid="ref5">5</xref>]. Parameter scaling, task-specific fine-tuning, and deployment of multiagent systems [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>] can certainly enhance model performance and clinical efficiency. However, these improvements come at substantial computational costs. Deploying multiagent systems, for example, routinely produces execution traces averaging more than 15,000 lines per session [<xref ref-type="bibr" rid="ref8">8</xref>], while scaling to larger models dramatically increases resource demands&#x2014;a recent study showed that LLaMA-3-70B incurred over 400 times the inference time and cost of a lightweight 3B-parameter model for radiology report structuring [<xref ref-type="bibr" rid="ref9">9</xref>]. Consequently, AI-driven radiology report error detection faces a dual imperative: it must increase precision to reduce human workload while also remaining computationally feasible and cost-effective for routine clinical deployment.</p><p>Despite these limitations, previous studies still benchmark LLMs on error-inflated datasets and rarely explore strategies for improving PPV in low-error, real-world settings [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref11">11</xref>]. Similarly, strategies to improve operational cost-efficiency remain largely unexplored. 
Consequently, achieving clinical viability requires a framework capable of explicitly resolving the inherent trade-off between sensitivity and specificity observed in single-pass models [<xref ref-type="bibr" rid="ref3">3</xref>].</p><p>To address these gaps, we present a multipass LLM framework designed to optimize both precision and efficiency. The pipeline (1) employs a lightweight report extractor to isolate clinical findings from structural noise, (2) applies stepwise reasoning to decouple error detection from verification, thereby mitigating the sensitivity-specificity trade-off, and (3) provides a user interface to facilitate rapid review of the model&#x2019;s structured output by radiologists. A benchmark with 2 nonoptimized baselines was performed to quantify improvements in precision and efficiency.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Ethical Considerations</title><p>This study used only publicly available, deidentified radiology datasets (Medical Information Mart for Intensive Care III [MIMIC-III], CheXpert, and Open-i). Institutional Review Board approval and written informed consent were not required.</p></sec><sec id="s2-2"><title>Dataset Curation</title><p>Radiology reports were retrieved from the MIMIC-III database [<xref ref-type="bibr" rid="ref12">12</xref>]. Using the &#x201C;ISERROR&#x201D; column in the database, which identifies physician-flagged erroneous notes, the study included only those reports that had been confirmed as error-free. To validate robustness across the heterogeneous nature of radiology reports and facilitate performance comparison across modalities, modality-level stratified random sampling was performed to construct a balanced primary test set comprising 1000 reports, with 250 reports each from radiography, ultrasonography, computed tomography (CT), and magnetic resonance imaging (MRI). 
An additional hold-out set of 50 predominantly radiography reports was reserved for prompt tuning and reviewer calibration. To assess the external generalizability of the proposed pipeline, 2 publicly available radiology report datasets&#x2014;CheXpert and Open-i chest X-ray [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref14">14</xref>]&#x2014;were used as external test sets. The characteristics of the final reports across all datasets are summarized in <xref ref-type="table" rid="table1">Table 1</xref>.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Characteristics of MIMIC-III<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup>, CheXpert, and Open-i radiology reports used in this study.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Characteristics</td><td align="left" valign="bottom" colspan="4">MIMIC-III</td><td align="left" valign="bottom">CheXpert (n=300)</td><td align="left" valign="bottom">Open-i (n=300)</td><td align="left" valign="bottom"><italic>P</italic> value<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup></td></tr><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">X-ray (n=250)</td><td align="left" valign="bottom">Ultrasound (n=250)</td><td align="left" valign="bottom">CT<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> (n=250)</td><td align="left" valign="bottom">MRI<sup><xref ref-type="table-fn" rid="table1fn4">d</xref></sup> (n=250)</td><td align="left" valign="bottom"/><td align="left" valign="bottom"/><td align="left" valign="bottom"/></tr></thead><tbody><tr><td align="left" valign="top">Characters, mean (SD)</td><td align="left" valign="top">1206.9 (367.6)</td><td align="left" valign="top">1419.6 (535.3)</td><td align="left" valign="top">2721.7 (1418.7)</td><td align="left" valign="top">2467.4 (1170.4)</td><td align="left" valign="top">525.9 (243.8)</td><td align="left" 
valign="top">334.7 (149.3)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">Word count, mean (SD)</td><td align="left" valign="top">153.7 (53.2)</td><td align="left" valign="top">187.8 (78.2)</td><td align="left" valign="top">374.8 (208.7)</td><td align="left" valign="top">340.1 (170.5)</td><td align="left" valign="top">77.7 (37.1)</td><td align="left" valign="top">46.1 (22.6)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">Sentence count, mean (SD)</td><td align="left" valign="top">28.4 (8)</td><td align="left" valign="top">32 (11.7)</td><td align="left" valign="top">59.6 (29.6)</td><td align="left" valign="top">52 (23.2)</td><td align="left" valign="top">13.9 (5.8)</td><td align="left" valign="top">10 (2.4)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">History section, n (%)</td><td align="left" valign="top">250 (100)</td><td align="left" valign="top">248 (99.2)</td><td align="left" valign="top">250 (100)</td><td align="left" valign="top">250 (100)</td><td align="left" valign="top">240 (80)</td><td align="left" valign="top">300 (100)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">Technique section, n (%)</td><td align="left" valign="top">24 (9.6)</td><td align="left" valign="top">32 (12.8)</td><td align="left" valign="top">216 (86.4)</td><td align="left" valign="top">201 (80.4)</td><td align="left" valign="top">14 (4.7)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">Comparison section, n (%)</td><td align="left" valign="top">76 (30.4)</td><td align="left" valign="top">114 (45.6)</td><td align="left" valign="top">135 (54)</td><td align="left" valign="top">89 (35.6)</td><td align="left" valign="top">284 (94.7)</td><td align="left" valign="top">0 (0)</td><td align="left" 
valign="top">&#x003C;.001</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>MIMIC-III: Medical Information Mart for Intensive Care III.</p></fn><fn id="table1fn2"><p><sup>b</sup><italic>P</italic> values are from Kruskal-Wallis test (continuous variables) and Fisher exact test (categorical variables).</p></fn><fn id="table1fn3"><p><sup>c</sup>CT: computed tomography.</p></fn><fn id="table1fn4"><p><sup>d</sup>MRI: magnetic resonance imaging. </p></fn></table-wrap-foot></table-wrap></sec><sec id="s2-3"><title>Error Definition</title><p>The process of generating radiology reports can be divided into the following two main steps: (1) detecting abnormalities from images and (2) documenting the detected abnormalities [<xref ref-type="bibr" rid="ref3">3</xref>]. Efforts have been made to use natural language processing models, including LLMs, to correct errors occurring in the second step [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref15">15</xref>-<xref ref-type="bibr" rid="ref17">17</xref>]. These second-step errors may result from the misinterpretation of findings or the inclusion of factually inconsistent content in the report text. Following the classification proposed by Kim et al [<xref ref-type="bibr" rid="ref3">3</xref>], errors were categorized into interpretive errors (addition, omission, and substitution) and factual errors (discrepancy in location/numerical measurement). A detailed description of the error types is provided in Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec><sec id="s2-4"><title>Proposed Framework and Experimental Design</title><p>Three LLM pipelines were compared (<xref ref-type="fig" rid="figure1">Figure 1</xref>). In framework 1, the original report was input directly into an advanced LLM, which performed both error detection and false-positive (FP) verification within a single prompt. 
In framework 2, a lightweight LLM first extracted and structured the relevant portion of the radiology report by removing content outside the &#x201C;Findings&#x201D; and &#x201C;Impression&#x201D; sections&#x2014;such as clinical information, technique notes, and headers&#x2014;and seamlessly merging any addenda into these sections. The resulting structured Findings or Impression block was then passed to an advanced LLM, which performed combined error detection and FP verification in a single prompt. Framework 3 retained the preliminary extraction step but divided the downstream reasoning across 2 successive prompts: candidate errors were first enumerated and then reexamined to verify potential FPs.</p><p>Final model responses were structured to include the radiology report, identified errors, and corresponding error reasoning [<xref ref-type="bibr" rid="ref18">18</xref>]. The resulting outputs were streamed to a web-based quality assurance interface, which displayed the flagged report alongside the model&#x2019;s error reasoning, allowing human reviewers to accept or reject each suggestion with a single click (<xref ref-type="fig" rid="figure2">Figure 2</xref>).</p><p>The lightweight LLM used in this experiment was OpenAI&#x2019;s GPT-4.1-nano, selected for its favorable cost-effectiveness, and the advanced LLM was O3, chosen for its superior reasoning performance at the time of the study [<xref ref-type="bibr" rid="ref19">19</xref>]. The 2 models have a 100-fold difference in cost per token. All pipelines were executed on the institution&#x2019;s private Azure OpenAI Service, with each LLM API (application programming interface) call launched within an isolated API session. 
Detailed descriptions of the prompts and parameters are provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Experimental design of large language model (LLM) pipelines for radiology report error detection. In the single-pass framework (A), each report is processed once by an advanced LLM that simultaneously performs error detection and false-positive (FP) verification before reader&#x2019;s review. In the 2-pass framework (B), a lightweight LLM first performs preprocessing, and an advanced LLM subsequently conducts combined detection and verification before reader&#x2019;s review. In the proposed 3-pass framework (C), preprocessing is followed by error detection in a second pass and isolated FP verification in a third pass by an advanced LLM, prior to reader&#x2019;s review.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e87368_fig01.png"/></fig><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>User interface for the multipass large language model (LLM) radiology report error detector. The review screen loads the preprocessed structured output, displaying the &#x201C;Findings&#x201D; and &#x201C;Impression&#x201D; sections in the left panel, while the right panel shows the detected error and the model-provided rationale using the error and error_reason keys. This structured layout enables reviewers to classify each finding as either a true positive (TP) or a false positive (FP) with a single click.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e87368_fig02.png"/></fig></sec><sec id="s2-5"><title>Precision Evaluation</title><p>Each flagged report underwent a 2-step review. 
Two board-certified physicians (with 9 and 10 years of clinical experience, respectively) screened each model-generated alert against the original report using a standardized rubric aligned with our error taxonomy, labeling each as true positive (TP) or FP. Subsequently, 2 board-certified radiologists (with 8 and 9 years of clinical experience, respectively) adjudicated these labels to establish the final ground truth. Reviewer calibration on the held-out prompt-tuning set (n=50) showed an overall percent agreement of 94% (47/50). The performance of the framework was evaluated using PPV (PPV=TP/[TP+FP]) and the detected errors per 1000 reports (DE/1k=[TP/N]&#x00D7;1000), where N denotes the size of the test set. Here, TP refers to a model-flagged report in which a genuine error was confirmed, while FP refers to a flagged report that did not contain a true error. To analyze failure modes, 30 adjudicated FP alerts per framework were randomly sampled from the MIMIC-III test set (n=90 in total). Each case was independently reviewed and classified into a 6-category taxonomy by 2 board-certified radiologists to track the evolution of error patterns.</p></sec><sec id="s2-6"><title>Operational Cost-Efficiency Evaluation</title><p>A cost-minimization analysis was conducted under the assumption of equal true error detection across all 3 frameworks. The estimated running cost was defined as the sum of (1) model inference costs and (2) reviewer labor costs [<xref ref-type="bibr" rid="ref20">20</xref>]. Because the exact computational cost of the closed-source LLM could not be measured directly, we used per-token API charges as a proxy measure. This choice is supported by the evidence that electricity and graphics processing unit rental costs dominate token pricing in commercial LLMs [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref22">22</xref>]. 
Consequently, the model inference costs were calculated based on text volume and provider pricing rates (Eq&#x202F;S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p><p>Reviewer labor cost was approximated by multiplying the total number of reports sent for manual inspection&#x2014;comprising both TPs and FPs&#x2014;by the mean compensation paid per report (Eq&#x202F;S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p><p>Reviewer labor costs were modeled using the median annual compensation for diagnostic radiologists (US $568,327) reported in the 2024 Medical Group Management Association (MGMA) compensation survey [<xref ref-type="bibr" rid="ref23">23</xref>]. Assuming a standard 2,000-hour work year, this corresponds to an hourly rate of approximately US $284, or US $4.74 per minute. Consistent with review durations reported in prior literature [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref3">3</xref>], the analysis was performed by varying the review time per flagged report (30, 60, and 120 s) to estimate labor costs across different clinical scenarios.</p><p>The estimated running cost for each framework was therefore defined as the sum of the 2 components (Eq&#x202F;S3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>) and is reported separately to permit direct operational cost-effectiveness comparisons. Formal derivations and the full set of symbols are provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec><sec id="s2-7"><title>Statistical Analysis</title><p>Continuous variables are reported as mean (SD) when normally distributed (Shapiro-Wilk test, <italic>P</italic>&#x003E;.05) and as median (IQR) otherwise. Categorical variables are summarized as counts and percentages. 
Between-dataset differences were assessed using the Kruskal-Wallis test for continuous variables and the Fisher exact test for categorical variables. PPV and DE/1k are expressed with 2-sided 95% exact (Clopper-Pearson) CIs [<xref ref-type="bibr" rid="ref24">24</xref>].</p><p>For PPV comparisons, pairwise differences among the 3 frameworks were assessed using a report-level paired-cluster bootstrap (10,000 replicates). Two-sided <italic>P</italic> values were extracted from the bootstrap distributions, and the family-wise error rate across the 3 comparisons was controlled using the Holm-Bonferroni procedure [<xref ref-type="bibr" rid="ref25">25</xref>]. Modality-specific PPV analyses were regarded as exploratory and reported without multiplicity adjustment. When the frameworks followed a prespecified ordinal sequence, a Cochran-Armitage trend test was additionally applied to detect monotonic trends in PPV across the ordered groups.</p><p>For DE/1k comparisons, within-case differences among the 3 frameworks were evaluated using the exact McNemar test, with the family-wise error rate controlled via the Holm-Bonferroni procedure. When comparing 3 or more frameworks, an overall Cochran Q test was conducted; if significant, pairwise McNemar tests with Holm correction were performed. All tests were 2-tailed, with an &#x03B1; of .05.</p><p>The sample size was calculated based on the MIMIC-III dataset. The baseline PPV of the reference pipeline was assumed to be 6%, as previously reported [<xref ref-type="bibr" rid="ref3">3</xref>]. A 2-fold improvement with the proposed pipeline was deemed a meaningful difference. Treating the comparison as a 2-sided test of the difference between 2 independent proportions and adopting <italic>&#x03B1;</italic>=.05 with a statistical power of 80%, a minimum of 716 reports was required. 
Consequently, the final sample of 1000 reports satisfied and exceeded this requirement, thereby ensuring adequate power for the primary hypothesis test. Analyses were performed in Python 3.11 using pandas 2.2.2, SciPy 1.12.0, and statsmodels 0.15.0 for statistical procedures and matplotlib 3.9.0 for visualization.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Detected Errors and FP Cases</title><p>The true errors identified in each dataset are summarized in Table S4 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. Fourteen errors were detected in the MIMIC-III dataset&#x2014;2 in chest radiographs, 3 in carotid ultrasonography studies, 1 in a neonatal brain ultrasonography study, 3 in head CT scans, 2 in chest CT scans, and 3 in head MRI examinations (<xref ref-type="fig" rid="figure3">Figure 3</xref>). Two errors were found in each of the CheXpert and Open-i datasets. The error distribution included discrepancies in anatomical location (9/18, 50%), omission (3/18, 17%), addition (3/18, 17%), and discrepancies in numerical measurement (3/18, 17%); notably, errors detected by each framework followed a strict subset relationship, and the per-framework breakdown is provided in Table S5 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><p><xref ref-type="table" rid="table2">Table 2</xref> summarizes representative FP cases. FPs that occurred only in framework&#x202F;1 arose chiefly from a rigid comparison of superficial header elements&#x2014;such as date strings or minor omissions in the clinical history&#x2014;with the body text, causing spurious contradiction flags. Once the header metadata had been removed, such false flags were not observed in framework&#x202F;2 or&#x202F;3. Framework&#x202F;2 still produced many FPs because it compared sentences at the strict word level, with little regard for anatomical or contextual nuance. 
Framework 3 subsequently reexamined the candidate contradictions identified by framework 2 in the context of the full report and reclassified statements deemed acceptable in routine practice, thereby reducing the overall FP burden.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Flowchart of radiology report sampling from the Medical Information Mart for Intensive Care III (MIMIC-III), CheXpert, and Open-i datasets for prompt tuning and test set construction. CT: computed tomography; MRI: magnetic resonance imaging; US: ultrasound.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e87368_fig03.png"/></fig><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Representative cases from the analysis of false positives across 3 different frameworks<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup>.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Report&#x202F;excerpt</td><td align="left" valign="bottom">Framework 1</td><td align="left" valign="bottom">Framework 2</td><td align="left" valign="bottom">Framework 3</td><td align="left" valign="bottom">False-positive rationale</td></tr></thead><tbody><tr><td align="left" valign="top">Header: &#x201C;Comparison: 10/20.&#x201D;<break/>&#x201C;1. 
Compared to prior study from October 5th, 20, interval increase in...&#x201D;&#x2002;</td><td align="left" valign="top">Error</td><td align="left" valign="top">No error</td><td align="left" valign="top">No error</td><td align="left" valign="top">Two legitimate comparison dates were interpreted as contradictory.</td></tr><tr><td align="left" valign="top">Header: &#x201C;s/p MVC<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup>, s/p chest removal&#x201D;<break/>&#x201C;status post MVC and chest tube removal,&#x201D;</td><td align="left" valign="top">Error</td><td align="left" valign="top">No error</td><td align="left" valign="top">No error</td><td align="left" valign="top">Typographic omission of &#x201C;tube&#x201D; was mistaken for a clinical conflict.</td></tr><tr><td align="left" valign="top">&#x201C;&#x2026;image degradation in the low pelvis because of patient&#x2019;s size, but no masses or fluid collections are seen.&#x201D;<break/>&#x201C;osteolytic and mixed osteosclerotic metastases are seen in the pelvic bones, most prominent at the right iliac&#x2026;&#x201D;</td><td align="left" valign="top">Error</td><td align="left" valign="top">Error</td><td align="left" valign="top">No error</td><td align="left" valign="top">Separate reporting of the pelvic cavity and pelvic bone was overlooked, and the statements were therefore flagged as contradictory.</td></tr><tr><td align="left" valign="top">Chest section: &#x201C;The heart, pericardium, and great vessels are normal.&#x201D;<break/>Abdomen section: &#x201C;The IVC<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup> is markedly compressed; however, remains patent.&#x201D;</td><td align="left" valign="top">Error</td><td align="left" valign="top">Error</td><td align="left" valign="top">No error</td><td align="left" valign="top">Separate reporting of chest and abdomen was overlooked, and the statements were therefore flagged as contradictory.</td></tr><tr><td align="left" valign="top">&#x201C;The 
liver demonstrates normal morphology without signal dropout...There are numerous ill-defined lesions within the liver which are hypointense to the liver parenchyma...&#x201D;</td><td align="left" valign="top">Error</td><td align="left" valign="top">Error</td><td align="left" valign="top">No error</td><td align="left" valign="top">Failure to distinguish overall morphology from focal lesions produced a false positive.</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>All reports contained no actual errors. &#x201C;Error&#x201D; indicates false positive by framework; &#x201C;No error&#x201D; indicates correct assessment by framework.</p></fn><fn id="table2fn2"><p><sup>b</sup>MVC: motor vehicle collision.</p></fn><fn id="table2fn3"><p><sup>c</sup>IVC: inferior vena cava.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-2"><title>Precision of LLM Frameworks</title><p>The precision of the LLM frameworks improved as the pipeline complexity increased (<xref ref-type="table" rid="table3">Table 3</xref>, <xref ref-type="fig" rid="figure4">Figure 4A</xref>). In framework 3, the overall PPV was 0.159 (95% CI 0.090&#x2010;0.252), compared with 0.079 in framework 2 and 0.063 in framework 1. The superiority of framework 3 over both framework 1 and framework 2 remained significant after multiple comparison correction (all paired-cluster bootstrap <italic>P</italic>&#x003C;.001; all Holm-adjusted <italic>P</italic>&#x003C;.001). 
A prespecified Cochran-Armitage trend test confirmed a significant upward trend in PPV across the 3 ordered frameworks (<italic>P</italic>=.02), indicating that successive refinements effectively reduced FP alerts.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Positive predictive value (PPV) among 3 error detection frameworks across MIMIC-III<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup>, CheXpert, and Open-i datasets.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Dataset, modality, and framework</td><td align="left" valign="bottom">TP<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td><td align="left" valign="bottom">FP<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup></td><td align="left" valign="bottom">PPV (95% CI)</td><td align="left" valign="bottom"><italic>P</italic> value<sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup></td><td align="left" valign="bottom">Holm-adjusted <italic>P</italic> value<sup><xref ref-type="table-fn" rid="table3fn5">e</xref></sup></td><td align="left" valign="bottom">Cochran-Armitage trend test <italic>P</italic> value</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="7">MIMIC-III&#x2003;</td></tr><tr><td align="left" valign="top" colspan="7"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Overall</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>1</td><td align="left" valign="top">12</td><td align="left" valign="top">179</td><td align="left" valign="top">0.063 (0.033&#x2010;0.107)</td><td align="left" valign="top">.01</td><td align="left" valign="top">.01</td><td align="left" valign="top">.10</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>2</td><td align="left" valign="top">13</td><td align="left" valign="top">151</td><td align="left" valign="top">0.079 (0.043&#x2010;0.132)</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table3fn6">f</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>3</td><td align="left" valign="top">14</td><td align="left" valign="top">74</td><td align="left" valign="top">0.159 (0.090&#x2010;0.252)</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top" colspan="7"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>X-ray</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>1</td><td align="left" valign="top">2</td><td align="left" valign="top">17</td><td align="left" valign="top">0.105 (0.013&#x2010;0.331)</td><td align="left" valign="top">&#x003E;.99</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">.52</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>2</td><td align="left" valign="top">2</td><td align="left" valign="top">16</td><td align="left" valign="top">0.111 (0.014&#x2010;0.347)</td><td align="left" valign="top">.29</td><td 
align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>3</td><td align="left" valign="top">2</td><td align="left" valign="top">8</td><td align="left" valign="top">0.200 (0.025&#x2010;0.556)</td><td align="left" valign="top">.29</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top" colspan="7"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Ultrasound</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>1</td><td align="left" valign="top">3</td><td align="left" valign="top">27</td><td align="left" valign="top">0.100 (0.021&#x2010;0.265)</td><td align="left" valign="top">.42</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">.27</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>2</td><td align="left" valign="top">3</td><td align="left" valign="top">23</td><td align="left" valign="top">0.115 (0.024&#x2010;0.302)</td><td align="left" valign="top">.04</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>3</td><td align="left" valign="top">4</td><td align="left" valign="top">14</td><td align="left" valign="top">0.222 (0.064&#x2010;0.476)</td><td 
align="left" valign="top">.03</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top" colspan="7"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>CT<sup><xref ref-type="table-fn" rid="table3fn7">g</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>1</td><td align="left" valign="top">4</td><td align="left" valign="top">85</td><td align="left" valign="top">0.045 (0.012&#x2010;0.111)</td><td align="left" valign="top">.02</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">.09</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>2</td><td align="left" valign="top">5</td><td align="left" valign="top">64</td><td align="left" valign="top">0.072 (0.024&#x2010;0.161)</td><td align="left" valign="top">.01</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>3</td><td align="left" valign="top">5</td><td align="left" valign="top">30</td><td align="left" valign="top">0.143 (0.048&#x2010;0.303)</td><td align="left" valign="top">.01</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top" colspan="7"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>MRI<sup><xref ref-type="table-fn" rid="table3fn8">h</xref></sup></td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>1</td><td align="left" valign="top">3</td><td align="left" valign="top">50</td><td align="left" valign="top">0.057 (0.012&#x2010;0.157)</td><td align="left" valign="top">.89</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">.39</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>2</td><td align="left" valign="top">3</td><td align="left" valign="top">48</td><td align="left" valign="top">0.059 (0.012&#x2010;0.162)</td><td align="left" valign="top">.09</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>3</td><td align="left" valign="top">3</td><td align="left" valign="top">22</td><td align="left" valign="top">0.120 (0.025&#x2010;0.312)</td><td align="left" valign="top">.10</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top" colspan="7">CheXpert</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>1</td><td align="left" valign="top">2</td><td align="left" valign="top">25</td><td align="left" valign="top">0.074 (0.009&#x2010;0.243)</td><td align="left" valign="top">.46</td><td align="left" valign="top">.79</td><td align="left" valign="top">.55</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>2</td><td align="left" valign="top">2</td><td align="left" 
valign="top">19</td><td align="left" valign="top">0.095 (0.012&#x2010;0.304)</td><td align="left" valign="top">.39</td><td align="left" valign="top">.79</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>3</td><td align="left" valign="top">2</td><td align="left" valign="top">13</td><td align="left" valign="top">0.133 (0.017&#x2010;0.405)</td><td align="left" valign="top">.26</td><td align="left" valign="top">.79</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top" colspan="7">Open-i</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>1</td><td align="left" valign="top">2</td><td align="left" valign="top">22</td><td align="left" valign="top">0.083 (0.010&#x2010;0.270)</td><td align="left" valign="top">.27</td><td align="left" valign="top">.80</td><td align="left" valign="top">.84</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>2</td><td align="left" valign="top">2</td><td align="left" valign="top">41</td><td align="left" valign="top">0.047 (0.006&#x2010;0.158)</td><td align="left" valign="top">.25</td><td align="left" valign="top">.80</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>3</td><td align="left" valign="top">2</td><td align="left" valign="top">17</td><td align="left" valign="top">0.105 (0.013&#x2010;0.331)</td><td align="left" valign="top">.57</td><td align="left" valign="top">.80</td><td align="left" valign="top">&#x2014;</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>MIMIC-III: Medical Information Mart for Intensive Care III.</p></fn><fn id="table3fn2"><p><sup>b</sup>TP: true positive.</p></fn><fn 
id="table3fn3"><p><sup>c</sup>FP: false positive.</p></fn><fn id="table3fn4"><p><sup>d</sup>Two-sided paired-cluster bootstrap (10,000 replicates) <italic>P</italic> value&#x2014;this row compares current framework with the next (row 1: framework 1 vs framework 2; row 2: framework 2 vs framework 3; row 3: framework 1 vs framework 3).</p></fn><fn id="table3fn5"><p><sup>e</sup>Same comparisons as above.</p></fn><fn id="table3fn6"><p><sup>f</sup>Not applicable.</p></fn><fn id="table3fn7"><p><sup>g</sup>CT: computed tomography.</p></fn><fn id="table3fn8"><p><sup>h</sup>MRI: magnetic resonance imaging.</p></fn></table-wrap-foot></table-wrap><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Performance comparison of the 3 error detection frameworks across the Medical Information Mart for Intensive Care III (MIMIC-III), CheXpert, and Open-i datasets. (A) Positive predictive value. (B) Detected errors per 1000 reports. Statistical significance was determined by the paired bootstrap test: *<italic>P</italic>&#x003C;.05, **<italic>P</italic>&#x003C;.01, ***<italic>P</italic>&#x003C;.001, ****<italic>P</italic>&#x003C;.0001. Trend analysis was performed using the Cochran-Armitage test: &#x2020;<italic>P</italic>&#x003C;.05. CT: computed tomography; MRI: magnetic resonance imaging; US: ultrasound.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e87368_fig04.png"/></fig><p>The observed increase in precision was not accompanied by a reduction in TP detections (<xref ref-type="table" rid="table4">Table 4</xref>, <xref ref-type="fig" rid="figure4">Figure 4B</xref>). The overall DE/1k was 14 (95% CI 8&#x2010;23) for framework 3, compared with 13 (95% CI 7&#x2010;22) for framework 2 and 12 (95% CI 6&#x2010;21) for framework 1. 
None of the pairwise comparisons reached statistical significance (all <italic>P</italic>&#x2265;.84), indicating that framework 3 reduced FP flags without compromising error detection.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Detected errors per 1000 radiology reports among 3 error detection frameworks across MIMIC-III<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup>, CheXpert, and Open-i datasets.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Dataset, modality, and framework</td><td align="left" valign="bottom">Detected errors per 1000 (95% CI)</td><td align="left" valign="bottom"><italic>P</italic> value<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></td><td align="left" valign="bottom">Holm-adjusted <italic>P</italic> value<sup><xref ref-type="table-fn" rid="table4fn3">c</xref></sup></td><td align="left" valign="bottom">Cochran Q test <italic>P</italic> value</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="5">MIMIC-III</td></tr><tr><td align="left" valign="top" colspan="5"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Overall</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>1</td><td align="left" valign="top">12 (6&#x2010;21)</td><td align="left" valign="top">&#x003E;.99</td><td align="left" valign="top">&#x003E;.99</td><td align="left" valign="top">.93</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>2</td><td align="left" valign="top">13 (7&#x2010;22)</td><td align="left" valign="top">&#x003E;.99</td><td align="left" valign="top">&#x003E;.99</td><td align="left" 
valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table4fn4">d</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>3</td><td align="left" valign="top">14 (8&#x2010;23)</td><td align="left" valign="top">.85</td><td align="left" valign="top">&#x003E;.99</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top" colspan="5"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>X-ray</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>1</td><td align="left" valign="top">8 (1-29)</td><td align="left" valign="top">&#x003E;.99</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x003E;.99</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>2</td><td align="left" valign="top">8 (1-29)</td><td align="left" valign="top">&#x003E;.99</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>3</td><td align="left" valign="top">8 (1-29)</td><td align="left" valign="top">&#x003E;.99</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top" colspan="5"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Ultrasound</td></tr><tr><td align="left" 
valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>1</td><td align="left" valign="top">12 (2&#x2010;35)</td><td align="left" valign="top">&#x003E;.99</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">.91</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>2</td><td align="left" valign="top">12 (2&#x2010;35)</td><td align="left" valign="top">&#x003E;.99</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>3</td><td align="left" valign="top">16 (4&#x2010;40)</td><td align="left" valign="top">&#x003E;.99</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top" colspan="5"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>CT<sup><xref ref-type="table-fn" rid="table4fn5">e</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>1</td><td align="left" valign="top">16 (4&#x2010;40)</td><td align="left" valign="top">&#x003E;.99</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">.93</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>2</td><td align="left" 
valign="top">20 (7&#x2010;46)</td><td align="left" valign="top">&#x003E;.99</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>3</td><td align="left" valign="top">20 (7&#x2010;46)</td><td align="left" valign="top">&#x003E;.99</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top" colspan="5"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>MRI<sup><xref ref-type="table-fn" rid="table4fn6">f</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>1</td><td align="left" valign="top">12 (2&#x2010;35)</td><td align="left" valign="top">&#x003E;.99</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x003E;.99</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>2</td><td align="left" valign="top">12 (2&#x2010;35)</td><td align="left" valign="top">&#x003E;.99</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>3</td><td align="left" valign="top">12 (2&#x2010;35)</td><td align="left" valign="top">&#x003E;.99</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" 
valign="top" colspan="5">CheXpert</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>1</td><td align="left" valign="top">7 (1-24)</td><td align="left" valign="top">&#x003E;.99</td><td align="left" valign="top">&#x003E;.99</td><td align="left" valign="top">&#x003E;.99</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>2</td><td align="left" valign="top">7 (1-24)</td><td align="left" valign="top">&#x003E;.99</td><td align="left" valign="top">&#x003E;.99</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>3</td><td align="left" valign="top">7 (1-24)</td><td align="left" valign="top">&#x003E;.99</td><td align="left" valign="top">&#x003E;.99</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top" colspan="5">Open-i</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>1</td><td align="left" valign="top">7 (1-24)</td><td align="left" valign="top">&#x003E;.99</td><td align="left" valign="top">&#x003E;.99</td><td align="left" valign="top">&#x003E;.99</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>2</td><td align="left" valign="top">7 (1-24)</td><td align="left" valign="top">&#x003E;.99</td><td align="left" valign="top">&#x003E;.99</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>3</td><td align="left" valign="top">7 (1-24)</td><td align="left" valign="top">&#x003E;.99</td><td align="left" valign="top">&#x003E;.99</td><td align="left" valign="top">&#x2014;</td></tr></tbody></table><table-wrap-foot><fn 
id="table4fn1"><p><sup>a</sup>MIMIC-III: Medical Information Mart for Intensive Care III.</p></fn><fn id="table4fn2"><p><sup>b</sup>McNemar test <italic>P</italic> value&#x2014;this row compares current framework with the next (row 1: framework 1 vs framework 2; row 2: framework 2 vs framework 3; row 3: framework 1 vs framework 3).</p></fn><fn id="table4fn3"><p><sup>c</sup>Same comparisons as above.</p></fn><fn id="table4fn4"><p><sup>d</sup>Not applicable.</p></fn><fn id="table4fn5"><p><sup>e</sup>CT: computed tomography.</p></fn><fn id="table4fn6"><p><sup>f</sup>MRI: magnetic resonance imaging.</p></fn></table-wrap-foot></table-wrap><p>In the CheXpert and Open-i datasets, framework 3 achieved the highest PPVs (0.133 and 0.105, respectively; paired-cluster bootstrap <italic>P</italic>&#x2265;.26; Holm-adjusted <italic>P</italic>&#x2265;.79) and maintained identical DE/1k across frameworks (7 for both datasets; all <italic>P</italic>&#x003E;.99), demonstrating robustness across diverse datasets. However, in the Open-i dataset, framework 2 yielded a lower PPV than framework 1&#x2014;the only instance in which PPV did not increase monotonically with pipeline complexity. 
This exception may be due to the Open-i dataset already being extensively preprocessed, reducing the relative benefit of the first-pass LLM.</p><p>When framework 3 was executed using the o4-mini model instead of o3, the overall PPV significantly declined to 0.081 (<italic>P</italic>&#x003C;.001; Table S6 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>), while the DE/1k decreased slightly to 12 without reaching statistical significance (<italic>P</italic>=.69; Table S7 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p></sec><sec id="s3-3"><title>Analysis of FP Patterns</title><p>The distribution of FPs shifted distinctly toward semantic categories as the pipeline evolved (<xref ref-type="table" rid="table5">Table 5</xref>, Figure S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). Superficial errors were effectively filtered by preprocessing, with header/metadata artifact cases decreasing from 3 out of 30 (10%) in framework 1 to 0% (0/30) in framework 2. However, rare preprocessing-induced artifact cases (1/30, 3%) emerged as a minor trade-off. Subsequently, structural mismatches were suppressed by the verifier step, resulting in the elimination of section/scope mismatch and lexical/abbreviation/typographical mismatch (0/30, 0%) in framework 3. 
Consequently, residual FPs in the final framework were predominantly concentrated in complex semantic categories, specifically clinically equivalent rephrasing (16/30, 53%) and unsupported discrepancy assertions (13/30, 43%).</p><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>Characterization of false-positive categories across frameworks.</p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">False-positive category</td><td align="left" valign="bottom">Definition</td><td align="left" valign="bottom">Framework 1 (n=30), n (%)</td><td align="left" valign="bottom">Framework 2 (n=30), n (%)</td><td align="left" valign="bottom">Framework 3 (n=30), n (%)</td></tr></thead><tbody><tr><td align="left" valign="top">Header/metadata artifact</td><td align="left" valign="top">Header/history/technique/comparison text is treated as a body-text discrepancy.</td><td align="left" valign="top">3 (10)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td></tr><tr><td align="left" valign="top">Section/scope mismatch</td><td align="left" valign="top">Statements from different sections or anatomical scopes (eg, chest vs abdomen) are compared as if the same scope.</td><td align="left" valign="top">2 (7)</td><td align="left" valign="top">1 (3)</td><td align="left" valign="top">0 (0)</td></tr><tr><td align="left" valign="top">Lexical/abbreviation/typographical<break/>mismatch</td><td align="left" valign="top">Minor lexical differences (abbreviations, spelling, and formatting) are flagged as discrepancies.</td><td align="left" valign="top">2 (7)</td><td align="left" valign="top">1 (3)</td><td align="left" valign="top">0 (0)</td></tr><tr><td align="left" valign="top">Clinically equivalent rephrasing</td><td align="left" valign="top">Clinically acceptable wording is rewritten to a &#x201C;preferred&#x201D; term and the original is flagged as discrepant.</td><td align="left" valign="top">12 
(40)</td><td align="left" valign="top">14 (47)</td><td align="left" valign="top">16 (53)</td></tr><tr><td align="left" valign="top">Unsupported discrepancy assertions</td><td align="left" valign="top">A discrepancy is asserted despite insufficient support (eg, wrong matching or compatible statements treated as conflict).</td><td align="left" valign="top">11 (37)</td><td align="left" valign="top">13 (43)</td><td align="left" valign="top">13 (43)</td></tr><tr><td align="left" valign="top">Preprocessing-induced artifact</td><td align="left" valign="top">Preprocessing (segmentation/normalization/removal) introduces artificial discrepancies.</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">1 (3)</td><td align="left" valign="top">1 (3)</td></tr></tbody></table></table-wrap></sec><sec id="s3-4"><title>Operational Cost-Efficiency of LLM Frameworks</title><p>The token counts for each pass are summarized in Table S8 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. Framework 3 achieved the lowest model inference cost, at US $5.57 per 1000 reports, compared with US $9.72 and US $6.85 for frameworks 1 and 2, respectively&#x2014;corresponding to cost reductions of approximately 42.7% and 18.5% relative to frameworks 1 and 2, respectively (<xref ref-type="fig" rid="figure5">Figure 5A</xref>). Framework 2 achieved most of its savings through token reduction via preprocessing, relative to framework 1. In framework 3, additional savings beyond those of framework 2 were primarily attributed to the FP verifier being triggered for only 88 candidate errors, rather than for all cases.</p><fig position="float" id="figure5"><label>Figure 5.</label><caption><p>Cost analysis of the radiology report error detection frameworks and their component passes. (A) Model-only inference cost per 1000 reports in the Medical Information Mart for Intensive Care III (MIMIC-III) dataset. 
(B) Corresponding inference cost for 300 reports in the CheXpert and Open-i datasets. (C) Estimated total running cost per 1000 MIMIC-III reports plotted against reviewer labor cost per report. The analysis considers labor costs of US $2.37, US $4.74, and US $9.47, corresponding to review times of 30, 60, and 120 seconds, respectively. Annotations indicate the projected savings of framework 3 compared to framework 1.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e87368_fig05.png"/></fig><p>Based on the MGMA-derived rate (US $4.74/min), the estimated reviewer labor costs per report were US $2.37 (30 s), US $4.74 (60 s), and US $9.47 (120 s). Under these assumptions, framework 3 demonstrated a consistent advantage in reviewer labor cost. Compared to framework 1, framework 3 yielded estimated savings of US $251 (30-s scenario), US $497 (60-s scenario), and US $989 (120-s scenario) per 1000 reports in estimated running costs (<xref ref-type="fig" rid="figure5">Figure 5C</xref>).</p><p>A similar trend was observed in the CheXpert and Open-i datasets (<xref ref-type="fig" rid="figure5">Figure 5B</xref>), where framework 3 consistently demonstrated the lowest model inference costs per 300 reports (US $0.1374 and US $0.1271, respectively), compared with framework 1 (US $1.8943 and US $1.5930, respectively). However, in the Open-i dataset, framework 2 incurred a higher model inference cost than framework 1&#x2014;representing the only instance in which model inference cost did not decrease with increased pipeline complexity.</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><p>The proposed 3-pass LLM framework improved precision and reduced estimated running costs without compromising error detection capability on datasets that approximate real-world error prevalence. 
On the MIMIC-III dataset, a PPV of 16% was achieved&#x2014;more than twice that of a single-prompt, single-extraction baseline&#x2014;while maintaining the detected error counts. The model inference cost decreased by approximately 42.6% (US $9.72 vs US $5.57 per 1000 reports), and the number of alerts requiring human review declined by 54.2% (192 vs 88). These improvements remained robust across 2 independent datasets and within modality-specific subgroups.</p><p>Widely adopted clinical decision support systems, such as sepsis prediction or drug interaction alerts, typically exhibit low PPV in real-world settings because the cost of missing a critical event is unacceptable [<xref ref-type="bibr" rid="ref26">26</xref>-<xref ref-type="bibr" rid="ref28">28</xref>]. Radiology report error detection shares similar challenges due to the low prevalence of errors in routine practice. To address these challenges, previous studies have often relied on synthetic error injection, based on the assumption that prevalence does not influence the sensitivity or specificity of the model [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref11">11</xref>]. However, this approach has notable limitations. Specifically, synthetic error injection may introduce bias in performance evaluation and error characterization, as the distribution and nature of injected errors may not accurately reflect real-world conditions. Furthermore, artificially inflating error prevalence can lead to substantial overestimation of the PPV, thereby misrepresenting the practical utility of the model in real-world scenarios.
A low PPV&#x2014;implying a high rate of false alarms&#x2014;can increase the workload for radiologists and introduce potential biases for researchers conducting quality assurance on curated datasets; furthermore, it often induces distrust in the system, leading to &#x201C;alert fatigue,&#x201D; where alarms are habitually ignored [<xref ref-type="bibr" rid="ref29">29</xref>]. Kim et al [<xref ref-type="bibr" rid="ref3">3</xref>] demonstrated that few-shot prompting could improve GPT-4&#x2019;s PPV to 0.12 on a dataset without injected errors. However, this improvement was derived from a post hoc analysis that reprompted only those cases previously identified as FPs, limiting the generalizability of the findings.</p><p>Thus, the proposed multipass architecture improves precision in real-world settings. This improvement is driven by 2 key components. First, a preprocessing LLM transforms raw radiology reports into cleaned, structured output before passing them to the primary LLM. During prompt tuning, we frequently observed that artifacts&#x2014;such as embedded metadata, addenda, and page breaks&#x2014;were misinterpreted as report content, thereby inflating FP rates. The preprocessor mitigates this issue by removing such noise, which not only reduces the likelihood of FPs but also decreases the input size for downstream tasks. However, these preprocessing prompts yielded minimal benefits on the already cleaned Open-i dataset. The FP analysis even identified a small number of cases where the preprocessing itself generated artifacts. Consequently, to achieve optimal performance, preprocessing strategies must be carefully adapted to the reporting conventions and dataset characteristics unique to each institution.</p><p>Second, the framework uses a detector-verifier cascade. 
When the detector is prone to FPs, separating detection and verification into 2 distinct steps allows the LLMs to complement each other: the detector prioritizes sensitivity, whereas the verifier enhances specificity. This arrangement parallels the tiered double-reading workflow commonly used in radiology; however, in this framework, the 2 LLMs perform the initial &#x201C;double read,&#x201D; and a human radiologist provides the final adjudication&#x2014;effectively constituting a tiered triple read [<xref ref-type="bibr" rid="ref30">30</xref>]. Prior evidence supports the benefits of task separation: in one study, 2 GPT-4 prompts for radiology report error detection were compared, revealing a trade-off between sensitivity and specificity, while the overall <italic>F</italic><sub>1</sub>-score remained constant [<xref ref-type="bibr" rid="ref3">3</xref>]. This suggests that, for error detection&#x2014;where high sensitivity is essential&#x2014;a 2-stage cascade that first maximizes sensitivity and then applies a highly specific verifier offers a more effective balance between error detection and alert fatigue.</p><p>The remaining FPs in this framework are largely confined to cases requiring deeper clinical context, highlighting an inherent limitation of LLMs in adjudicating nuanced clinical equivalence. This may partly explain why X-ray/ultrasound achieved relatively higher PPV than CT/MRI, as CT/MRI reports are typically longer and contain more complex, multifinding narratives. Importantly, this FP profile motivates a human-in-the-loop quality assurance design: AI can triage potential errors to shift the workflow from an &#x201C;unguided&#x201D; to a &#x201C;targeted search,&#x201D; while a secondary verifier layer filters out many structural FPs. 
Consequently, clinicians can focus their expertise on the smaller set of clinically ambiguous alerts that require high-level judgment, suggesting a complementary division of labor between AI speed and human expertise.</p><p>Successful clinical translation requires workflow-tailored integration. This framework is envisioned as an asynchronous background service that analyzes draft reports after initial dictation and surfaces only clinically meaningful internal inconsistencies within a report before final sign-off. To optimize the radiologist&#x2019;s workload, notification timing should be adapted to the clinical context&#x2014;for example, near-real-time alerts for emergency or intensive care unit studies, notifications before discharge for routine inpatient studies, and batched alerts before the next scheduled visit for outpatient studies. Initial deployment should be focused on high-acuity settings or predefined high-risk cohorts to maximize clinical benefits while minimizing alert fatigue; accumulated adjudication outcomes can then be leveraged for institution-specific refinement. To reduce the cognitive burden associated with alert review, it is essential that the error rationale be displayed alongside the flagged discrepancy, as implemented in the present framework. The seamless integration of these elements into the Picture Archiving and Communication System reading environment is equally critical to ensure that adjudication occurs within the radiologist&#x2019;s existing workflow.</p><p>This study has some limitations. First, the cost analysis focused on estimated running costs (inference and labor) to allow for a direct comparison across frameworks. A comprehensive total cost of ownership analysis was not performed, as such an evaluation would require site-specific microcosting of integration, governance, and maintenance expenses.
Additionally, because direct measurements of the power consumption of the closed-source model were not feasible, we used a token-processing charge as a surrogate. This approach was chosen to enable a relative comparison across frameworks; direct measurement of power consumption was beyond the scope of this study. The cost model also assumes a fixed per-alert review time; in practice, framework 3&#x2019;s residual alerts&#x2014;predominantly semantically complex cases, such as clinically equivalent rephrasing&#x2014;may require longer adjudication than the structural artifacts filtered by earlier stages, potentially moderating the estimated labor savings. Future studies should aim to validate both actual computational usage and per-alert adjudication time in real-world deployment scenarios. Second, although the PPV doubled, the framework still generates an excessive number of alerts for a busy clinical workflow. In this study, typographical errors, along with all error candidates that could not be confirmed using the corresponding images, were conservatively classified as FPs; therefore, the reported PPV likely represents a lower bound. Even with this conservative estimate, the current precision remains insufficient for fully autonomous AI adoption. Many FPs resulted from the framework interpreting individual words too strictly, indicating a limitation in its ability to interpret clinical context effectively. Third, although the framework was validated on multiple datasets, real-world radiology reports vary substantially across institutions in terms of templates, headers, and dictation styles, which may affect preprocessing reliability and shift downstream precision. Fourth, while the pipeline is architecturally modular, reported performance was obtained using specific proprietary models and may not directly translate to other architectures, such as open-source LLMs.
Fifth, although the intended role is human-in-the-loop decision support rather than an autonomous agent, deployment entails ethical and legal considerations&#x2014;including liability for missed errors and the risk of automation bias&#x2014;necessitating strict oversight.</p><p>Future studies would greatly benefit from evaluating additional backbones, including locally fine-tuned LLMs and multimodal models that incorporate image context to broaden detectable error types. Additionally, quantifying end-to-end computational costs and incorporating institution-aware adaptation to mitigate heterogeneity in reporting styles in real deployment settings would be beneficial. Prospective evaluation in high-stakes, error-prone settings is warranted to validate practical utility and safety under human oversight.</p><p>In conclusion, the multipass LLM framework improved the precision and efficiency of radiology report error detection in real-world settings with low error prevalence. The framework demonstrates the feasibility of synergistic AI-radiologist collaboration and provides a cost-effective and scalable approach to AI-assisted quality assurance in both radiological practice and research.</p></sec></body><back><ack><p>Generative artificial intelligence (AI) models were explicitly used as the subjects of investigation for the core methodology of this research, specifically for isolating clinical findings and performing error detection and verification within radiology reports. No generative AI tools were used in the writing, drafting, or editing of this manuscript itself.
The authors remain fully responsible for the accuracy, originality, and integrity of all content in the manuscript, including all references and citations.</p></ack><notes><sec><title>Funding</title><p>This study obtained funding from the MD-PhD/Medical Scientist Training Program through the Korea Health Industry Development Institute (KHIDI), funded by the Ministry of Health &#x0026; Welfare, Republic of Korea, and the Korea Health Technology R&#x0026;D Project through KHIDI, funded by the Ministry of Health &#x0026; Welfare, Republic of Korea (grant number RS-2022-KH125153).</p></sec><sec><title>Data Availability</title><p>The datasets analyzed during this study are publicly available via PhysioNet (Medical Information Mart for Intensive Care III [MIMIC-III]) [<xref ref-type="bibr" rid="ref31">31</xref>], the Stanford ML Group (CheXpert) [<xref ref-type="bibr" rid="ref32">32</xref>], and the US National Library of Medicine (Open-i) [<xref ref-type="bibr" rid="ref33">33</xref>]. Access to some datasets may require registration and acceptance of data-use terms.</p></sec></notes><fn-group><fn fn-type="con"><p>Conceptualization: SK, DY</p><p>Methodology: SK, DY</p><p>Investigation: SL, SYL, JK, KK, HL</p><p>Writing &#x2013; original draft: SK</p><p>Writing &#x2013; review &#x0026; editing: SK, SL, SYL, JK, KK, HL, DY</p><p>Supervision: DY</p><p>Funding acquisition: SK, DY</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">API</term><def><p>application programming interface</p></def></def-item><def-item><term id="abb3">CT</term><def><p>computed tomography</p></def></def-item><def-item><term id="abb4">DE/1k</term><def><p>detected errors per 1000 reports</p></def></def-item><def-item><term id="abb5">FP</term><def><p>false positive</p></def></def-item><def-item><term id="abb6">LLM</term><def><p>large
language model</p></def></def-item><def-item><term id="abb7">MGMA</term><def><p>Medical Group Management Association</p></def></def-item><def-item><term id="abb8">MIMIC-III</term><def><p>Medical Information Mart for Intensive Care III</p></def></def-item><def-item><term id="abb9">MRI</term><def><p>magnetic resonance imaging</p></def></def-item><def-item><term id="abb10">PPV</term><def><p>positive predictive value</p></def></def-item><def-item><term id="abb11">TP</term><def><p>true positive</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gertz</surname><given-names>RJ</given-names> </name><name name-style="western"><surname>Dratsch</surname><given-names>T</given-names> </name><name name-style="western"><surname>Bunck</surname><given-names>AC</given-names> </name><etal/></person-group><article-title>Potential of GPT-4 for detecting errors in radiology reports: implications for reporting accuracy</article-title><source>Radiology</source><year>2024</year><month>04</month><volume>311</volume><issue>1</issue><fpage>e232714</fpage><pub-id pub-id-type="doi">10.1148/radiol.232714</pub-id><pub-id pub-id-type="medline">38625012</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Forman</surname><given-names>HP</given-names> </name></person-group><article-title>Large language models as an inexpensive and effective extra set of eyes in radiology reporting</article-title><source>Radiology</source><year>2024</year><month>04</month><volume>311</volume><issue>1</issue><fpage>e240844</fpage><pub-id pub-id-type="doi">10.1148/radiol.240844</pub-id><pub-id pub-id-type="medline">38625009</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kim</surname><given-names>S</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>D</given-names> </name><name name-style="western"><surname>Shin</surname><given-names>HJ</given-names> </name><etal/></person-group><article-title>Large-scale validation of the feasibility of GPT-4 as a proofreading tool for head CT reports</article-title><source>Radiology</source><year>2025</year><month>01</month><volume>314</volume><issue>1</issue><fpage>e240701</fpage><pub-id pub-id-type="doi">10.1148/radiol.240701</pub-id><pub-id pub-id-type="medline">39873601</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Philpotts</surname><given-names>LE</given-names> </name></person-group><article-title>Advancing artificial intelligence to meet breast imaging needs</article-title><source>Radiology</source><year>2022</year><month>04</month><volume>303</volume><issue>1</issue><fpage>78</fpage><lpage>79</lpage><pub-id pub-id-type="doi">10.1148/radiol.213101</pub-id><pub-id pub-id-type="medline">35040680</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Doo</surname><given-names>FX</given-names> </name><name name-style="western"><surname>Vosshenrich</surname><given-names>J</given-names> </name><name name-style="western"><surname>Cook</surname><given-names>TS</given-names> </name><etal/></person-group><article-title>Environmental sustainability and AI in radiology: a double-edged sword</article-title><source>Radiology</source><year>2024</year><month>02</month><volume>310</volume><issue>2</issue><fpage>e232030</fpage><pub-id pub-id-type="doi">10.1148/radiol.232030</pub-id><pub-id 
pub-id-type="medline">38411520</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>X</given-names> </name><name name-style="western"><surname>Yi</surname><given-names>H</given-names> </name><name name-style="western"><surname>You</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Enhancing diagnostic capability with multi-agents conversational large language models</article-title><source>NPJ Digit Med</source><year>2025</year><month>03</month><day>13</day><volume>8</volume><issue>1</issue><fpage>159</fpage><pub-id pub-id-type="doi">10.1038/s41746-025-01550-0</pub-id><pub-id pub-id-type="medline">40082662</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Xie</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Medical foundation large language models for comprehensive text analysis and beyond</article-title><source>NPJ Digit Med</source><year>2025</year><month>03</month><day>5</day><volume>8</volume><issue>1</issue><fpage>141</fpage><pub-id pub-id-type="doi">10.1038/s41746-025-01533-1</pub-id><pub-id pub-id-type="medline">40044845</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Cemri</surname><given-names>M</given-names> </name><name name-style="western"><surname>Pan</surname><given-names>MZ</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Why do 
multi-agent LLM systems fail?</article-title><access-date>2026-05-09</access-date><conf-name>39th Conference on Neural Information Processing Systems (NeurIPS 2025) Track on Datasets and Benchmarks</conf-name><conf-date>Dec 2-7, 2025</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://openreview.net/pdf?id=fAjbYBmonr">https://openreview.net/pdf?id=fAjbYBmonr</ext-link></comment></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Moll</surname><given-names>J</given-names> </name><name name-style="western"><surname>Fay</surname><given-names>L</given-names> </name><name name-style="western"><surname>Azhar</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Structuring radiology reports: challenging LLMs with lightweight models</article-title><conf-name>Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing</conf-name><conf-date>Dec 4-9, 2025</conf-date><pub-id pub-id-type="doi">10.18653/v1/2025.emnlp-main.392</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Salam</surname><given-names>B</given-names> </name><name name-style="western"><surname>St&#x00FC;we</surname><given-names>C</given-names> </name><name name-style="western"><surname>Nowak</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Large language models for error detection in radiology reports: a comparative analysis between closed-source and privacy-compliant open-source models</article-title><source>Eur Radiol</source><year>2025</year><month>08</month><volume>35</volume><issue>8</issue><fpage>4549</fpage><lpage>4557</lpage><pub-id pub-id-type="doi">10.1007/s00330-025-11438-y</pub-id><pub-id pub-id-type="medline">39979623</pub-id></nlm-citation></ref><ref 
id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sun</surname><given-names>C</given-names> </name><name name-style="western"><surname>Teichman</surname><given-names>K</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Generative large language models trained for detecting errors in radiology reports</article-title><source>Radiology</source><year>2025</year><month>05</month><volume>315</volume><issue>2</issue><fpage>e242575</fpage><pub-id pub-id-type="doi">10.1148/radiol.242575</pub-id><pub-id pub-id-type="medline">40392090</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Johnson</surname><given-names>AEW</given-names> </name><name name-style="western"><surname>Pollard</surname><given-names>TJ</given-names> </name><name name-style="western"><surname>Shen</surname><given-names>L</given-names> </name><etal/></person-group><article-title>MIMIC-III, a freely accessible critical care database</article-title><source>Sci Data</source><year>2016</year><month>05</month><day>24</day><volume>3</volume><issue>1</issue><fpage>160035</fpage><pub-id pub-id-type="doi">10.1038/sdata.2016.35</pub-id><pub-id pub-id-type="medline">27219127</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Irvin</surname><given-names>J</given-names> </name><name name-style="western"><surname>Rajpurkar</surname><given-names>P</given-names> </name><name name-style="western"><surname>Ko</surname><given-names>M</given-names> </name><etal/></person-group><article-title>CheXpert: a large chest radiograph dataset with uncertainty labels and expert 
comparison</article-title><conf-name>Proceedings of the 33rd AAAI Conference on Artificial Intelligence (AAAI 2019)</conf-name><conf-date>Jan 27 to Feb 1, 2019</conf-date><pub-id pub-id-type="doi">10.1609/aaai.v33i01.3301590</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Demner-Fushman</surname><given-names>D</given-names> </name><name name-style="western"><surname>Kohli</surname><given-names>MD</given-names> </name><name name-style="western"><surname>Rosenman</surname><given-names>MB</given-names> </name><etal/></person-group><article-title>Preparing a collection of radiology examinations for distribution and retrieval</article-title><source>J Am Med Inform Assoc</source><year>2016</year><month>03</month><volume>23</volume><issue>2</issue><fpage>304</fpage><lpage>310</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocv080</pub-id><pub-id pub-id-type="medline">26133894</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zech</surname><given-names>J</given-names> </name><name name-style="western"><surname>Forde</surname><given-names>J</given-names> </name><name name-style="western"><surname>Titano</surname><given-names>JJ</given-names> </name><name name-style="western"><surname>Kaji</surname><given-names>D</given-names> </name><name name-style="western"><surname>Costa</surname><given-names>A</given-names> </name><name name-style="western"><surname>Oermann</surname><given-names>EK</given-names> </name></person-group><article-title>Detecting insertion, substitution, and deletion errors in radiology reports using neural sequence-to-sequence models</article-title><source>Ann Transl Med</source><year>2019</year><month>06</month><volume>7</volume><issue>11</issue><fpage>233</fpage><pub-id 
pub-id-type="doi">10.21037/atm.2018.08.11</pub-id><pub-id pub-id-type="medline">31317003</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Min</surname><given-names>D</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>K</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>JH</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Park</surname><given-names>CM</given-names> </name></person-group><article-title>RRED: a radiology report error detector based on deep learning framework</article-title><conf-name>Proceedings of the 4th Clinical Natural Language Processing Workshop</conf-name><conf-date>Jul 14, 2022</conf-date><conf-loc>Seattle, Washington, USA</conf-loc><fpage>41</fpage><lpage>52</lpage><pub-id pub-id-type="doi">10.18653/v1/2022.clinicalnlp-1.5</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chaudhari</surname><given-names>GR</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>T</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>TL</given-names> </name><etal/></person-group><article-title>Application of a domain-specific BERT for detection of speech recognition errors in radiology reports</article-title><source>Radiol Artif Intell</source><year>2022</year><month>07</month><volume>4</volume><issue>4</issue><fpage>e210185</fpage><pub-id pub-id-type="doi">10.1148/ryai.210185</pub-id><pub-id pub-id-type="medline">35923373</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="web"><article-title>Structured model outputs</article-title><source>OpenAI 
Developers</source><access-date>2025-08-20</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://developers.openai.com/api/docs/guides/structured-outputs">https://developers.openai.com/api/docs/guides/structured-outputs</ext-link></comment></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="web"><article-title>Models</article-title><source>OpenAI Developers</source><access-date>2025-06-11</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://developers.openai.com/api/docs/models">https://developers.openai.com/api/docs/models</ext-link></comment></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Briggs</surname><given-names>AH</given-names> </name><name name-style="western"><surname>O&#x2019;Brien</surname><given-names>BJ</given-names> </name></person-group><article-title>The death of cost-minimization analysis?</article-title><source>Health Econ</source><year>2001</year><month>03</month><volume>10</volume><issue>2</issue><fpage>179</fpage><lpage>184</lpage><pub-id pub-id-type="doi">10.1002/hec.584</pub-id><pub-id pub-id-type="medline">11252048</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Krupp</surname><given-names>L</given-names> </name><name name-style="western"><surname>Gei&#x00DF;ler</surname><given-names>D</given-names> </name><name name-style="western"><surname>Lukowicz</surname><given-names>P</given-names> </name><name name-style="western"><surname>Karolus</surname><given-names>J</given-names> </name></person-group><article-title>Towards sustainable web agents: a plea for transparency and dedicated metrics for energy consumption</article-title><source>arXiv</source><comment>Preprint posted online on Feb 25, 2025</comment><pub-id
pub-id-type="doi">10.48550/ARXIV.2502.17903</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Strubell</surname><given-names>E</given-names> </name><name name-style="western"><surname>Ganesh</surname><given-names>A</given-names> </name><name name-style="western"><surname>McCallum</surname><given-names>A</given-names> </name></person-group><article-title>Energy and policy considerations for deep learning in NLP</article-title><conf-name>Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics</conf-name><conf-date>Jul 28 to Aug 2, 2019</conf-date><pub-id pub-id-type="doi">10.18653/v1/P19-1355</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Darves</surname><given-names>B</given-names> </name></person-group><article-title>Physician specialty compensation trends: salaries on the rise, but increases mostly modest</article-title><source>NEJM CareerCenter Resources</source><year>2025</year><access-date>2026-05-09</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://resources.nejmcareercenter.org/article/physician-specialty-compensation-trends-salaries-on-the-rise-but-increases-mostly-modest/">https://resources.nejmcareercenter.org/article/physician-specialty-compensation-trends-salaries-on-the-rise-but-increases-mostly-modest/</ext-link></comment></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Clopper</surname><given-names>CJ</given-names> </name><name name-style="western"><surname>Pearson</surname><given-names>ES</given-names> </name></person-group><article-title>The use of confidence or fiducial limits illustrated in the case of the 
binomial</article-title><source>Biometrika</source><year>1934</year><volume>26</volume><issue>4</issue><fpage>404</fpage><lpage>413</lpage><pub-id pub-id-type="doi">10.1093/biomet/26.4.404</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cameron</surname><given-names>AC</given-names> </name><name name-style="western"><surname>Gelbach</surname><given-names>JB</given-names> </name><name name-style="western"><surname>Miller</surname><given-names>DL</given-names> </name></person-group><article-title>Bootstrap-based improvements for inference with clustered errors</article-title><source>Rev Econ Stat</source><year>2008</year><month>08</month><volume>90</volume><issue>3</issue><fpage>414</fpage><lpage>427</lpage><pub-id pub-id-type="doi">10.1162/rest.90.3.414</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wong</surname><given-names>A</given-names> </name><name name-style="western"><surname>Otles</surname><given-names>E</given-names> </name><name name-style="western"><surname>Donnelly</surname><given-names>JP</given-names> </name><etal/></person-group><article-title>External validation of a widely implemented proprietary sepsis prediction model in hospitalized patients</article-title><source>JAMA Intern Med</source><year>2021</year><month>08</month><day>1</day><volume>181</volume><issue>8</issue><fpage>1065</fpage><lpage>1070</lpage><pub-id pub-id-type="doi">10.1001/jamainternmed.2021.2626</pub-id><pub-id pub-id-type="medline">34152373</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ostermayer</surname><given-names>DG</given-names> </name><name 
name-style="western"><surname>Braunheim</surname><given-names>B</given-names> </name><name name-style="western"><surname>Mehta</surname><given-names>AM</given-names> </name><name name-style="western"><surname>Ward</surname><given-names>J</given-names> </name><name name-style="western"><surname>Andrabi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Sirajuddin</surname><given-names>AM</given-names> </name></person-group><article-title>External validation of the Epic sepsis predictive model in 2 county emergency departments</article-title><source>JAMIA Open</source><year>2024</year><month>12</month><volume>7</volume><issue>4</issue><fpage>ooae133</fpage><pub-id pub-id-type="doi">10.1093/jamiaopen/ooae133</pub-id><pub-id pub-id-type="medline">39545248</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wasylewicz</surname><given-names>ATM</given-names> </name><name name-style="western"><surname>van de Burgt</surname><given-names>BWM</given-names> </name><name name-style="western"><surname>Manten</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Contextualized drug-drug interaction management improves clinical utility compared with basic drug-drug interaction management in hospitalized patients</article-title><source>Clin Pharmacol Ther</source><year>2022</year><month>08</month><volume>112</volume><issue>2</issue><fpage>382</fpage><lpage>390</lpage><pub-id pub-id-type="doi">10.1002/cpt.2624</pub-id><pub-id pub-id-type="medline">35486411</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Felisberto</surname><given-names>M</given-names> </name><name name-style="western"><surname>Lima</surname><given-names>GDS</given-names> </name><name 
name-style="western"><surname>Celuppi</surname><given-names>IC</given-names> </name><etal/></person-group><article-title>Override rate of drug-drug interaction alerts in clinical decision support systems: a brief systematic review and meta-analysis</article-title><source>Health Informatics J</source><year>2024</year><volume>30</volume><issue>2</issue><fpage>14604582241263242</fpage><pub-id pub-id-type="doi">10.1177/14604582241263242</pub-id><pub-id pub-id-type="medline">38899788</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Suri</surname><given-names>A</given-names> </name></person-group><article-title>AI as a second reader can reduce radiologists&#x2019; workload and increase accuracy in screening mammography</article-title><source>Radiol Artif Intell</source><year>2024</year><month>11</month><volume>6</volume><issue>6</issue><fpage>e240624</fpage><pub-id pub-id-type="doi">10.1148/ryai.240624</pub-id><pub-id pub-id-type="medline">39441106</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="web"><article-title>MIMIC-III clinical database</article-title><source>PhysioNet</source><access-date>2026-05-14</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://physionet.org/content/mimiciii/1.4/">https://physionet.org/content/mimiciii/1.4/</ext-link></comment></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="web"><article-title>CheXpert: a large chest radiograph dataset</article-title><source>Stanford ML Group</source><access-date>2026-05-14</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://stanfordmlgroup.github.io/competitions/chexpert/">https://stanfordmlgroup.github.io/competitions/chexpert/</ext-link></comment></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="web"><article-title>Open-i: open 
access biomedical image search engine</article-title><source>National Library of Medicine</source><access-date>2026-05-14</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://openi.nlm.nih.gov/">https://openi.nlm.nih.gov/</ext-link></comment></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Supplementary methodology and materials, including detailed large language model prompts, extended cost-efficiency derivations, and supplementary performance tables and figures.</p><media xlink:href="medinform_v14i1e87368_app1.docx" xlink:title="DOCX File, 4634 KB"/></supplementary-material></app-group></back></article>