<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id><journal-id journal-id-type="publisher-id">medinform</journal-id><journal-id journal-id-type="index">7</journal-id><journal-title>JMIR Medical Informatics</journal-title><abbrev-journal-title>JMIR Med Inform</abbrev-journal-title><issn pub-type="epub">2291-9694</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v14i1e90374</article-id><article-id pub-id-type="doi">10.2196/90374</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Expert Evaluation and Consensus on GPT-4o Summaries of Clinical Letters: Validation and Results of the Framework and Implementation of AI Tools Project</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Deschepper</surname><given-names>Mieke</given-names></name><degrees>MSc, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Rogge</surname><given-names>Helga</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Colpaert</surname><given-names>Kirsten</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib></contrib-group><aff id="aff1"><institution>Data Science Institute, Ghent University Hospital</institution><addr-line>Corneel Heymanslaan 10</addr-line><addr-line>Ghent</addr-line><addr-line>Flanders</addr-line><country>Belgium</country></aff><aff id="aff2"><institution>Department of Internal Medicine and Pediatrics, Ghent University</institution><addr-line>Ghent</addr-line><addr-line>Flanders</addr-line><country>Belgium</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Coristine</surname><given-names>Andrew</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Jung</surname><given-names>Kwang Yul</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Lai</surname><given-names>Xiangxun</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Mieke Deschepper, MSc, PhD, Data Science Institute, Ghent University Hospital, Corneel Heymanslaan 10, Ghent, Flanders, 9000, Belgium, +32 9 3321814; <email>mieke.deschepper@uzgent.be</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>11</day><month>5</month><year>2026</year></pub-date><volume>14</volume><elocation-id>e90374</elocation-id><history><date date-type="received"><day>29</day><month>12</month><year>2025</year></date><date date-type="rev-recd"><day>17</day><month>03</month><year>2026</year></date><date 
date-type="accepted"><day>17</day><month>03</month><year>2026</year></date></history><copyright-statement>&#x00A9; Mieke Deschepper, Helga Rogge, Kirsten Colpaert. Originally published in JMIR Medical Informatics (<ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org">https://medinform.jmir.org</ext-link>), 11.5.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org/">https://medinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://medinform.jmir.org/2026/1/e90374"/><abstract><sec><title>Background</title><p>Large language models (LLMs) are increasingly used to summarize clinical documents; yet, automated metrics often inadequately capture clinical relevance and safety. In the initial phase of the &#x201C;Framework and Implementation of AI Tools,&#x201D; an expert-driven, cocreated evaluation methodology was established to assess LLM-generated discharge letter summaries, combining prompt design considerations with intuitive expert appraisal.</p></sec><sec><title>Objective</title><p>This study aimed to quantify expert agreement and interrater reliability on LLM summaries of discharge letters, identify frequent and clinically relevant errors, and evaluate practical implications for integrating LLMs into documentation workflows.</p></sec><sec sec-type="methods"><title>Methods</title><p>Thirty expert-curated synthetic Dutch discharge letters were summarized. Thirty-one clinicians from Flemish care settings (1 university hospital, 2 private hospitals, and 2 general practice circles) evaluated the summaries. The evaluation framework consisted of 61 binary layout items assessing whether required sections and formatting were correctly present, 33 content items (correct or complete vs incorrect, subcategorizing missing, irrelevant, and hallucinated information), a 4-point global quality rating, and an open comment. Statistical analyses included descriptive statistics, mixed effects ordinal regression on the global score, consensus (agreement per question or letter) percentages, interrater reliability (Cohen &#x03BA;, intraclass correlation coefficient [ICC], Fleiss &#x03BA;, and prevalence index), and thematic synthesis of comments.</p></sec><sec sec-type="results"><title>Results</title><p>Layout adherence was high (88%), especially in the conclusion section. The positive response rate for content was overall moderate (78%), with the best performance observed in the medical history section and the lowest performance observed in the medication section, which also showed the highest rate of hallucinations and the weakest interrater consensus. Across all sections, missing information was the most common error. 
Nearly 70% of global ratings were &#x201C;good&#x201D; or &#x201C;very good.&#x201D; Higher positive response rates for content predicted better global scores (&#x03B2;=.079; <italic>P</italic>&#x003C;.001), while layout adherence and participant specialty showed no significant association with global scores. Consensus was high for the layout questions (median 96.8%, IQR 90.2%-100%) and somewhat lower for content (median 83.9%, IQR 67.7%-96.8%), with the lowest agreement in the medication section. Interrater agreement was moderate (median Cohen &#x03BA;=0.36, IQR 0.29-0.43; range 0.07&#x2010;0.56), but overall reliability was high (ICC 0.945, 95% CI 0.942-0.948), indicating strong consistency at the global level despite interrater variability. The prevalence index demonstrated that high ICC values were partly driven by the strong prevalence of affirmative responses in layout items, while content items showed more balanced distributions and lower agreement.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Our framework offers a robust approach for evaluating LLM-generated discharge summaries, balancing usability and clinical relevance. Semantic integrity, especially regarding medication details, was identified as a key vulnerability. Perceived overall quality was driven by the positive response rate for content. High ICC values for the global score combined with lower item-level agreement point to the need for clearer, context-specific prompts and standardized evaluation criteria to reduce interrater variability. Human oversight and targeted automated checks for omissions and hallucinations are essential for safe clinical deployment.</p></sec></abstract><kwd-group><kwd>large language model</kwd><kwd>LLM</kwd><kwd>clinical letters</kwd><kwd>expert agreement</kwd><kwd>interrater reliability</kwd><kwd>hallucinations</kwd><kwd>omissions</kwd><kwd>consensus</kwd><kwd>natural language processing</kwd><kwd>artificial intelligence</kwd><kwd>health care AI</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>The evaluation of large language models (LLMs) in health care has predominantly centered on accuracy metrics; yet, only a small fraction of studies (6.45%) specifically address the evaluation of medical LLMs [<xref ref-type="bibr" rid="ref1">1</xref>]. Recognizing the limitations of this narrow focus, the &#x201C;Framework and Implementation of AI Tools&#x201D; (FRAIT) project [<xref ref-type="bibr" rid="ref2">2</xref>] established a collaborative, cocreative workflow with hospital providers to develop a more comprehensive evaluation approach. Building on the structured planning principles outlined by Tam et al [<xref ref-type="bibr" rid="ref3">3</xref>], the FRAIT project emphasized the importance of preparatory phases before human evaluation.</p><p>Unlike prior studies that often relied on general or model-centric questions [<xref ref-type="bibr" rid="ref4">4</xref>], our approach prioritized trustworthiness and a detailed examination of individual sections within medical summaries. A key innovation was the separation of evaluation criteria into &#x201C;layout&#x201D; (the presence and structure of required sections) and &#x201C;content&#x201D; (the accuracy and completeness of clinical information).
This distinction acknowledges that while layout can be objectively assessed, content evaluation is inherently more complex and nuanced.</p><p>Recent literature [<xref ref-type="bibr" rid="ref5">5</xref>] underscores the necessity of using real-world patient care data to ensure that LLM evaluations reflect actual clinical conditions. There is a growing consensus on the need for standardized frameworks that define evaluation tasks and dimensions, address specialty-specific gaps, and mitigate potential biases. Additionally, comprehensive cost-benefit analyses and transparent reporting of AI system failures are essential for the responsible integration of LLMs into clinical workflows.</p><p>Traditional automated metrics such as ROUGE and BLEU, although widely used, have been shown to correlate poorly with expert human assessments in clinical contexts [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>]. These metrics often overlook critical aspects such as semantic accuracy, clinical relevance, and contextual appropriateness. Consequently, expert-driven, consensus-based evaluation frameworks are increasingly advocated.</p><p>LLM-powered summarization tools hold significant promise for distilling essential information from lengthy discharge letters and medical records, thereby supporting clinicians in making timely, informed decisions [<xref ref-type="bibr" rid="ref8">8</xref>]. Reliable summarization methods can enhance workflow efficiency, reduce cognitive burden, and improve care quality. However, the adoption of LLMs in clinical documentation also raises concerns regarding accuracy, safety, and interpretability.</p><p>To address these challenges, the FRAIT project systematically evaluated LLM-generated summaries of expert-curated synthetic letters using a diverse panel of health care professionals. An intuitive evaluation tool was developed and applied to medical discharge letters, enabling a thorough analysis of both structural and semantic aspects.</p><p>The aim of this study was to systematically evaluate the reliability, accuracy, and usability of LLM-generated summaries of clinical discharge letters using the consensus-based FRAIT framework. By engaging a multidisciplinary panel and using a structured evaluation tool, we sought to (1) quantify expert agreement on summary quality, (2) identify the most frequent and clinically relevant errors, and (3) assess the practical implications of integrating LLMs into clinical documentation workflows.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><p>We followed the QUEST (Quality of Information, Understanding and Reasoning, Expression Style and Persona, Safety and Harm, and Trust and Confidence) human evaluation framework [<xref ref-type="bibr" rid="ref3">3</xref>] to set up our method; the mapping of our approach onto this framework is provided in Appendix 1.1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><sec id="s2-1"><title>Participants</title><p>A total of 33 clinicians (including 2 nonphysicians) were recruited from 3 hospitals (Ghent University Hospital, AZ Sint-Lucas Ghent, and AZ Oudenaarde) and 2 general practitioner circles (Huisartsenkring Schelde en Leie and Huisartsenvereniging Ghent). Two physicians did not complete the evaluation due to technical login issues (n=1) or time constraints (n=1).
Just over half of the group (n=16, 51.6%) were hospital-based specialists, followed by general practitioners (n=13, 41.9%), with the remainder consisting of other health care professionals (n=2, 6.5%; <xref ref-type="table" rid="table1">Table 1</xref>). All participants evaluated 30 summaries generated from the respective medical discharge letters. More details on the number of years of clinical practice for each participant type can be found in Appendix 1.2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Overview of organization and participants.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top">Type of organization (n=31) and name</td><td align="left" valign="top">Values, n (%)</td></tr></thead><tbody><tr><td align="left" valign="top">Hospital (n=19, 61.29%)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>AZ Oudenaarde</td><td align="left" valign="top">5 (16.13)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>AZ Sint-Lucas Ghent</td><td align="left" valign="top">6 (19.35)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Ghent University Hospital</td><td align="left" valign="top">8 (25.81)</td></tr><tr><td align="left" valign="top">GP<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup> circles (n=12, 38.71%)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Huisartsenvereniging Ghent</td><td align="left" valign="top">10 (32.26)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Huisartsenkring Schelde en Leie</td><td align="left" valign="top">2 (6.45)</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>GP: general practitioner.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s2-2"><title>Workshop Design</title><p>A workshop on user interface and user experience design was conducted on October 23, 2024, at Ghent University Hospital. The primary objective was to explore how an evaluation tool for LLM-generated summaries should be structured from a physician&#x2019;s perspective. A total of 26 participants attended, representing all participating organizations. Feedback collected during the workshop was synthesized and shared with technical implementers to finalize the design of the evaluation tool. The user interface was tested by 2 individuals: one data scientist (MD) and one physician (HR). A subsequent test session involved 5 additional technical participants and 3 physicians. These sessions focused on usability, clarity, and alignment with clinical workflows.</p></sec><sec id="s2-3"><title>Evaluation Timeline</title><p>The evaluation phase timeline is illustrated in <xref ref-type="fig" rid="figure1">Figure 1</xref>. A total duration of 6 weeks was allocated for the completion of the evaluation. From April 1 to April 30, 2025, initial testing of the evaluation tool was conducted by 2 physicians and 1 data scientist. A pilot demonstration was held for the Data Science Institute from Ghent University Hospital on April 29, 2025.
Following feedback from this session, a slide-based walkthrough was developed to clarify procedural steps prior to demonstrating the tool. An online presentation for (local) coordinators at participating institutions took place on May 5, 2025; participants received a manual with screenshots and keyboard shortcuts. All coordinators evaluated the tool over a week and reported no major issues.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Evaluation timeline of the Framework and Implementation of AI Tools (FRAIT) project, from the initial testing starting April 1, 2025, until the demonstration of the tool at the FRAIT symposium on October 22, 2025. FAQ: frequently asked question.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e90374_fig01.png"/></fig><p>A broader online demonstration for all participants occurred on May 15, 2025, and was announced via newsletter and email. This session included login instructions for the secure platform and a comprehensive walkthrough of the evaluation tool. For those unable to attend, a recording of the session was made available. On May 19, 2025, a follow-up feedback session addressed questions, which were subsequently compiled into a frequently asked questions document distributed to all users. The formal evaluation phase spanned from May 15 to June 15, 2025, with an extended period continuing until June 30, 2025. Subsequently, a brief demonstration was presented by a general practitioner during the FRAIT symposium on October 22, 2025.</p><p>Several educational resources were provided for physicians, including an extensive manual (with, among other sections, examples of potential questions), a live demonstration with an available recording, and a dedicated question-and-answer session. Additionally, during the evaluation phase, support was offered via email and telephone during working hours.</p></sec><sec id="s2-4"><title>Input for the Evaluation Tool</title><p>The evaluation was conducted using a standardized prompt derived from the FRAIT questionnaire [<xref ref-type="bibr" rid="ref2">2</xref>]. This prompt was designed to capture essential clinical and structural elements of discharge summaries across predefined categories: <italic>general, medical history, investigations, hospital course, medication, follow-up,</italic> and <italic>conclusion</italic>. An overview of the number of questions per category is provided in Appendix 2.1 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>.</p></sec><sec id="s2-5"><title>Data and Materials</title><p>Thirty expert-curated synthetic Dutch medical discharge letters were generated according to the protocol outlined in Appendix 3 in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>. For the purpose of this study, we used the term <italic>synthetic clinical letters</italic> to describe synthetically modified clinical documents derived from real patient records that were systematically transformed to prevent reidentification while preserving clinical structure and plausibility. This process included complete removal of all direct and indirect personal identifiers, temporal shifting of dates, and targeted modification of clinical content (eg, laterality, diagnoses not relevant to the clinical scenario, and physiological and laboratory parameters) where deemed necessary.
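</p><p>Although all transformations in this study were performed manually, the principle of a consistent temporal shift can be illustrated with a minimal sketch (illustrative only; the function and variable names are assumptions, not part of the study protocol):</p><preformat>
# Minimal, illustrative sketch of a consistent per-letter date shift of the
# kind described in the protocol above. All names here are assumptions.
import random
from datetime import date, timedelta

def shift_dates(letter_dates: list[date], max_days: int = 365) -> list[date]:
    """Shift every date in one letter by the same random offset so that
    clinical intervals (eg, admission to discharge) are preserved."""
    offset = timedelta(days=random.randint(-max_days, max_days))
    return [d + offset for d in letter_dates]

# Example: a 6-day admission remains a 6-day admission after shifting.
# shift_dates([date(2023, 1, 10), date(2023, 1, 16)])
</preformat><p>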
To minimize the risk of medical logic drift introduced by shifting dates, all synthetic letters were manually generated by 1 physician and independently reviewed by a senior physician to ensure that clinical decisions, treatments, and guideline-based care remained historically plausible despite the temporal adjustments. The resulting letters do not correspond to any real individual but retain realistic clinical narratives suitable for evaluation purposes. All data were processed to minimize any risk of patient reidentification, and the approach was reviewed and approved by the relevant institutional committees (reference ONZ-2024&#x2010;0273 and ONZ-2024&#x2010;0304). These letters served as the source material for summarization and subsequent evaluation. The LLM used for this task was GPT-4o, configured with a temperature setting of 0 to minimize interpretative variability. In this phase, a single prompt was used so that all evaluators assessed identical outputs.</p></sec><sec id="s2-6"><title>Evaluation Framework</title><p>The evaluation questions were organized into 4 categories (<xref ref-type="fig" rid="figure2">Figure 2</xref>). Layout questions (n=61) were binary and assessed whether the specific sections requested in the prompt were present in the generated summary (for instance, evaluators were asked: <italic>&#x201C;Show the General section in the summary. Is the requested section included?&#x201D;</italic>). Additionally, these questions assessed the format in which sections should be presented, such as bullet points or plain text. Content questions (n=33) examined the correctness and completeness of individual items. Responses could be <italic>yes</italic> or <italic>no</italic>, with the latter requiring further classification as <italic>incomplete</italic>, <italic>too much or irrelevant information</italic>, or <italic>incorrect information or hallucination</italic>. When answering <italic>no</italic>, evaluators were required to indicate the erroneous content in either the source or the summary. An example question was <italic>&#x201C;Show the Admission Date. Is the content correct? If not, indicate what is incorrect.&#x201D;</italic> Taken together, the evaluation framework comprised 4 distinct categories of items: layout, content, a global score, and an open question. Layout items assessed compliance with prompt-specified structural requirements and were scored as binary (yes or no). For each layout item, participants could view the original prompt and were asked to evaluate whether the LLM-generated summary adhered exactly to the requested formatting and structure (eg, presence of specific section headings, use of bold formatting, bullet lists with a maximum number of items, or mandated section titles such as &#x201C;medication&#x201D;). Strict adherence was required for layout items to reflect real-world prompt-based use of LLMs in clinical documentation workflows.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Overview of the 96 evaluation questions: layout, content, global score, and open question. A full overview of all questions can be found in Appendix 2.2 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e90374_fig02.png"/></fig><p>Content items evaluated the correctness and completeness of the generated summaries relative to the original clinical text and were also scored as binary (yes or no).
When content was rated as incorrect (no), participants were required to assign one or more predefined error categories: missing information (clinically relevant information present in the source text but absent from the summary), irrelevant details (information present in the source text but not requested or inappropriate for the section, including excessive or misplaced detail), or hallucinations (information not present in the original clinical text).</p><p>The global score question (n=1) asked evaluators to provide a global rating of the summary quality on a 4-point scale: <italic>very bad, bad, good,</italic> and <italic>very good</italic>. Finally, an open question (n=1) invited evaluators to add comments, provide suggestions, or note blind spots in the original letter that were not captured by the prompt.</p><p>This structured approach ensured a comprehensive assessment of both structural fidelity and semantic accuracy. <xref ref-type="fig" rid="figure2">Figure 2</xref> provides an overview of the evaluation question types. The complete evaluation questionnaire, including all layout and content items, is provided in Appendix 2.2 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>.</p></sec><sec id="s2-7"><title>Statistical Analysis</title><sec id="s2-7-1"><title>General Approach</title><p>Descriptive statistics were computed to characterize participants, organizational affiliation, and medical specialty, expressed as frequencies (n) and percentages. In addition, descriptive analyses summarized the overall performance on layout and content questions, both globally and stratified by category.</p></sec><sec id="s2-7-2"><title>Score Analysis</title><p>The global score question offered 4 ordinal response options: <italic>very bad</italic> (1), <italic>bad</italic> (2), <italic>good</italic> (3), and <italic>very good</italic> (4). We deliberately selected a 4-point Likert scale without a neutral midpoint to avoid noncommittal responses and to encourage evaluators to make a clear judgment on the overall quality of each summary. Global scores were reported as frequencies and percentages and visualized using heatmaps by participant and by discharge letter. To explore associations between summary quality and evaluation metrics, a mixed effects ordinal regression model was applied. The dependent variable was the global score (treated as ordinal, 1&#x2010;4), while predictors included the number of &#x201C;yes&#x201D; responses for content and layout questions (analyzed separately) and participant specialty. Participant identity was modeled as a random effect to account for intrarater correlation.</p></sec><sec id="s2-7-3"><title>Consensus Assessment</title><p>For each question pertaining to the same discharge letter, agreement among participants was examined. Responses were dichotomized as yes or no, without distinguishing the reason for a negative response in content-related items. Disagreement within a question-letter pair was recorded, and consensus was aggregated across dimensions (content and layout) and categories.</p></sec><sec id="s2-7-4"><title>Interrater Reliability</title><p>To evaluate consistency among raters, multiple measures of interrater agreement were used. Pairwise agreement was quantified using Cohen &#x03BA; for all rater pairs across content and layout questions. The median &#x03BA; values per rater were compared using the Kruskal-Wallis test, followed by the Dunn post hoc test when significant differences were observed.
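</p><p>As an illustration, the pairwise agreement computation can be sketched as follows (a minimal sketch, assuming the ratings are held in a long-format table with one row per rater-letter-question combination; the column names are assumptions, not the study&#x2019;s actual implementation):</p><preformat>
# Minimal sketch: Cohen kappa for every rater pair, plus the median kappa
# per rater that feeds the Kruskal-Wallis comparison. Names are assumed.
from itertools import combinations

import pandas as pd
from sklearn.metrics import cohen_kappa_score

def pairwise_kappa(ratings: pd.DataFrame) -> pd.DataFrame:
    """Cohen kappa for each rater pair over the items both raters answered."""
    # One column per rater, one row per (letter, question) item.
    wide = ratings.pivot_table(index=["letter", "question"], columns="rater",
                               values="answer", aggfunc="first")
    rows = []
    for r1, r2 in combinations(wide.columns, 2):
        both = wide[[r1, r2]].dropna()  # items answered by both raters
        rows.append({"rater_1": r1, "rater_2": r2,
                     "kappa": cohen_kappa_score(both[r1], both[r2])})
    return pd.DataFrame(rows)

# Median pairwise kappa per rater (the statistic compared across raters):
# pairs = pairwise_kappa(df)
# long = pd.concat([pairs.rename(columns={"rater_1": "rater"}),
#                   pairs.rename(columns={"rater_2": "rater"})])
# median_per_rater = long.groupby("rater")["kappa"].median()
</preformat><p>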
The number and proportion of significant differences were reported (n, %). The intraclass correlation coefficient (ICC) was calculated for all questions combined (content treated as binary: yes or no) and for the global score assigned to each summary. The ICC was recalculated after excluding outlier raters to assess robustness. For multirater agreement on content questions, Fleiss &#x03BA; was computed, both overall and by category. As Fleiss &#x03BA; lacks a closed-form CI, a 95% CI was estimated via bootstrapping (10,000 iterations). Results were visualized using heatmaps and box-violin plots for pairwise Cohen &#x03BA; and point plots with error bars for ICC and Fleiss &#x03BA; estimates.</p><p>To better understand discrepancies between agreement metrics, we calculated the prevalence index (PI) for all binary items. The PI measures how strongly responses lean toward &#x201C;yes&#x201D; or &#x201C;no,&#x201D; ranging from 0 (balanced) to 1 (highly skewed); for an item answered by N raters, PI=|n<sub>yes</sub>&#x2212;n<sub>no</sub>|/N. High prevalence can lower Cohen &#x03BA; and inflate ICC because ICC is influenced by between-item variance rather than categorical balance. We therefore computed the PI both overall and per category for layout and content items to assess how prevalence contributed to differences between &#x03BA; and ICC. We also calculated the mean PI per category to capture item-level skewness. While the overall PI reflects global imbalance across the dataset, the mean PI shows how consistently individual items display skewed response patterns. Reporting both helps distinguish whether prevalence effects stem from a few extreme items or from systematic imbalance across categories.</p></sec></sec><sec id="s2-8"><title>Sensitivity Analysis</title><p>A sensitivity analysis was performed comparing results from all raters with those obtained exclusively from the physician cohort. Further details can be found in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>.</p></sec><sec id="s2-9"><title>Qualitative Analysis</title><p>Responses to the open-ended question were analyzed manually and summarized thematically to identify recurring topics and suggestions.</p></sec><sec id="s2-10"><title>Ethical Considerations</title><p>This study involved the evaluation of LLM-generated summaries of <italic>synthetically modified</italic> discharge letters. All procedures complied with institutional and national regulations governing the secondary use of clinical data for research.</p><list list-type="order"><list-item><p>Human subjects research ethics review, exemptions, and approvals: The protocol for generating and using synthetic clinical discharge letters was reviewed and approved by the Ethical Committee of Ghent University Hospital (reference ONZ-2024&#x2010;0273 and ONZ-2024&#x2010;0304). The Ethical Committees of AZ Sint-Lucas Ghent and AZ Oudenaarde were informed of the study in accordance with institutional governance requirements. The Committee determined that the study met applicable national and institutional regulatory requirements governing secondary use of health data. The institutional Data Protection Officer (DPO) was formally consulted during protocol preparation. The DPO reviewed the planned data flow, synthetic transformation procedures, and external private cloud deployment. Given that only synthetically transformed data were processed externally, the DPO and Ethics Committee determined that the study did not meet the institutional threshold for a full Data Protection Impact Assessment.
The residual privacy risk was classified as low and proportionate to the research objectives.</p></list-item><list-item><p>Informed consent or waiver language: The source documents used to generate the synthetic letters originated from previously collected hospital records of deceased patients. For this retrospective secondary use, the Ethics Committee granted approval and waived the requirement for additional patient consent. The Committee determined that, given the exclusive inclusion of deceased patients, the retrospective design, and the implementation of structured deidentification and synthetic transformation procedures prior to further processing, the study did not require renewed consent. Although no anonymization method can guarantee absolute elimination of risk, all direct identifiers were removed, and the transformation process was designed to minimize any potential risk of reidentification.</p></list-item><list-item><p>Privacy and confidentiality protection: All clinical discharge letters underwent a rigorous structured deidentification and synthetic transformation procedure by protocol before use in the evaluation tool, including the removal of direct identifiers and the generalization or modification of indirect identifiers, temporal shifting of dates, modification of clinical elements when necessary to reduce reidentification risk, and internal review of transformed letters to assess residual linkage risk. These procedures were designed to preserve clinical realism while minimizing the likelihood of reidentification. Although no anonymization strategy can fully eliminate risk, the applied safeguards were considered proportionate to the study objectives and were reviewed as part of the approved protocol. The evaluation was conducted within a secure, access-controlled hospital environment with authenticated access and governance in accordance with institutional data protection standards.</p></list-item><list-item><p>Compensation: The study was supported by governmental research funding. Participating institutions received financial support to facilitate clinician participation in accordance with local institutional policies. Compensation arrangements were managed at the institutional level and reflected time commitment only; they were not dependent on performance or evaluation outcomes.</p></list-item><list-item><p>Protection of participant identity in figures and supplementary material: All clinician participants were assigned anonymized alphanumeric codes (eg, &#x201C;Z19&#x201D;) for all analyses and visualizations. Figures and appendices have been updated to replace any synthetic first names with anonymous codes. No identifiable images or personal information of participants are included in the manuscript or supplementary material. If identifiable material were to be included in future dissemination, explicit written consent would be obtained and submitted as required.</p></list-item></list></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Evaluation Tool</title><p>The workshop generated numerous suggestions for enhancing the FRAIT evaluation tool. A comprehensive list of implemented features is provided in Appendix 5.1 in <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>. Examples include keyboard shortcuts for navigation, progress indicators, autosave functionality, and text highlighting. 
<xref ref-type="fig" rid="figure3">Figure 3</xref> illustrates the main components of the evaluation interface, while additional screenshots and examples of the content question user interface are included in <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Screenshot of the evaluation tool highlighting its main components.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e90374_fig03.png"/></fig></sec><sec id="s3-2"><title>Evaluation Outcomes</title><p>Across all layout-related questions, 88% of responses were rated as &#x201C;yes,&#x201D; with the highest proportion observed in the <italic>conclusion</italic> section. For content-related questions, 78% of responses were affirmative, with the <italic>medical history</italic> category achieving the highest score. In contrast, the <italic>medication</italic> section demonstrated the lowest positive response rate and the highest proportion of hallucinations. Overall, hallucinations accounted for 3% of content-related responses. When content questions were answered &#x201C;no,&#x201D; missing information was the predominant reason, particularly in the <italic>conclusion</italic> section. The <italic>general</italic> section contained the largest proportion of irrelevant details (<xref ref-type="table" rid="table2">Table 2</xref>).</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Percentage of answers divided by content and layout and by category.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Category</td><td align="left" valign="bottom" colspan="4">Domain: content (%)</td><td align="left" valign="bottom">Domain: layout (%)</td></tr></thead><tbody><tr><td align="left" valign="top"/><td align="left" valign="top">Yes (total=77.5)</td><td align="left" valign="top">No missing information (total=15.6)</td><td align="left" valign="top">No irrelevant details (total=4.8)</td><td align="left" valign="top">No hallucination (total=3.1)</td><td align="left" valign="top">Yes (total=88.4)</td></tr><tr><td align="left" valign="top">General</td><td align="left" valign="top">72.8</td><td align="left" valign="top">3.4</td><td align="left" valign="top">20.1</td><td align="left" valign="top">3.8</td><td align="left" valign="top">90.3</td></tr><tr><td align="left" valign="top">Medical history</td><td align="left" valign="top">83.7</td><td align="left" valign="top">12.6</td><td align="left" valign="top">1.8</td><td align="left" valign="top">1.9</td><td align="left" valign="top">83.8</td></tr><tr><td align="left" valign="top">Investigations</td><td align="left" valign="top">74.1</td><td align="left" valign="top">22.9</td><td align="left" valign="top">2.1</td><td align="left" valign="top">0.9</td><td align="left" valign="top">94.3</td></tr><tr><td align="left" valign="top">Hospital course</td><td align="left" valign="top">81.1</td><td align="left" valign="top">12.8</td><td align="left" valign="top">4.4</td><td align="left" valign="top">1.7</td><td align="left" valign="top">93.2</td></tr><tr><td align="left" valign="top">Medication</td><td align="left" valign="top">67.8</td><td align="left" valign="top">18.1</td><td align="left" valign="top">6.3</td><td align="left" valign="top">7.8</td><td align="left" valign="top">83.2</td></tr><tr><td align="left" valign="top">Follow-up</td><td align="left" 
valign="top">80</td><td align="left" valign="top">11.2</td><td align="left" valign="top">3.6</td><td align="left" valign="top">5.2</td><td align="left" valign="top">87.7</td></tr><tr><td align="left" valign="top">Conclusion</td><td align="left" valign="top">68.4</td><td align="left" valign="top">25.9</td><td align="left" valign="top">4.6</td><td align="left" valign="top">1.2</td><td align="left" valign="top">97.8</td></tr></tbody></table></table-wrap></sec><sec id="s3-3"><title>Global Scores</title><p>Nearly 70% of participants rated summaries as <italic>good</italic> or <italic>very good</italic>. A detailed visual overview is presented in Appendix 6 in <xref ref-type="supplementary-material" rid="app6">Multimedia Appendix 6</xref>. <xref ref-type="fig" rid="figure4">Figure 4</xref> displays an ordered heatmap of global scores (1-4) by participant and discharge letter.</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Heatmap of the global scores by participant (y-axis) and generated summary (x-axis).</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e90374_fig04.png"/></fig><p>Mixed effect ordinal regression results are summarized in <xref ref-type="table" rid="table3">Table 3</xref>. The proportion of &#x201C;yes&#x201D; responses for content questions was significantly associated with higher global scores (&#x03B2;=.079; <italic>P</italic>&#x003C;.001), whereas layout performance showed no significant effect. Participant specialty did not exhibit a significant influence on scoring.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Mixed effects ordinal regression on global score versus content, layout, and participant specialty.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top">Predictor</td><td align="left" valign="top">&#x03B2; (SE)</td><td align="left" valign="top">z-score</td><td align="left" valign="top"><italic>P</italic> value</td><td align="left" valign="top">95% CI</td></tr></thead><tbody><tr><td align="left" valign="top">Content<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td><td align="char" char="." valign="top">.079 (0.008)</td><td align="char" char="." valign="top">9.781</td><td align="char" char="." valign="top">&#x003C;.001</td><td align="char" char="." valign="top">0.063 to 0.095</td></tr><tr><td align="left" valign="top">Layout<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td><td align="char" char="." valign="top">.006 (0.011)</td><td align="char" char="." valign="top">0.558</td><td align="char" char="." valign="top">.58</td><td align="char" char="." valign="top">&#x2212;0.016 to 0.028</td></tr><tr><td align="left" valign="top">Specialization</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>GP<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup></td><td align="left" valign="top">Reference</td><td align="left" valign="top">Reference</td><td align="left" valign="top">Reference</td><td align="left" valign="top">Reference</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Specialist</td><td align="char" char="." valign="top">&#x2212;0.404 (0.380)</td><td align="char" char="." 
valign="top">&#x2212;1.062</td><td align="char" char="." valign="top">.29</td><td align="char" char="." valign="top">&#x2212;1.149 to 0.341</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Other caregiver</td><td align="char" char="." valign="top">&#x2212;0.544 (0.774)</td><td align="char" char="." valign="top">&#x2212;0.703</td><td align="char" char="." valign="top">.48</td><td align="char" char="." valign="top">&#x2212;2.061 to 0.973</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>Yes on the content question (%).</p></fn><fn id="table3fn2"><p><sup>b</sup>Yes on the layout questions (%).</p></fn><fn id="table3fn3"><p><sup>c</sup> GP=general practitioner.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-4"><title>Consensus</title><p>Consensus for layout questions was high, with a median of 96.77% (IQR 90.22&#x2010;100.00). Content questions showed lower agreement (median 83.87%, IQR 67.74&#x2010;96.77). The overall consensus of both categories was 93.55% (IQR 80.65&#x2010;100.00; Appendix 7 in <xref ref-type="supplementary-material" rid="app7">Multimedia Appendix 7</xref>). When examining the items in detail, consensus levels were substantially lower for the no categories (Appendix 7 in <xref ref-type="supplementary-material" rid="app7">Multimedia Appendix 7</xref>, Figure 7.2 in <xref ref-type="supplementary-material" rid="app7">Multimedia Appendix 7</xref>).</p><p>Category-level analysis showed that the <italic>medication</italic> section had the lowest consensus for both layout and content, while the <italic>conclusion</italic> section achieved perfect agreement for layout but lower for content (<xref ref-type="table" rid="table4">Table 4</xref>).</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Consensus by question category and type.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Question category</td><td align="left" valign="bottom">Content, median (IQR)</td><td align="left" valign="bottom">Layout, median (IQR)</td><td align="left" valign="bottom">All<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup> (content % layout), median (IQR)</td></tr></thead><tbody><tr><td align="left" valign="top">General</td><td align="left" valign="top">82.3 (61.3&#x2010;96.8)</td><td align="left" valign="top">96.8 (83.9&#x2010;100)</td><td align="left" valign="top">96.8 (77.4&#x2010;100)</td></tr><tr><td align="left" valign="top">Medical history</td><td align="left" valign="top">91.9 (74.2&#x2010;100)</td><td align="left" valign="top">96.8 (90.3&#x2010;100)</td><td align="left" valign="top">96.8 (87.1&#x2010;100)</td></tr><tr><td align="left" valign="top">Medication</td><td align="left" valign="top">69.4 (58.1&#x2010;83.9)</td><td align="left" valign="top">93.5 (80.6&#x2010;100)</td><td align="left" valign="top">87.1 (71.0&#x2010;96.8)</td></tr><tr><td align="left" valign="top">Investigations</td><td align="left" valign="top">80.6 (65.3&#x2010;90.3)</td><td align="left" valign="top">100 (96.8&#x2010;100)</td><td align="left" valign="top">93.5 (74.2&#x2010;100)</td></tr><tr><td align="left" valign="top">Hospital course</td><td align="left" valign="top">88.7 (74.2&#x2010;93.5)</td><td align="left" valign="top">100 (93.5&#x2010;100)</td><td align="left" valign="top">96.8 (87.1&#x2010;100)</td></tr><tr><td align="left" valign="top">Follow-up</td><td align="left" valign="top">87.1 
(71.0&#x2010;96.8)</td><td align="left" valign="top">96.8 (90.3&#x2010;100)</td><td align="left" valign="top">93.5 (87.1&#x2010;100)</td></tr><tr><td align="left" valign="top">Conclusion</td><td align="left" valign="top">82.3 (64.5&#x2010;93.5)</td><td align="left" valign="top">100 (96.8&#x2010;100)</td><td align="left" valign="top">98.4 (89.5&#x2010;100)</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>Combination of content and layout questions.</p></fn></table-wrap-foot></table-wrap><p>Additional information can be found in Appendix 7 (Figure 7.3 and Figure 7.4) in <xref ref-type="supplementary-material" rid="app7">Multimedia Appendix 7</xref>, which presents the consensus for each summary and question.</p></sec><sec id="s3-5"><title>Interrater Agreement</title><p>Pairwise Cohen &#x03BA; values ranged from 0.07 to 0.56, with a median of 0.36 (IQR 0.29&#x2010;0.43), indicating only fair to moderate pairwise agreement among raters (<xref ref-type="fig" rid="figure5">Figure 5</xref>; Appendix 8.1 in <xref ref-type="supplementary-material" rid="app8">Multimedia Appendix 8</xref>). Participant Z31 demonstrated notably lower agreement compared to others. The Dunn post hoc test confirmed significant differences for this participant (Appendix 8.2 in <xref ref-type="supplementary-material" rid="app8">Multimedia Appendix 8</xref>).</p><fig position="float" id="figure5"><label>Figure 5.</label><caption><p>Pairwise Cohen &#x03BA; between all participants. Diamonds represent the median &#x03BA; for each rater; triangles represent the mean pairwise Cohen &#x03BA;. Z31=participant Z31.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e90374_fig05.png"/></fig><p>The ICC for all questions combined was 0.945 (95% CI 0.942&#x2010;0.948). Layout questions achieved slightly higher ICC (0.961, 95% CI 0.958&#x2010;0.963), while content questions treated as binary yielded 0.908 (95% CI 0.899&#x2010;0.916). Excluding participant Z31 did not produce a significant difference in the ICC (Appendix 8.3 in <xref ref-type="supplementary-material" rid="app8">Multimedia Appendix 8</xref>), so this participant was retained in the analysis.</p><p>The ICC for global scores was 0.776 (95% CI 0.649&#x2010;0.875). Fleiss &#x03BA; for content questions was lower (0.221, 95% CI 0.205&#x2010;0.237), with category-specific values ranging from 0.198 to 0.240 (<xref ref-type="table" rid="table5">Table 5</xref>).</p><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>Interrater overview: intraclass correlation coefficient (ICC) for binary content and layout questions and Fleiss &#x03BA; for categorical content questions.</p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top">Method and group</td><td align="left" valign="top">Estimate</td><td align="left" valign="top">95% CI</td></tr></thead><tbody><tr><td align="left" valign="top">ICC</td><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Content (binary) and layout</td><td align="char" char="." valign="top">0.945</td><td align="char" char="." valign="top">0.942&#x2010;0.948</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Layout</td><td align="char" char="." valign="top">0.961</td><td align="char" char="."
valign="top">0.958&#x2010;0.963</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Content&#x2014;binary (yes/no)</td><td align="char" char="." valign="top">0.908</td><td align="char" char="." valign="top">0.899&#x2010;0.916</td></tr><tr><td align="left" valign="top">Fleiss</td><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Content&#x2014;categorical</td><td align="char" char="." valign="top">0.221</td><td align="char" char="." valign="top">0.205&#x2010;0.237</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>OK</td><td align="char" char="." valign="top">0.240</td><td align="char" char="." valign="top">0.223&#x2010;0.257</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Missing information</td><td align="char" char="." valign="top">0.198</td><td align="char" char="." valign="top">0.177&#x2010;0.218</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Irrelevant details</td><td align="char" char="." valign="top">0.202</td><td align="char" char="." valign="top">0.170&#x2010;0.232</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Hallucinations</td><td align="char" char="." valign="top">0.235</td><td align="char" char="." valign="top">0.173&#x2010;0.293</td></tr></tbody></table></table-wrap><p>Further category-specific ICC analysis (<xref ref-type="table" rid="table6">Table 6</xref>; Appendix 8.4 in <xref ref-type="supplementary-material" rid="app8">Multimedia Appendix 8</xref>) confirmed patterns observed in descriptive consensus measures. Interrater reliability was excellent across most sections, with median ICCs around 0.92 for the majority of items. The highest reliability was observed in the <italic>medical history</italic>, <italic>follow-up</italic>, and <italic>conclusion</italic> sections, where ICCs reached up to 0.96. The medication section showed comparatively lower, but still substantial, reliability for content (ICC=0.83). 
Notably, the <italic>conclusion</italic> section had a lower ICC for layout (0.65), indicating more variability among raters in assessing structural aspects of this section.</p><table-wrap id="t6" position="float"><label>Table 6.</label><caption><p>Intraclass correlation coefficients (95% CIs) for layout and content questions by question category.</p></caption><table id="table6" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Question category</td><td align="left" valign="bottom">Content</td><td align="left" valign="bottom">Layout</td><td align="left" valign="bottom">Content and layout</td></tr></thead><tbody><tr><td align="left" valign="top">General</td><td align="left" valign="top">0.927 (0.902&#x2010;0.947)</td><td align="left" valign="top">0.904 (0.887&#x2010;0.920)</td><td align="left" valign="top">0.927 (0.916&#x2010;0.938)</td></tr><tr><td align="left" valign="top">Medical history</td><td align="left" valign="top">0.923 (0.910&#x2010;0.935)</td><td align="left" valign="top">0.977 (0.974&#x2010;0.980)</td><td align="left" valign="top">0.964 (0.960&#x2010;0.968)</td></tr><tr><td align="left" valign="top">Medication</td><td align="left" valign="top">0.825 (0.777&#x2010;0.867)</td><td align="left" valign="top">0.938 (0.926&#x2010;0.948)</td><td align="left" valign="top">0.919 (0.907&#x2010;0.930)</td></tr><tr><td align="left" valign="top">Investigations</td><td align="left" valign="top">0.870 (0.838&#x2010;0.898)</td><td align="left" valign="top">0.934 (0.918&#x2010;0.948)</td><td align="left" valign="top">0.919 (0.905&#x2010;0.931)</td></tr><tr><td align="left" valign="top">Hospital course</td><td align="left" valign="top">0.897 (0.868&#x2010;0.922)</td><td align="left" valign="top">0.915 (0.899&#x2010;0.929)</td><td align="left" valign="top">0.917 (0.904&#x2010;0.928)</td></tr><tr><td align="left" valign="top">Follow-up</td><td align="left" valign="top">0.900 (0.875&#x2010;0.922)</td><td align="left" valign="top">0.974 (0.970&#x2010;0.979)</td><td align="left" valign="top">0.955 (0.949&#x2010;0.961)</td></tr><tr><td align="left" valign="top">Conclusion</td><td align="left" valign="top">0.931 (0.904&#x2010;0.954)</td><td align="left" valign="top">0.654 (0.559&#x2010;0.737)</td><td align="left" valign="top">0.953 (0.942&#x2010;0.962)</td></tr></tbody></table></table-wrap><p>Further inspection of the medication section showed that lower agreement among raters was partly associated with ambiguous prompt wording, particularly instructions such as &#x201C;show temporary medication.&#x201D; These ambiguities led evaluators to apply different interpretations of what information should be included in the summary, contributing to reduced consensus even in cases where the model output itself was not incorrect. This indicates that both the characteristics of the model output and the clarity of the prompt influenced the observed agreement patterns in this section.</p></sec><sec id="s3-6"><title>PI Analysis</title><p>The PI revealed substantial differences between layout and content items. Layout questions exhibited a high degree of prevalence skew, with an overall PI of 0.768 and a mean PI of 0.856 (SD 0.211). Several categories showed PI values above 0.90, indicating that layout-related items were almost always rated identically (typically &#x201C;yes&#x201D;), resulting in limited variability across raters. Content items demonstrated more balanced response distributions, with an overall PI of 0.550 and a mean PI of 0.622 (SD 0.316).
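</p><p>For reference, the PI computation defined in the Methods section can be sketched as follows (a minimal sketch, assuming a long-format table with binary answers coded 1 for &#x201C;yes&#x201D; and 0 for &#x201C;no&#x201D;; the column names are assumptions):</p><preformat>
# Minimal sketch of the prevalence index: PI = |n_yes - n_no| / N per item,
# plus the overall PI and the mean PI per category reported in Table 7.
# Column names ("category", "question", "answer") are assumptions.
import pandas as pd

def prevalence_index(answers: pd.Series) -> float:
    """0 for a perfectly balanced item, 1 for a fully skewed one."""
    return abs(2 * answers.mean() - 1)  # equals |p_yes - p_no|

# item_pi = df.groupby(["category", "question"])["answer"].apply(prevalence_index)
# mean_pi_per_category = item_pi.groupby("category").mean()  # mean PI per category
# overall_pi = prevalence_index(df["answer"])                # overall PI
</preformat><p>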
Category-specific PI values ranged from 0.430 (medication) to 0.720 (medical history), reflecting greater heterogeneity in item difficulty and a broader range of rater judgments.</p><p>A detailed overview of PI values per category is presented in <xref ref-type="table" rid="table7">Table 7</xref>.</p><table-wrap id="t7" position="float"><label>Table 7.</label><caption><p>Overview of prevalence index (PI) values for layout and content items by question category.</p></caption><table id="table7" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Category</td><td align="left" valign="bottom">Layout PI<sup><xref ref-type="table-fn" rid="table7fn1">a</xref></sup>, mean (SD)</td><td align="left" valign="bottom">Content PI<sup><xref ref-type="table-fn" rid="table7fn2">b</xref></sup>, mean (SD)</td></tr></thead><tbody><tr><td align="left" valign="top">General</td><td align="left" valign="top">0.824 (0.244)</td><td align="left" valign="top">0.573 (0.351)</td></tr><tr><td align="left" valign="top">Conclusion</td><td align="left" valign="top">0.956 (0.087)</td><td align="left" valign="top">0.573 (0.294)</td></tr><tr><td align="left" valign="top">Follow-up</td><td align="left" valign="top">0.886 (0.160)</td><td align="left" valign="top">0.665 (0.274)</td></tr><tr><td align="left" valign="top">Medication</td><td align="left" valign="top">0.754 (0.258)</td><td align="left" valign="top">0.430 (0.282)</td></tr><tr><td align="left" valign="top">Investigations</td><td align="left" valign="top">0.905 (0.196)</td><td align="left" valign="top">0.545 (0.300)</td></tr><tr><td align="left" valign="top">Hospital course</td><td align="left" valign="top">0.875 (0.227)</td><td align="left" valign="top">0.675 (0.282)</td></tr><tr><td align="left" valign="top">Medical history</td><td align="left" valign="top">0.864 (0.178)</td><td align="left" valign="top">0.720 (0.317)</td></tr></tbody></table><table-wrap-foot><fn id="table7fn1"><p><sup>a</sup>Overall PI: 0.768; PI, mean (SD): 0.856 (0.211).</p></fn><fn id="table7fn2"><p><sup>b</sup>Overall PI: 0.550; PI, mean (SD): 0.622 (0.316).</p></fn></table-wrap-foot></table-wrap><p>These findings confirm that layout items were highly homogeneous, while content items displayed more meaningful variability. Importantly, the high PI values for layout items help explain the discrepancy between the relatively low median Cohen &#x03BA; and the high ICC: skewed response distributions can suppress &#x03BA;, whereas ICC remains elevated because it is driven by between-item variance rather than categorical balance.</p><p>Finally, responses to the open-ended question were analyzed qualitatively and summarized by topic (<xref ref-type="supplementary-material" rid="app9">Multimedia Appendix 9</xref>).</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>This study examined expert evaluation and interrater agreement in the assessment of LLM-generated discharge summaries, using the FRAIT framework to evaluate structural fidelity, semantic accuracy, and usability.</p><p>Regarding the evaluation of structural integrity, layout adherence was consistently high, whereas content accuracy, particularly for medication details, remained a critical challenge.
These findings align with prior research on clinical text summarization, which identifies medication and follow-up instructions as frequent sources of error and hallucination [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref10">10</xref>]. Although layout adherence was 88%, the remaining 12% reflects structural inconsistencies that may hinder automated processing and pose challenges for electronic health record integration. Even infrequent deviations from required section formatting can disrupt downstream workflow steps, indicating that layout performance should be considered a potential vulnerability rather than a strength.</p><p>The strong association between the positive response rate for content and global scores underscores the importance of semantic integrity in clinical documentation. Evaluators prioritized correctness over format, reinforcing FRAIT&#x2019;s emphasis on clinically meaningful metrics rather than purely structural criteria. Future iterations should therefore incorporate automated validation of medication-related information. This also aligns with the context-aware evaluation approach of Agrawal et al [<xref ref-type="bibr" rid="ref6">6</xref>].</p><p>Interrater reliability analysis revealed high ICC values for global scores but low pairwise agreement for individual questions, especially in complex categories such as medication. Our PI findings confirm that the discrepancy between item-level &#x03BA; and overall ICC is partly attributable to prevalence skew in layout items, rather than true inconsistency in evaluator behavior. This suggests that while evaluators share similar global impressions, item-level judgments are subject to interpretation variability, a challenge also noted in previous discharge summary studies. A deeper analysis of the medication section revealed that ambiguous prompts (eg, &#x201C;show temporary medication&#x201D;) contributed to low agreement.</p><p>The medication section warrants particular attention, as it exhibited the highest hallucination rate (7.8%) among all content categories. A deeper review showed, however, that most of these hallucinations concerned the start, stop, or change status of medications rather than incorrect drug content. For example, in one case, vitamin D was listed on admission but not repeated in the discharge medication; the model labeled it as &#x201C;unchanged,&#x201D; which some raters considered a hallucination because no corresponding entry was present. This section also showed the lowest positive response rate and weakest interrater consensus, reflecting both its clinical complexity and the fact that some prompt instructions were interpreted differently by different raters. These findings highlight that LLM-generated medication summaries cannot be relied upon without human validation. When using this model and prompt configuration, human-in-the-loop verification remains essential to ensure clinical safety, and future implementations should include targeted safeguards, such as automated checks for omissions and medication-related inconsistencies. This underscores the need for clearer, context-specific prompts to optimize both LLM output and evaluation consistency, as suggested by Borse et al [<xref ref-type="bibr" rid="ref11">11</xref>].
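</p><p>The suppressive effect of skewed response distributions on Cohen &#x03BA;, discussed below as the prevalence paradox, can be made concrete with a small numerical sketch (illustrative counts, not study data): two rater pairs with identical 90% raw agreement obtain very different &#x03BA; values once the prevalence of &#x201C;yes&#x201D; responses becomes extreme.</p><preformat>
# Illustration of the prevalence paradox (illustrative counts only).
def cohens_kappa(a: int, b: int, c: int, d: int) -> float:
    """Cohen's kappa from a 2x2 agreement table:
    a = both raters "yes", d = both "no", b and c = disagreements."""
    n = a + b + c + d
    p_o = (a + d) / n                      # observed agreement
    p1, p2 = (a + b) / n, (a + c) / n      # marginal "yes" rates
    p_e = p1 * p2 + (1 - p1) * (1 - p2)    # chance agreement
    return (p_o - p_e) / (1 - p_e)

# Balanced prevalence, 90% raw agreement: kappa = 0.80
print(cohens_kappa(a=45, b=5, c=5, d=45))
# Skewed prevalence (mostly "yes"), same 90% raw agreement: kappa ~ 0.44
print(cohens_kappa(a=85, b=5, c=5, d=5))
</preformat><p>In both cases, the observed agreement is identical (90%); only the marginal prevalence differs, mirroring the near-uniform &#x201C;yes&#x201D; responses observed for the layout items.</p><p>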
Follow-up workshops were organized to refine prompt design and ensure unambiguous interpretation, which is essential for improving interrater agreement.</p><p>The PI analysis clarifies the discrepancy between the moderate median Cohen &#x03BA; (0.36) and the high ICC (0.945). Layout items showed consistently high PI values, indicating minimal variability and near-uniform &#x201C;yes&#x201D; responses. Such skewed distributions suppress &#x03BA; (the prevalence paradox [<xref ref-type="bibr" rid="ref12">12</xref>]) while inflating ICC, which is driven by between-item variance rather than categorical balance. Content items showed lower and more variable PI values, reflecting a more balanced mix of responses and greater item-level complexity. This explains why &#x03BA; was lower for content judgments: raters disagreed more frequently on clinically nuanced items, although overall rating patterns remained consistent enough to keep ICC high. Together, these findings show that the high ICC partly reflects the high prevalence of affirmative responses in structural items rather than purely strong interrater consistency. Reporting PI alongside &#x03BA; and ICC therefore provides a more accurate basis for interpreting agreement patterns.</p><p>Usability enhancements generated during the workshop, such as keyboard shortcuts, progress indicators, and autosave, reflect user-centered design principles and are expected to improve efficiency and reduce evaluator fatigue. These improvements support FRAIT&#x2019;s goal of creating a scalable, reproducible evaluation environment for clinical LLMs.</p><p>An additional observation concerns infrastructure and security considerations. Although these aspects were outside the scope of this article, they represented the largest portion of the project&#x2019;s budget. Establishing a secure, cloud-based environment was essential to ensure compliance with data protection standards and enable safe collaboration. This underscores the resource-intensive nature of deploying LLMs in clinical settings and highlights the importance of planning for robust infrastructure early in implementation.</p><p>While our findings confirm earlier reports of promising outcomes for LLMs in clinical documentation [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref13">13</xref>], infrastructure and human factors remain critical determinants of success. Addressing these challenges will be essential for scaling LLM-based solutions in health care.</p></sec><sec id="s4-2"><title>Limitations</title><p>A key limitation of this study is the exclusive use of a single standardized prompt and a single LLM (GPT-4o). Although GPT-4o is recognized for its advanced capabilities, our findings should not be generalized to other model families, including open-weight systems or medical-specific models, nor to alternative prompting strategies. While this controlled design was necessary to ensure consistent comparison across evaluators, it also restricts the generalizability of our findings. LLM performance evolves rapidly, and outputs can vary substantially across models, model versions, and prompt formulations. 
As a result, this study should be interpreted primarily as a validation of the FRAIT evaluation framework, rather than a definitive benchmark of LLM summarization performance.</p><p>As our task is clinical summarization, it is plausible that smaller, domain-specific models trained on medical corpora could outperform general-purpose models on factuality, medication handling, and terminology precision while also reducing hallucination risk and computational overhead. The growing adoption of domain-specific models, such as Meditron and HuatuoGPT, which have demonstrated strong performance in clinical contexts [<xref ref-type="bibr" rid="ref1">1</xref>], suggests that specialized models may offer advantages in positive response rate and interpretability that general-purpose models cannot match. Furthermore, larger general-purpose LLMs may introduce unnecessary complexity and increase the risk of hallucinations compared to smaller, clinically optimized models, as noted by Lehman et al [<xref ref-type="bibr" rid="ref14">14</xref>]. These considerations highlight the need for comparative studies involving multiple LLMs to better understand the generalizability, safety, and optimal balance between model size and specialization in health care applications. We did not test such models here. Therefore, our results may underestimate the attainable accuracy and safety for targeted deployments.</p><p>Our evaluation used expert-curated synthetic discharge letters, developed to maintain clinical plausibility while omitting direct identifiers. The original section structure and general layout were largely maintained; however, the transformation process may have reduced certain real-world irregularities such as incomplete phrasing, extraneous noise, or ambiguous references that commonly occur in routine documentation. This distribution shift may have resulted in inflated performance metrics, particularly regarding layout adherence, compared to those expected from more variable and unstructured real-world notes, and may similarly overstate content accuracy. Therefore, our results should be regarded as an upper bound established under controlled conditions, rather than indicative of field performance.</p><p>The free-text explanations accompanying &#x201C;no&#x201D; responses could not be systematically analyzed because many raters did not follow the intended instructions for marking errors. As a result, the raw annotations were inconsistent in format and therefore unsuitable for reliable qualitative or quantitative reporting. Although the rater pool included clinicians from different specialties, it may not fully capture the diversity of clinical perspectives. Variability in interpretation between general practitioners and hospital physicians, especially for nuanced categories such as medication details, could have influenced agreement scores and overall evaluations. Ensuring broader representation across specialties and experience levels would strengthen the reliability of future assessments.</p><p>Certain sections of the discharge summaries, such as the conclusion, contained limited data. This imbalance restricted the robustness of evaluation for those components and may have skewed the overall usability and accuracy metrics. Future research should aim for more balanced datasets to enable thorough analysis across all content categories.
Furthermore, as multiple layout categories exhibited PI values exceeding 0.90, certain reliability measures, especially &#x03BA;, should be interpreted with caution because high prevalence can restrict their capacity to distinguish genuine agreement from systematic response patterns.</p><p>Analysis of pairwise Cohen &#x03BA; identified one rater whose agreement consistently differed from that of the other evaluators. A post hoc review indicated that delays in initiating and completing the evaluation tasks may have contributed to this variability. This finding highlights the importance of voluntary participation and adequate protected time for expert evaluation to safeguard data quality in future studies. Notably, overall interrater reliability remained high despite this variability, suggesting that the evaluation approach was robust under routine clinical conditions.</p><p>A reliable measure of time-on-task could not be obtained because participants were free to pause or take breaks while completing evaluations. Although the platform recorded start and end times, these time stamps do not reflect actual working time and therefore cannot be used to assess efficiency. This limits our ability to draw conclusions about the workflow impact of the evaluation process.</p></sec><sec id="s4-3"><title>Implications and Future Work</title><p>The findings of this study have several practical implications for the integration of LLMs into clinical workflows. While LLMs show promise in assisting with documentation tasks, they should not replace expert review. Human oversight remains essential to ensure patient safety and compliance with clinical standards. Automated quality assurance mechanisms, particularly those targeting omissions and hallucinations in critical areas, such as medication and follow-up instructions, will be vital for real-world deployment.</p><p>Prompt design emerged as a key factor influencing evaluation consistency. Ambiguous or poorly defined prompts contributed to variability in interrater agreement and model output quality. Future research should therefore prioritize the development of clear, context-specific prompts and explore adaptive prompt optimization strategies to enhance reliability.</p><p>Model refinement is another important area for future work. Comparative evaluations of multiple LLMs, including domain-specific models, are needed to identify configurations that balance positive response rate, interpretability, and computational efficiency. Additionally, longitudinal studies should assess the real-world impact of LLM-assisted documentation on clinical workflows, patient outcomes, and resource allocation. These investigations will provide critical insights into the scalability and sustainability of LLM integration in health care.</p><p>To ensure standardization, the evaluation was restricted to a single medical prompt, despite participants having previously developed personalized prompts in a separate workshop [<xref ref-type="bibr" rid="ref2">2</xref>]. As emphasized by Verma et al [<xref ref-type="bibr" rid="ref8">8</xref>], the value of summarization is maximized when outputs are tailored to the specific needs of different clinical roles and contexts. Customized summaries, adapted for specialists, general practitioners, or administrative staff, ensure that each user receives the most relevant and actionable information for their responsibilities.
This approach not only improves usability but also supports safer and more effective integration of AI tools into diverse health care environments.</p><p>Building on recent insights from Croxford et al [<xref ref-type="bibr" rid="ref7">7</xref>], future research should explore the potential of LLMs not only as tools for generating clinical documentation but also as evaluators, or &#x201C;judges,&#x201D; of their own outputs. They propose that context-aware LLMs, when properly calibrated and validated, could assist in the assessment of clinical summaries by providing rapid, scalable, and consistent feedback on accuracy, completeness, and relevance. Integrating LLMs as judges within expert-driven frameworks such as FRAIT may enhance the efficiency of evaluation processes, support continuous quality improvement, and facilitate benchmarking across diverse clinical scenarios. However, this approach will require rigorous validation to ensure that automated judgments align with clinical standards and expert consensus and that potential biases or limitations are systematically addressed.</p></sec><sec id="s4-4"><title>Conclusions</title><p>The FRAIT framework demonstrates strong potential for evaluating LLM-generated discharge summaries, offering a structured approach that balances usability and clinical relevance. While layout fidelity was consistently high, content accuracy, particularly for medication details, remains a critical challenge. Interrater reliability findings highlight the need for clearer prompts and standardized evaluation criteria to reduce variability. Usability enhancements developed through co-creation with clinicians further support FRAIT&#x2019;s scalability and practical adoption. Overall, LLMs show promise as an aid in clinical documentation, but robust validation mechanisms and expert oversight are essential to ensure safety and accuracy. Future work should focus on prompt optimization, model refinement, and automated quality checks to advance reliable integration of LLMs into health care workflows.</p></sec></sec></body><back><ack><p>We would like to express our sincere gratitude to the entire team at the Data Science Institute of Ghent University Hospital, as well as to the management, program participants, and coordinators representing the participating organizations. Artificial intelligence was used within the project to enhance efficiency, namely to generate the summaries with GPT-4o (as stated in the <italic>Methods</italic> section). This manuscript was improved with the assistance of Enterprise Copilot.</p></ack><notes><sec><title>Funding</title><p>The &#x201C;FRAIT&#x201D; project was funded by the Belgian Government through the Data Innovation project.</p></sec><sec><title>Data Availability</title><p>The synthetic discharge letters analyzed in this study are not publicly available due to governance restrictions.
Access to these materials can be requested through the institutional Data Access Committee of Ghent University Hospital, subject to approval and data use conditions.</p></sec></notes><fn-group><fn fn-type="con"><p>Conceptualization: MD, HR, KC</p><p>Data curation: MD, HR</p><p>Formal analysis: MD</p><p>Funding acquisition: MD, KC</p><p>Investigation: MD, HR</p><p>Methodology: MD, KC</p><p>Project administration: MD, HR</p><p>Resources: MD, KC</p><p>Software: MD</p><p>Supervision: KC</p><p>Validation: KC</p><p>Visualization: MD, HR</p><p>Writing &#x2013; original draft: MD</p><p>Writing &#x2013; review &#x0026; editing: MD, HR, KC</p></fn><fn fn-type="conflict"><p>The authors are affiliated with the Data Science Institute of Ghent University Hospital, which developed the evaluation tool and model container used in this study. The authors declare that they have no financial or commercial interest in the tool and no intention to commercialize it. The work was conducted solely within an academic and clinical quality improvement context.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">DPO</term><def><p>Data Protection Officer</p></def></def-item><def-item><term id="abb2">FRAIT</term><def><p>Framework and Implementation of AI Tools</p></def></def-item><def-item><term id="abb3">ICC</term><def><p>intraclass correlation coefficient</p></def></def-item><def-item><term id="abb4">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb5">PI</term><def><p>prevalence index</p></def></def-item><def-item><term id="abb6">QUEST</term><def><p>Quality of Information, Understanding and Reasoning, Expression Style and Persona, Safety and Harm, and Trust and Confidence</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shool</surname><given-names>S</given-names> </name><name name-style="western"><surname>Adimi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Saboori Amleshi</surname><given-names>R</given-names> </name><name name-style="western"><surname>Bitaraf</surname><given-names>E</given-names> </name><name name-style="western"><surname>Golpira</surname><given-names>R</given-names> </name><name name-style="western"><surname>Tara</surname><given-names>M</given-names> </name></person-group><article-title>A systematic review of large language model (LLM) evaluations in clinical medicine</article-title><source>BMC Med Inform Decis Mak</source><year>2025</year><month>03</month><day>7</day><volume>25</volume><issue>1</issue><fpage>117</fpage><pub-id pub-id-type="doi">10.1186/s12911-025-02954-4</pub-id><pub-id pub-id-type="medline">40055694</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Deschepper</surname><given-names>M</given-names> </name><name name-style="western"><surname>Rogge</surname><given-names>H</given-names> </name><name name-style="western"><surname>Syx</surname><given-names>M</given-names> </name><name name-style="western"><surname>Colpaert</surname><given-names>K</given-names> </name></person-group><article-title>Tailoring discharge summaries to health care providers&#x2019; needs (part 1 of the Framework and Implementation of AI Tools project): user-centered design approach</article-title><source>JMIR 
Med Inform</source><year>2026</year><month>03</month><day>4</day><volume>14</volume><fpage>e80613</fpage><pub-id pub-id-type="doi">10.2196/80613</pub-id><pub-id pub-id-type="medline">41813333</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tam</surname><given-names>TYC</given-names> </name><name name-style="western"><surname>Sivarajkumar</surname><given-names>S</given-names> </name><name name-style="western"><surname>Kapoor</surname><given-names>S</given-names> </name><etal/></person-group><article-title>A framework for human evaluation of large language models in healthcare derived from literature review</article-title><source>NPJ Digit Med</source><year>2024</year><month>09</month><day>28</day><volume>7</volume><issue>1</issue><fpage>258</fpage><pub-id pub-id-type="doi">10.1038/s41746-024-01258-7</pub-id><pub-id pub-id-type="medline">39333376</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Awasthi</surname><given-names>R</given-names> </name><name name-style="western"><surname>Mishra</surname><given-names>S</given-names> </name><name name-style="western"><surname>Mahapatra</surname><given-names>D</given-names> </name><etal/></person-group><article-title>HumanELY: human evaluation of LLM yield, using a novel web-based evaluation tool</article-title><source>Health Informatics</source><comment>Preprint posted online in 2024</comment><pub-id pub-id-type="doi">10.1101/2023.12.22.23300458</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bedi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Orr-Ewing</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Testing and evaluation of health care applications of large language models</article-title><source>JAMA</source><year>2025</year><month>01</month><day>28</day><volume>333</volume><issue>4</issue><fpage>319</fpage><pub-id pub-id-type="doi">10.1001/jama.2024.21700</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Agrawal</surname><given-names>M</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>IY</given-names> </name><name name-style="western"><surname>Gulamali</surname><given-names>F</given-names> </name><name name-style="western"><surname>Joshi</surname><given-names>S</given-names> </name></person-group><article-title>The evaluation illusion of large language models in medicine</article-title><source>NPJ Digit Med</source><year>2025</year><month>10</month><day>7</day><volume>8</volume><issue>1</issue><fpage>600</fpage><pub-id pub-id-type="doi">10.1038/s41746-025-01963-x</pub-id><pub-id pub-id-type="medline">41057566</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Croxford</surname><given-names>E</given-names> </name><name name-style="western"><surname>Gao</surname><given-names>Y</given-names> </name><name
name-style="western"><surname>First</surname><given-names>E</given-names> </name><etal/></person-group><article-title>Evaluating clinical AI summaries with large language models as judges</article-title><source>NPJ Digit Med</source><year>2025</year><month>11</month><day>5</day><volume>8</volume><issue>1</issue><fpage>640</fpage><pub-id pub-id-type="doi">10.1038/s41746-025-02005-2</pub-id><pub-id pub-id-type="medline">41193667</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Verma</surname><given-names>R</given-names> </name><name name-style="western"><surname>Alsentzer</surname><given-names>E</given-names> </name><name name-style="western"><surname>Strasser</surname><given-names>Z</given-names> </name><etal/></person-group><article-title>Verifiable summarization of electronic health records using large language models to support chart review</article-title><source>medRxiv</source><year>2025</year><month>06</month><day>3</day><fpage>2025.06.02.25328807</fpage><pub-id pub-id-type="doi">10.1101/2025.06.02.25328807</pub-id><pub-id pub-id-type="medline">40502573</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Williams</surname><given-names>CYK</given-names> </name><name name-style="western"><surname>Subramanian</surname><given-names>CR</given-names> </name><name name-style="western"><surname>Ali</surname><given-names>SS</given-names> </name><etal/></person-group><article-title>Physician- and large language model-generated hospital discharge summaries</article-title><source>JAMA Intern Med</source><year>2025</year><month>07</month><day>1</day><volume>185</volume><issue>7</issue><fpage>818</fpage><lpage>825</lpage><pub-id pub-id-type="doi">10.1001/jamainternmed.2025.0821</pub-id><pub-id pub-id-type="medline">40323616</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rust</surname><given-names>P</given-names> </name><name name-style="western"><surname>Frings</surname><given-names>J</given-names> </name><name name-style="western"><surname>Meister</surname><given-names>S</given-names> </name><name name-style="western"><surname>Fehring</surname><given-names>L</given-names> </name></person-group><article-title>Evaluation of a large language model to simplify discharge summaries and provide cardiological lifestyle recommendations</article-title><source>Commun Med</source><year>2025</year><month>05</month><day>29</day><volume>5</volume><issue>1</issue><pub-id pub-id-type="doi">10.1038/s43856-025-00927-2</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Borse</surname><given-names>NS</given-names> </name><name name-style="western"><surname>Chatta Subramaniam</surname><given-names>R</given-names> </name><name name-style="western"><surname>Rebello</surname><given-names>NS</given-names> </name></person-group><article-title>Investigation of the inter-rater reliability between large language models and human raters in qualitative analysis</article-title><year>Oct 28, 2025</year><conf-name>2025 Physics Education Research Conference</conf-name><pub-id 
pub-id-type="doi">10.1119/perc.2025.pr.Borse</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zec</surname><given-names>S</given-names> </name><name name-style="western"><surname>Soriani</surname><given-names>N</given-names> </name><name name-style="western"><surname>Comoretto</surname><given-names>R</given-names> </name><name name-style="western"><surname>Baldi</surname><given-names>I</given-names> </name></person-group><article-title>High agreement and high prevalence: the paradox of Cohen&#x2019;s kappa</article-title><source>Open Nurs J</source><year>2017</year><volume>11</volume><fpage>211</fpage><lpage>218</lpage><pub-id pub-id-type="doi">10.2174/1874434601711010211</pub-id><pub-id pub-id-type="medline">29238424</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tung</surname><given-names>JYM</given-names> </name><name name-style="western"><surname>Gill</surname><given-names>SR</given-names> </name><name name-style="western"><surname>Sng</surname><given-names>GGR</given-names> </name><etal/></person-group><article-title>Comparison of the quality of discharge letters written by large language models and junior clinicians: single-blinded study</article-title><source>J Med Internet Res</source><year>2024</year><volume>26</volume><fpage>e57721</fpage><pub-id pub-id-type="doi">10.2196/57721</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Lehman</surname><given-names>E</given-names> </name><name name-style="western"><surname>Hernandez</surname><given-names>E</given-names> </name><name name-style="western"><surname>Mahajan</surname><given-names>D</given-names> </name><name name-style="western"><surname>Wulff</surname><given-names>J</given-names> </name><name name-style="western"><surname>Smith</surname><given-names>MJ</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Ziegler</surname><given-names>Z</given-names> </name></person-group><article-title>Do we still need clinical language models? 
</article-title><access-date>2026-05-05</access-date><conf-name>Conference on Health, Inference, and Learning</conf-name><conf-date>Apr 26-28, 2023</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://proceedings.mlr.press/v209/eric23a.html">https://proceedings.mlr.press/v209/eric23a.html</ext-link></comment></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>QUEST flow.</p><media xlink:href="medinform_v14i1e90374_app1.pdf" xlink:title="PDF File, 874 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Evaluation questions by category.</p><media xlink:href="medinform_v14i1e90374_app2.pdf" xlink:title="PDF File, 127 KB"/></supplementary-material><supplementary-material id="app3"><label>Multimedia Appendix 3</label><p>Protocol for the synthetic discharge summaries (FRAIT).</p><media xlink:href="medinform_v14i1e90374_app3.pdf" xlink:title="PDF File, 172 KB"/></supplementary-material><supplementary-material id="app4"><label>Multimedia Appendix 4</label><p>Sensitivity analysis.</p><media xlink:href="medinform_v14i1e90374_app4.pdf" xlink:title="PDF File, 138 KB"/></supplementary-material><supplementary-material id="app5"><label>Multimedia Appendix 5</label><p>Evaluation tool&#x2014;extra feature list.</p><media xlink:href="medinform_v14i1e90374_app5.pdf" xlink:title="PDF File, 874 KB"/></supplementary-material><supplementary-material id="app6"><label>Multimedia Appendix 6</label><p>Overview of the global score.</p><media xlink:href="medinform_v14i1e90374_app6.pdf" xlink:title="PDF File, 108 KB"/></supplementary-material><supplementary-material id="app7"><label>Multimedia Appendix 7</label><p>Consensus.</p><media xlink:href="medinform_v14i1e90374_app7.pdf" xlink:title="PDF File, 1315 KB"/></supplementary-material><supplementary-material id="app8"><label>Multimedia Appendix 8</label><p>Interrater agreement.</p><media xlink:href="medinform_v14i1e90374_app8.pdf" xlink:title="PDF File, 392 KB"/></supplementary-material><supplementary-material id="app9"><label>Multimedia Appendix 9</label><p>Overview of answers to the open-ended question.</p><media xlink:href="medinform_v14i1e90374_app9.pdf" xlink:title="PDF File, 138 KB"/></supplementary-material></app-group></back></article>