<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id><journal-id journal-id-type="publisher-id">medinform</journal-id><journal-id journal-id-type="index">7</journal-id><journal-title>JMIR Medical Informatics</journal-title><abbrev-journal-title>JMIR Med Inform</abbrev-journal-title><issn pub-type="epub">2291-9694</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v14i1e78838</article-id><article-id pub-id-type="doi">10.2196/78838</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Evaluating GPT-4 Responses on Scars or Keloids for Patient Education: Large Language Model Evaluation Study</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Rao</surname><given-names>Mingjun</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Xiujun</surname><given-names>Tang</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author"><name name-style="western"><surname>Haoyu</surname><given-names>Wang</given-names></name><degrees>BA</degrees><xref ref-type="aff" rid="aff1"/></contrib></contrib-group><aff id="aff1"><institution>Department of Plastic Surgery, Guizhou Provincial People's Hospital</institution><addr-line>83 Zhongshan East Road, Nanming 
District</addr-line><addr-line>Guiyang</addr-line><country>China</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Benis</surname><given-names>Arriel</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Assun&#x00E7;&#x00E3;o</surname><given-names>B&#x00E1;rbara Aline Ferreira</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Fukuzawa</surname><given-names>Fumitoshi</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Klewer</surname><given-names>J&#x00F6;rg</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Rawat</surname><given-names>Keshav Singh</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Tang Xiujun, MSc, Department of Plastic Surgery, Guizhou Provincial People's Hospital, 83 Zhongshan East Road, Nanming District, Guiyang, 550002, China, 86 15343315902; <email>xiujunsszx@163.com</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>27</day><month>2</month><year>2026</year></pub-date><volume>14</volume><elocation-id>e78838</elocation-id><history><date date-type="received"><day>10</day><month>06</month><year>2025</year></date><date date-type="accepted"><day>29</day><month>12</month><year>2025</year></date></history><copyright-statement>&#x00A9; Mingjun Rao, Tang Xiujun, Wang Haoyu. Originally published in JMIR Medical Informatics (<ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org">https://medinform.jmir.org</ext-link>), 27.2.2026. 
</copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org/">https://medinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://medinform.jmir.org/2026/1/e78838"/><abstract><sec><title>Background</title><p>Scars and keloids impose significant physical and psychological burdens on patients, often leading to functional limitations, cosmetic concerns, and mental health issues such as anxiety or depression. Patients increasingly turn to online platforms for information; however, existing web-based resources on scars and keloids are frequently unreliable, fragmented, or difficult to understand. 
Large language models such as GPT-4 show promise for delivering medical information, but their accuracy, readability, and potential to generate hallucinated content require validation for patient education applications.</p></sec><sec><title>Objective</title><p>This study aimed to systematically evaluate GPT-4&#x2019;s performance in providing patient education on scars and keloids, focusing on its accuracy, reliability, readability, and reference quality.</p></sec><sec sec-type="methods"><title>Methods</title><p>This study involved collecting 354 questions from Reddit communities (r/Keloids, r/SCAR, and r/PlasticSurgery), covering topics including treatment options, pre- and postoperative care, and psychological impacts. Each question was input into GPT-4 in independent sessions to mimic real-world patient interactions. Responses were evaluated using multiple tools: the Patient Education Materials Assessment Tool-Artificial Intelligence for understandability and actionability, DISCERN-AI for treatment information quality, the Global Quality Scale for overall information quality, and standard readability metrics (Flesch Reading Ease score, and Gunning Fog Index). Three plastic surgeons used the Natural Language Assessment Tool for Artificial Intelligence to rate the accuracy, safety, and clinical appropriateness, while the Reference Evaluation for Artificial Intelligence tool validated references for reference hallucination, relevance, and source quality. We conducted the same analysis to assess the quality of GPT-4&#x2013;generated content in response to questions from 3 medical websites.</p></sec><sec sec-type="results"><title>Results</title><p>GPT-4 demonstrated high accuracy and reliability. The Patient Education Materials Assessment Tool-Artificial Intelligence showed 75.5% understandability, DISCERN-AI rated responses as &#x201C;good&#x201D; (26.3/35), and the Global Quality Scale score was 4.28 of 5. 
Surgeons&#x2019; evaluations averaged 3.94 to 4.43 out of 5 across dimensions (accuracy 3.9, SD 0.7; safety 4.3, SD 0.8; clinical appropriateness 4.4, SD 0.5; actionability 4.1, SD 0.8; and effectiveness 4.1, SD 0.8). Readability analyses indicated moderate complexity (Flesch Reading Ease Score: 50.13; Gunning Fog Index: 12.68), corresponding to a 12th-grade reading level. Reference Evaluation for Artificial Intelligence identified 11.8% (383/3250) hallucinated references, while 88.2% (2867/3250) of references were real, with 95.1% (2724/2867) from authoritative sources (eg, government guidelines and the literature). The overall results about questions from medical websites were consistent with the answers to Reddit questions.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>GPT-4 has serious potential as a patient education tool for scars and keloids, offering reliable and accurate information. However, improvements in readability (to align with sixth to eighth grade standards) and reduction of reference hallucinations are essential to enhance accessibility and trustworthiness. Future large language model optimizations should prioritize simplifying medical language and strengthening reference validation mechanisms to maximize clinical utility.</p></sec></abstract><kwd-group><kwd>scar</kwd><kwd>keloid</kwd><kwd>GPT-4</kwd><kwd>patient education</kwd><kwd>generative AI</kwd><kwd>generative artificial intelligence</kwd><kwd>readability</kwd><kwd>large language model</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Scars and keloids are common skin healing outcomes [<xref ref-type="bibr" rid="ref1">1</xref>], often causing discomfort during the proliferative phase [<xref ref-type="bibr" rid="ref2">2</xref>]. 
When located on visible areas such as the face, they can severely impact a patient&#x2019;s appearance, leading to psychological distress such as low self-esteem, anxiety, and depression, which may further hinder social interactions and career development [<xref ref-type="bibr" rid="ref3">3</xref>]. Scars near joints may cause functional limitations, while perineal scars can result in long-term complications such as dyspareunia and dysmenorrhea [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>]. Many patients urgently seek to improve both the aesthetic and functional aspects of scars or keloids. However, treatments often require multimodal approaches over weeks to months, making high patient adherence crucial. Consequently, access to accurate, timely, and comprehensive management information is critical for patients to better understand treatment expectations, options, prognosis, and potential complications [<xref ref-type="bibr" rid="ref6">6</xref>-<xref ref-type="bibr" rid="ref9">9</xref>].</p><p>Currently, patients increasingly rely on internet-based health information [<xref ref-type="bibr" rid="ref10">10</xref>]. This trend faces multifaceted challenges, including information overload, variability in source credibility and content accuracy, and the health literacy required to understand the contents [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref12">12</xref>]. Notably, many online resources on scars or keloids are often unreliable, fragmented, or difficult to understand, failing to tackle the fundamental needs of patients with scars or keloids [<xref ref-type="bibr" rid="ref13">13</xref>].</p><p>In recent years, artificial intelligence (AI) tools powered by large language models (LLMs), such as GPT-4 (used by ChatGPT), have demonstrated significant potential in delivering medical information [<xref ref-type="bibr" rid="ref14">14</xref>]. 
GPT-4&#x2019;s capacity to generate natural language responses through interactive conversations could aid users in understanding intricate medical concepts, treatment, and management strategies, positioning it as a potentially valuable alternative to traditional search engines for accessing knowledge associated with scars or keloids [<xref ref-type="bibr" rid="ref15">15</xref>-<xref ref-type="bibr" rid="ref17">17</xref>].</p><p>It is reported that 52% of US adults have used LLMs, and GPT-4, as a leading LLM, receives over 5 billion monthly visits. In total, 39% of LLM users have used LLMs for health care queries [<xref ref-type="bibr" rid="ref18">18</xref>]. Despite the increasing use of LLMs in health care [<xref ref-type="bibr" rid="ref19">19</xref>-<xref ref-type="bibr" rid="ref21">21</xref>], there remains a research gap, and it is currently unclear whether GPT-4 can generate high-quality patient education content related to scars and keloids. Thus, we conducted this study to comprehensively assess the use of GPT-4 in keloid and scar patient education by performing a multidimensional evaluation (encompassing accuracy, reliability, readability, and reference quality) of GPT-4 responses.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Study Objective</title><p>This study aimed to investigate the potential of GPT-4 to provide reliable, accurate, readable, and factual medical information for patients with scars or keloids. To achieve this, we used GPT-4 (OpenAI) to evaluate its accuracy, reliability, readability, and hallucinations in answering questions related to treatments of scars or keloids.</p></sec><sec id="s2-2"><title>Question Collection</title><p>The research questions were manually collected by the authors from Reddit. 
First, the authors analyzed all posts on the &#x201C;Hot&#x201D; page (the most popular and recently active posts) of the r/Keloids subreddit, all posts on the r/SCAR subreddit, and all posts on the r/PlasticSurgery subreddit as of April 6, 2025. We extracted the main text of each post that included the keywords &#x201C;Scar&#x201D; or &#x201C;Keloid&#x201D; and organized the data using Microsoft Excel. Second, one author (MR) excluded the posts that contained no questions, were duplicates, or had repeated questions. In addition, the same author (MR) performed the initial classification of the questions. To ensure reliability and minimize bias, another author (WH) independently reviewed the process. Consistency between the two authors&#x2019; classifications was confirmed through discussion. This data collection approach has been adopted in previous Reddit-based research [<xref ref-type="bibr" rid="ref22">22</xref>]. Furthermore, we adopted 49 questions about keloids or scars from 3 medical websites.</p></sec><sec id="s2-3"><title>Ethical Considerations</title><p>The institutional review board of the People&#x2019;s Hospital of Guizhou Province, affiliated with Guizhou University, deemed this study exempt from ethics approval.</p></sec><sec id="s2-4"><title>Quality Assessment</title><sec id="s2-4-1"><title>Overview</title><p>Each question was input individually into GPT-4. Following previous research protocols, a new chat session was initiated for each question to avoid context contamination and to simulate real-world patient interactions [<xref ref-type="bibr" rid="ref23">23</xref>]. Consistent with real-world activities using GPT-4, no specialized prompt instructions were appended to the question inputs. 
The contents generated by ChatGPT-4 were evaluated using a modified version of existing health information quality assessment tools.</p></sec><sec id="s2-4-2"><title>Patient Education Materials Assessment Tool-AI Tool</title><p>The Patient Education Materials Assessment Tool (PEMAT) was used to assess the understandability and actionability of ChatGPT-generated content. The original PEMAT includes 17 items for understandability and 7 for actionability. Since all contents generated by ChatGPT are pure text, the PEMAT was simplified to 8 items for understandability and 3 items for actionability (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). Each item received 1 point if it met the standard, and scores were reported as percentages. A score of 70% or greater was recorded as a &#x201C;pass&#x201D; per PEMAT guidelines [<xref ref-type="bibr" rid="ref24">24</xref>].</p></sec><sec id="s2-4-3"><title>DISCERN-AI Tool</title><p>The DISCERN standard, a previously validated tool to help health care consumers and professionals evaluate the quality of treatment information, was adapted for ChatGPT-generated content. Since all contents generated by ChatGPT are pure text, 7 items (questions 3&#x2010;9 from the 15-item DISCERN tool) were selected and scored on a 1 to 5 scale (<xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>). Each output was rated as follows: very poor (7&#x2010;12 points), poor (13&#x2010;17 points), fair (18&#x2010;23 points), good (24&#x2010;28 points), and excellent (29&#x2010;35 points) [<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref25">25</xref>].</p></sec><sec id="s2-4-4"><title>Global Quality Scale</title><p>The Global Quality Scale (GQS) is a 5-point Likert scale used to evaluate information quality and the flow and ease of use of information. 
The scores range from 1 (low quality) to 5 (high quality), while scores of 4 or 5 indicated high-quality outputs, a score of 3 was considered moderate quality and scores of 1 or 2 were categorized as low quality.</p></sec><sec id="s2-4-5"><title>Readability Assessment</title><p>The readability of the ChatGPT-generated content was evaluated using several established readability formulas, including Flesch Reading Ease score, Gunning Fog Index, Flesch-Kincaid Grade Level, Coleman-Liau Index, and Simple Measure of Gobbledygook (SMOG). Each output was copied into Microsoft Word and analyzed via the Readable website [<xref ref-type="bibr" rid="ref26">26</xref>]. The Flesch Reading Ease score ranges from 0 to 100, and higher scores indicate greater readability. A score between 60 and 70 corresponds to reading levels of grades 8 and 9 and is generally understandable by the average adult. The Gunning Fog Index and Flesch-Kincaid Grade Level are used to estimate sentence complexity; the scores represent the years of formal education required to understand the contents. For example, a score of 12 implies the output is suitable for readers at the 12th-grade level. The Coleman-Liau Index is similar to the Gunning Fog Index and Flesch-Kincaid Grade Level but uses character counts instead of syllables, making it more suitable for languages where syllable counts may not accurately reflect complexity. The SMOG Index measures syllable density, often used to assess health information materials. A score of 12 indicates that the material is suitable for readers at the 12th-grade level or higher.</p></sec><sec id="s2-4-6"><title>Natural Language Assessment Tool for Artificial Intelligence</title><p>Three experienced plastic surgeons independently reviewed each GPT-4&#x2013;generated content using a specially developed Natural Language Assessment Tool for Artificial Intelligence (NLAT-AI) [<xref ref-type="bibr" rid="ref24">24</xref>]. 
Using this tool, we assessed accuracy, safety, appropriateness, actionability, and effectiveness. Each output was rated using a 5-point Likert scale (1=strongly disagree, 5=strongly agree; <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>). All results were summarized using descriptive statistics.</p></sec><sec id="s2-4-7"><title>Reference Evaluation for AI</title><p>Given known issues of LLM hallucination (ie, generating plausible but nonexistent references), a brief evaluation tool, Reference Evaluation for AI, was developed to analyze references provided in ChatGPT-generated content [<xref ref-type="bibr" rid="ref27">27</xref>]. Each reference was verified through direct links or a Google search. The tool assessed (1) reference hallucination (whether references were real or fabricated), (2) relevance and consistency between references and AI output, and (3) source quality (based on the authority of the issuing institution or organization, such as government guidelines, health care organizations, or scientific research; <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>).</p></sec></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Question Collection and Classification</title><p>A total of 507 posts were identified and analyzed (posts from the r/Keloids subreddit: n=193, 38.1%; posts from the r/PlasticSurgery subreddit: n=211, 41.6%; and posts from the r/SCAR subreddit: n=103, 20.3%). After removing posts that merely shared information or were duplicates, 354 unique questions were obtained. The questions were categorized into 16 groups based on their contents (<xref ref-type="table" rid="table1">Table 1</xref>). 
Furthermore, we obtained 49 questions from 3 medical websites that included 38 unique questions (Table S1 in <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>).</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Questions on scars or keloids from Reddit (N=354).</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Question group</td><td align="left" valign="bottom">Questions, n (%)</td></tr></thead><tbody><tr><td align="left" valign="top">Questions on other respects</td><td align="left" valign="top">28 (7.9)</td></tr><tr><td align="left" valign="top">Questions on other treatments for scars or keloids</td><td align="left" valign="top">4 (1.1)</td></tr><tr><td align="left" valign="top">Questions on common treatments for scars or keloids</td><td align="left" valign="top">46 (13)</td></tr><tr><td align="left" valign="top">Questions on trauma-related scars or keloids</td><td align="left" valign="top">16 (4.5)</td></tr><tr><td align="left" valign="top">Questions on psychological issues caused by scars or keloids</td><td align="left" valign="top">9 (2.5)</td></tr><tr><td align="left" valign="top">Questions on at-home scar or keloid care</td><td align="left" valign="top">3 (0.8)</td></tr><tr><td align="left" valign="top">Questions on preoperative scar or keloid consultation</td><td align="left" valign="top">37 (10.5)</td></tr><tr><td align="left" valign="top">Questions on postoperative scar or keloid consultation</td><td align="left" valign="top">55 (15.5)</td></tr><tr><td align="left" valign="top">Questions on selection of treatments for scars or keloids</td><td align="left" valign="top">80 (22.6)</td></tr><tr><td align="left" valign="top">Questions on impact of scars or keloids on daily life</td><td align="left" valign="top">2 (0.6)</td></tr><tr><td align="left" valign="top">Questions on scar or keloid symptoms</td><td align="left" valign="top">7 
(2)</td></tr><tr><td align="left" valign="top">Questions on scar camouflage</td><td align="left" valign="top">6 (1.7)</td></tr><tr><td align="left" valign="top">Questions on the impact of nutrition on scars or keloids</td><td align="left" valign="top">3 (0.8)</td></tr><tr><td align="left" valign="top">Questions on choosing physicians for scar or keloid treatment or related costs</td><td align="left" valign="top">32 (9)</td></tr><tr><td align="left" valign="top">Questions on old scars</td><td align="left" valign="top">14 (4)</td></tr><tr><td align="left" valign="top">Questions on scar or keloid prevention</td><td align="left" valign="top">12 (3.4)</td></tr></tbody></table></table-wrap></sec><sec id="s3-2"><title>Evaluation of GPT-4&#x2013;Generated Content</title><p>GPT-4 generated content that provided a wide range of medically accurate information. Using the PEMAT-AI, DISCERN-AI, and GQS patient education material evaluation tools, the output of GPT-4 was assessed, with all tools indicating high scores. The overall understandability score using PEMAT-AI easily surpassed the 70% threshold for acceptability (mean 75.5%, SD 12.2%). The DISCERN-AI tool resulted in an overall rating of &#x201C;good&#x201D; quality (mean 26.3, SD 3.4), with all 16 groups of questions rated as &#x201C;good.&#x201D; The GQS score averaged 4.3 out of 5 (SD 0.8), categorizing the outputs as high quality. More details are shown in Table S1 in <xref ref-type="supplementary-material" rid="app6">Multimedia Appendix 6</xref>. Intraclass correlation coefficient (ICC) for PEMAT-AI, DISCERN-AI, and GQS were 0.73, 0.69, and 0.78, respectively (Table S2 in <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>). 
The results of the ICC demonstrated high reliability of the evaluation tools.</p></sec><sec id="s3-3"><title>Plastic Surgeons&#x2019; Evaluation via the NLAT-AI Tool</title><p>Using the NLAT-AI tool, 3 independent plastic surgeons evaluated the GPT-4&#x2013;generated content. All dimensions of the contents received scores above the neutral midpoint of 3 on a 5-point Likert scale. The overall average scores for each dimension were as follows: accuracy 3.9 (SD 0.7), safety 4.3 (SD 0.8), appropriateness 4.4 (SD 0.5), actionability 4.1 (SD 0.8), and effectiveness 4.1 (SD 0.8). More detailed descriptive statistics for each question are presented in Table S2 in <xref ref-type="supplementary-material" rid="app6">Multimedia Appendix 6</xref>. Internal validity tests showed an ICC of 0.76 (Table S2 in <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>), indicating high reliability.</p></sec><sec id="s3-4"><title>Readability Assessment</title><p>The results of the readability assessments indicated that the GPT-4&#x2013;generated content was &#x201C;difficult to read.&#x201D; The average Flesch Reading Ease score was 50.1 (SD 8.1), which is considered moderately difficult. The Gunning Fog Index averaged 12.7 (SD 3.3), and the Flesch-Kincaid Grade Level was 12.4 (SD 2.5), indicating that the text was at a high school level (approximately suitable for individuals aged 16&#x2010;17 years). The Coleman-Liau Index averaged 12.8 (SD 2.6), and the SMOG Index averaged 11.3 (SD 3.16). More detailed evaluation results are shown in Table S3 in <xref ref-type="supplementary-material" rid="app6">Multimedia Appendix 6</xref>.</p></sec><sec id="s3-5"><title>Reference Evaluation for AI Assessment</title><p>Most of the references provided in GPT-4&#x2019;s output effectively supported the content. A total of 88.2% (2867/3250) of the references were from actual sources (actual websites or academic papers), while 383 hallucinated references were identified. 
Among these 2867 real references, 2746 (95.8%) references effectively supported the content. In addition, a total of 95.1% (2724/2867) of the real references were from authoritative sources (government guidelines, health care organizations, or scientific research). More detailed evaluation results are shown in Table S4 in <xref ref-type="supplementary-material" rid="app6">Multimedia Appendix 6</xref>.</p></sec><sec id="s3-6"><title>The Assessment of Questions From Websites</title><p>The evaluation results of GPT-4 responses to website-sourced questions were broadly consistent with those from Reddit-derived questions across all assessments (Tables S3-S6 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>This is the first study to assess the overall quality of ChatGPT responses to real-world questions from Reddit about keloids or scars. The results revealed that the content generated by GPT-4 was generally comprehensive and aligned with current medical guidelines and the literature. Using several assessment tools, as well as plastic surgeons&#x2019; evaluations, the scores were robust, and the plastic surgeons&#x2019; evaluations were largely positive. The overall results indicate that GPT-4&#x2013;generated content is reliable, accurate, safe, and actionable, despite there being room for improvement in terms of readability and hallucination.</p><p>Over 80% of dermatology outpatients obtain medical information through social media or the internet, with 47% considering it an important source of information [<xref ref-type="bibr" rid="ref28">28</xref>]. Although patients have access to a wealth of information, studies evaluating the quality of online health information have identified significant deficiencies [<xref ref-type="bibr" rid="ref29">29</xref>]. 
As for scars and keloids, the information available to patients contains a lot of low-quality content. A previous study assessing 88 websites related to &#x201C;burn scars&#x201D; showed that most of the commercial websites provided information of moderate to poor quality [<xref ref-type="bibr" rid="ref13">13</xref>]. In contrast, LLMs provide a broad range of fundamentally accurate information and real-time dynamic interactions compared to traditional webpages [<xref ref-type="bibr" rid="ref30">30</xref>,<xref ref-type="bibr" rid="ref31">31</xref>]. As a leading LLM, GPT-4 exhibits certain advantages over other LLMs and has demonstrated top-tier performance across diverse evaluations in health care. In answering questions from the American Board of Surgery In-Training Examination, GPT-4 achieved an accuracy rate comparable to that of Copilot, while significantly outperforming Gemini [<xref ref-type="bibr" rid="ref32">32</xref>]. In other fields of clinical medicine, GPT-4 also attained superior performance relative to other LLMs [<xref ref-type="bibr" rid="ref33">33</xref>,<xref ref-type="bibr" rid="ref34">34</xref>]. However, in a substantial number of evaluative scenarios, the performance of GPT-4 did not yield statistically significant differences when compared with Copilot or Gemini. Collectively, the performance of GPT-4 currently represents the best capability of LLMs.</p><p>In our study, experienced plastic surgeons evaluated the outputs of GPT-4, confirming that the contents were reliable and accurate. The accuracy of GPT-4 in patient education has also been studied in other clinical contexts (eg, rhinoplasty, sleep apnea, and prostate cancer) where it demonstrated high accuracy and strong reliability [<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref35">35</xref>,<xref ref-type="bibr" rid="ref36">36</xref>]. 
Such high accuracy and reliability suggest that LLMs such as GPT-4 can effectively address clinical questions from patients with scars or keloids, serving as a valuable auxiliary tool in clinical medicine.</p><p>Despite GPT-4&#x2019;s significant potential in responding to keloid or scar patient queries, its outputs commonly had high reading difficulty. Our study revealed that the average reading level of GPT-4&#x2013;generated content was at a high school level. The results suggest that ChatGPT does not always meet the comprehension needs of all patients. The relatively low readability of GPT-4 can hinder accessibility for certain socioeconomic populations with limited health literacy [<xref ref-type="bibr" rid="ref37">37</xref>]. Among the latest generation of young adults in the United States, up to 13% have not graduated from high school. This rate reaches 20% among people of color (including African Americans and Native Americans) [<xref ref-type="bibr" rid="ref38">38</xref>], who are also identified as high-risk groups for developing malignant scars [<xref ref-type="bibr" rid="ref39">39</xref>]. Due to poor readability, GPT-4 has apparent barriers in its application among these populations [<xref ref-type="bibr" rid="ref40">40</xref>,<xref ref-type="bibr" rid="ref41">41</xref>]. To enhance the utility of LLMs for populations with lower educational attainment, it is recommended that developers consider training specialized LLMs based on datasets with good readability [<xref ref-type="bibr" rid="ref42">42</xref>]. Biomedical text can be simplified through hyperparameter substitution techniques, improving patient understanding [<xref ref-type="bibr" rid="ref43">43</xref>]. In addition, structured prompting can also contribute to enhancing readability [<xref ref-type="bibr" rid="ref44">44</xref>].</p><p>Moreover, our study also revealed the existence of hallucination, where GPT-4 cited nonexistent references or websites. 
Fabricated references not only mislead readers and distort their understanding of keloid or scar but also&#x2014;given the presence of numerous seemingly authoritative yet false information sources&#x2014;may lead patients to overtrust the content generated by GPT-4 [<xref ref-type="bibr" rid="ref45">45</xref>,<xref ref-type="bibr" rid="ref46">46</xref>]. Given the presence of hallucinations, specific clinical diagnosis and treatment must rely on clinicians; LLMs can only serve as auxiliary tools. To address the hallucination issue in LLMs, it is recommended that developers effectively apply retrieval-augmented generation to retrieve documents from an external corpus (such as academic library systems), as this can significantly reduce the hallucinations [<xref ref-type="bibr" rid="ref47">47</xref>-<xref ref-type="bibr" rid="ref49">49</xref>]. Integrating external, structured knowledge sources (such as knowledge graphs, databases, or other domain-specific resources) into LLMs can also help ensure that LLMs produce responses with fewer hallucinations [<xref ref-type="bibr" rid="ref50">50</xref>]. Furthermore, prompt engineering can mitigate hallucination by improving the reasoning capabilities [<xref ref-type="bibr" rid="ref51">51</xref>].</p><p>GPT-4 can provide comprehensive and generally accurate information, which can further assist patients with keloids or scars in accessing timely and precise information. However, current LLMs exhibit limitations, such as hallucinations and relatively low readability; therefore, they are not recommended as the sole source of information for patients. Limited by the lack of clinical background in current LLMs; the insufficient ability to process audio, image, and video information; limited ability to access academic libraries; and the noninterpretability of black box algorithms, current LLMs still require further development to be adapted for applications in health care [<xref ref-type="bibr" rid="ref52">52</xref>]. 
The AI agent, as a promising approach, can extend the capabilities of LLMs by enabling them to use external tools, plan and execute multistep tasks, as well as interact dynamically [<xref ref-type="bibr" rid="ref53">53</xref>]. Multimodal LLMs are promising for processing text (eg, clinical notes and user-input questions), medical images (eg, photos and computed tomography scans), and videos (eg, treatment procedures) provided by patients, which will more effectively assist patients and health care providers in clinical practice regarding keloid and scar management [<xref ref-type="bibr" rid="ref54">54</xref>].</p></sec><sec id="s4-2"><title>Limitations</title><p>Most of the questions collected from Reddit were posts from patients who had not yet sought medical care. Consequently, the questions posed may be biased toward pretreatment information needs, as fewer questions were reported during the treatment phase. This may compromise the generalizability of GPT-4&#x2019;s evaluation across different patient care stages. In addition, Reddit users are concentrated in the age group of 18 to 49 years, with an average age of 23 years, and the majority are aged under 30 years. Thus, the data collected from Reddit clearly fails to represent the middle-aged and older population [<xref ref-type="bibr" rid="ref55">55</xref>]. Relying solely on Reddit posts for data collection introduces demographic selection bias.</p><p>In terms of assessment tools, the qualitative assessment conducted by experienced plastic surgeons was inherently at risk of bias, given the surgeons&#x2019; attitudes toward the use of GPT-4. Nevertheless, they provided valuable insights owing to their in-depth understanding of scar and keloid education materials. Furthermore, exploratory assessment tools (DISCERN-AI, PEMAT-AI, and NLAT-AI) were used in this study, while their validity requires further testing. 
LLMs differ from traditional printed educational materials in that their responses to repeated queries of the same question are generated instantaneously and may vary. Existing assessment tools lack the ability to detect such variability in LLM outputs when the same question is posed multiple times [<xref ref-type="bibr" rid="ref56">56</xref>]. Furthermore, content generated by LLMs is often conveyed with excessive certainty, as these models lack the ability to accurately express information involving uncertainties. Providing definitive answers to such uncertain content may mislead patients, yet current assessment scales fail to evaluate this critical limitation [<xref ref-type="bibr" rid="ref57">57</xref>,<xref ref-type="bibr" rid="ref58">58</xref>]. Further research is needed to develop specific tools to enable more robust evaluation of LLM output quality.</p></sec><sec id="s4-3"><title>Conclusions</title><p>Our analysis found that GPT-4 provided high-quality responses to real-world questions related to scars and keloids, suggesting its potential as a useful patient education tool in scar and keloid treatment. 
The GPT-4 outputs were generally reliable and accurate but need improvement, primarily in readability and hallucinations.</p></sec></sec></body><back><notes><sec><title>Funding</title><p>This study is funded by the Talent Fund of Guizhou Provincial People's Hospital (awarded to MR; [2023]-30).</p></sec><sec><title>Data Availability</title><p>The datasets generated or analyzed during this study are available from the corresponding author on reasonable request.</p></sec></notes><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">GQS</term><def><p>Global Quality Scale</p></def></def-item><def-item><term id="abb3">ICC</term><def><p>intraclass correlation coefficient</p></def></def-item><def-item><term id="abb4">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb5">NLAT-AI</term><def><p>Natural Language Assessment Tool for Artificial Intelligence</p></def></def-item><def-item><term id="abb6">PEMAT</term><def><p>Patient Education Materials Assessment Tool</p></def></def-item><def-item><term id="abb7">SMOG</term><def><p>Simple Measure of Gobbledygook</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Stoica</surname><given-names>AE</given-names> </name><name name-style="western"><surname>Grumezescu</surname><given-names>AM</given-names> </name><name name-style="western"><surname>Hermenean</surname><given-names>AO</given-names> </name><name name-style="western"><surname>Andronescu</surname><given-names>E</given-names> </name><name name-style="western"><surname>Vasile</surname><given-names>BS</given-names> </name></person-group><article-title>Scar-free healing: current concepts and future 
perspectives</article-title><source>Nanomaterials (Basel)</source><year>2020</year><month>10</month><day>31</day><volume>10</volume><issue>11</issue><fpage>2179</fpage><pub-id pub-id-type="doi">10.3390/nano10112179</pub-id><pub-id pub-id-type="medline">33142891</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Xiao</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Zhu</surname><given-names>B</given-names> </name><etal/></person-group><article-title>Risk factors for hypertrophic burn scar pain, pruritus, and paresthesia development</article-title><source>Wound Repair Regen</source><year>2018</year><month>03</month><volume>26</volume><issue>2</issue><fpage>172</fpage><lpage>181</lpage><pub-id pub-id-type="doi">10.1111/wrr.12637</pub-id><pub-id pub-id-type="medline">29719102</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hsieh</surname><given-names>JC</given-names> </name><name name-style="western"><surname>Maisel-Campbell</surname><given-names>AL</given-names> </name><name name-style="western"><surname>Joshi</surname><given-names>CJ</given-names> </name><name name-style="western"><surname>Zielinski</surname><given-names>E</given-names> </name><name name-style="western"><surname>Galiano</surname><given-names>RD</given-names> </name></person-group><article-title>Daily quality-of-life impact of scars: an interview-based foundational study of patient-reported themes</article-title><source>Plast Reconstr Surg Glob Open</source><year>2021</year><month>04</month><volume>9</volume><issue>4</issue><fpage>e3522</fpage><pub-id pub-id-type="doi">10.1097/GOX.0000000000003522</pub-id><pub-id 
pub-id-type="medline">33868874</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Schouten</surname><given-names>H</given-names> </name><name name-style="western"><surname>Nieuwenhuis</surname><given-names>M</given-names> </name><name name-style="western"><surname>van der Schans</surname><given-names>C</given-names> </name><name name-style="western"><surname>Niemeijer</surname><given-names>A</given-names> </name><name name-style="western"><surname>van Zuijlen</surname><given-names>P</given-names> </name></person-group><article-title>Considerations in determining the severity of burn scar contractures with focus on the knee joint</article-title><source>J Burn Care Res</source><year>2023</year><month>07</month><day>5</day><volume>44</volume><issue>4</issue><fpage>810</fpage><lpage>816</lpage><pub-id pub-id-type="doi">10.1093/jbcr/irad016</pub-id><pub-id pub-id-type="medline">36752774</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Woodward</surname><given-names>AP</given-names> </name><name name-style="western"><surname>Matthews</surname><given-names>CA</given-names> </name></person-group><article-title>Outcomes of revision perineoplasty for persistent postpartum dyspareunia</article-title><source>Female Pelvic Med Reconstr Surg</source><year>2010</year><month>03</month><volume>16</volume><issue>2</issue><fpage>135</fpage><lpage>139</lpage><pub-id pub-id-type="doi">10.1097/SPV.0b013e3181cc8702</pub-id><pub-id pub-id-type="medline">22453161</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Waibel</surname><given-names>JS</given-names> </name><name 
name-style="western"><surname>Waibel</surname><given-names>H</given-names> </name><name name-style="western"><surname>Sedaghat</surname><given-names>E</given-names> </name></person-group><article-title>Scar therapy of skin</article-title><source>Facial Plast Surg Clin North Am</source><year>2023</year><month>11</month><volume>31</volume><issue>4</issue><fpage>453</fpage><lpage>462</lpage><pub-id pub-id-type="doi">10.1016/j.fsc.2023.06.005</pub-id><pub-id pub-id-type="medline">37806679</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gomolin</surname><given-names>T</given-names> </name><name name-style="western"><surname>Cline</surname><given-names>A</given-names> </name><name name-style="western"><surname>Ginsberg</surname><given-names>D</given-names> </name><name name-style="western"><surname>Safai</surname><given-names>B</given-names> </name></person-group><article-title>Scar tissue I wish you saw: patient expectations regarding scar treatment</article-title><source>J Cosmet Dermatol</source><year>2021</year><month>09</month><volume>20</volume><issue>9</issue><fpage>2739</fpage><lpage>2742</lpage><pub-id pub-id-type="doi">10.1111/jocd.13945</pub-id><pub-id pub-id-type="medline">33434326</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cho</surname><given-names>SB</given-names> </name><name name-style="western"><surname>Ryu</surname><given-names>DJ</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>SJ</given-names> </name><etal/></person-group><article-title>Scar characteristics and treatment expectations: a survey of 589 patients</article-title><source>J Cosmet Laser 
Ther</source><year>2009</year><month>12</month><volume>11</volume><issue>4</issue><fpage>224</fpage><lpage>228</lpage><pub-id pub-id-type="doi">10.3109/14764170903341723</pub-id><pub-id pub-id-type="medline">19951193</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Andrews</surname><given-names>N</given-names> </name><name name-style="western"><surname>Jones</surname><given-names>LL</given-names> </name><name name-style="western"><surname>Moiemen</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Below the surface: parents&#x2019; views on the factors that influence treatment adherence in paediatric burn scar management - a qualitative study</article-title><source>Burns</source><year>2018</year><month>05</month><volume>44</volume><issue>3</issue><fpage>626</fpage><lpage>635</lpage><pub-id pub-id-type="doi">10.1016/j.burns.2017.09.003</pub-id><pub-id pub-id-type="medline">29031888</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jiang</surname><given-names>S</given-names> </name><name name-style="western"><surname>Beaudoin</surname><given-names>CE</given-names> </name></person-group><article-title>Health literacy and the internet: an exploratory study on the 2013 HINTS survey</article-title><source>Comput Human Behav</source><year>2016</year><month>05</month><volume>58</volume><fpage>240</fpage><lpage>248</lpage><pub-id pub-id-type="doi">10.1016/j.chb.2016.01.007</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Battineni</surname><given-names>G</given-names> </name><name name-style="western"><surname>Baldoni</surname><given-names>S</given-names> </name><name 
name-style="western"><surname>Chintalapudi</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Factors affecting the quality and reliability of online health information</article-title><source>Digit Health</source><year>2020</year><volume>6</volume><fpage>2055207620948996</fpage><pub-id pub-id-type="doi">10.1177/2055207620948996</pub-id><pub-id pub-id-type="medline">32944269</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Khaleel</surname><given-names>I</given-names> </name><name name-style="western"><surname>Wimmer</surname><given-names>BC</given-names> </name><name name-style="western"><surname>Peterson</surname><given-names>GM</given-names> </name><etal/></person-group><article-title>Health information overload among health consumers: a scoping review</article-title><source>Patient Educ Couns</source><year>2020</year><month>01</month><volume>103</volume><issue>1</issue><fpage>15</fpage><lpage>32</lpage><pub-id pub-id-type="doi">10.1016/j.pec.2019.08.008</pub-id><pub-id pub-id-type="medline">31451363</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bohacek</surname><given-names>L</given-names> </name><name name-style="western"><surname>Gomez</surname><given-names>M</given-names> </name><name name-style="western"><surname>Fish</surname><given-names>JS</given-names> </name></person-group><article-title>An evaluation of internet sites for burn scar management</article-title><source>J Burn Care Rehabil</source><year>2003</year><volume>24</volume><issue>4</issue><fpage>246</fpage><lpage>251</lpage><pub-id pub-id-type="doi">10.1097/01.BCR.0000075844.04297.D9</pub-id><pub-id pub-id-type="medline">14501424</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>J</given-names> </name><name name-style="western"><surname>Dada</surname><given-names>A</given-names> </name><name name-style="western"><surname>Puladi</surname><given-names>B</given-names> </name><name name-style="western"><surname>Kleesiek</surname><given-names>J</given-names> </name><name name-style="western"><surname>Egger</surname><given-names>J</given-names> </name></person-group><article-title>ChatGPT in healthcare: a taxonomy and systematic review</article-title><source>Comput Methods Programs Biomed</source><year>2024</year><month>03</month><volume>245</volume><fpage>108013</fpage><pub-id pub-id-type="doi">10.1016/j.cmpb.2024.108013</pub-id><pub-id pub-id-type="medline">38262126</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Neha</surname><given-names>F</given-names> </name><name name-style="western"><surname>Bhati</surname><given-names>D</given-names> </name><name name-style="western"><surname>Shukla</surname><given-names>DK</given-names> </name><name name-style="western"><surname>Amiruzzaman</surname><given-names>M</given-names> </name></person-group><article-title>ChatGPT: transforming healthcare with AI</article-title><source>AI</source><year>2024</year><volume>5</volume><issue>4</issue><fpage>2618</fpage><lpage>2650</lpage><pub-id pub-id-type="doi">10.3390/ai5040126</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jowsey</surname><given-names>T</given-names> </name><name name-style="western"><surname>Stokes-Parish</surname><given-names>J</given-names> </name><name name-style="western"><surname>Singleton</surname><given-names>R</given-names> </name><name 
name-style="western"><surname>Todorovic</surname><given-names>M</given-names> </name></person-group><article-title>Medical education empowered by generative artificial intelligence large language models</article-title><source>Trends Mol Med</source><year>2023</year><month>12</month><volume>29</volume><issue>12</issue><fpage>971</fpage><lpage>973</lpage><pub-id pub-id-type="doi">10.1016/j.molmed.2023.08.012</pub-id><pub-id pub-id-type="medline">37718142</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Iqbal</surname><given-names>U</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>LTJ</given-names> </name><name name-style="western"><surname>Rahmanti</surname><given-names>AR</given-names> </name><name name-style="western"><surname>Celi</surname><given-names>LA</given-names> </name><name name-style="western"><surname>Li</surname><given-names>YCJ</given-names> </name></person-group><article-title>Can large language models provide secondary reliable opinion on treatment options for dermatological diseases?</article-title><source>J Am Med Inform Assoc</source><year>2024</year><month>05</month><day>20</day><volume>31</volume><issue>6</issue><fpage>1341</fpage><lpage>1347</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocae067</pub-id><pub-id pub-id-type="medline">38578616</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>R</given-names> </name></person-group><source>Close encounters of the AI kind: the increasingly human-like way people are engaging with language models</source><access-date>2025-11-26</access-date><publisher-name>Elon University</publisher-name><comment><ext-link ext-link-type="uri" 
xlink:href="https://imaginingthedigitalfuture.org/reports-and-publications/close-encounters-of-the-ai-kind/close-encounters-of-the-ai-kind-main-report">https://imaginingthedigitalfuture.org/reports-and-publications/close-encounters-of-the-ai-kind/close-encounters-of-the-ai-kind-main-report</ext-link></comment></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gencer</surname><given-names>G</given-names> </name><name name-style="western"><surname>Gencer</surname><given-names>K</given-names> </name></person-group><article-title>Large language models in healthcare: a bibliometric analysis and examination of research trends</article-title><source>J Multidiscip Healthc</source><year>2025</year><volume>18</volume><fpage>223</fpage><lpage>238</lpage><pub-id pub-id-type="doi">10.2147/JMDH.S502351</pub-id><pub-id pub-id-type="medline">39844924</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kumar</surname><given-names>D</given-names> </name><name name-style="western"><surname>Sood</surname><given-names>SK</given-names> </name><name name-style="western"><surname>Rawat</surname><given-names>KS</given-names> </name></person-group><article-title>Empowering elderly care with intelligent IoT-driven smart toilets for home-based infectious health monitoring</article-title><source>Artif Intell Med</source><year>2023</year><month>10</month><volume>144</volume><fpage>102666</fpage><pub-id pub-id-type="doi">10.1016/j.artmed.2023.102666</pub-id><pub-id pub-id-type="medline">37783534</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Kumar</surname><given-names>D</given-names> </name><name 
name-style="western"><surname>Rawat</surname><given-names>KS</given-names> </name><name name-style="western"><surname>Sood</surname><given-names>SK</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Shukla</surname><given-names>AK</given-names> </name><name name-style="western"><surname>Thakur</surname><given-names>DG</given-names> </name><name name-style="western"><surname>Arabkoohsar</surname><given-names>A</given-names> </name></person-group><article-title>Revolution of artificial intelligence and IOT in healthcare: a keyword co-occurrence network analysis using CiteSpace</article-title><source>Recent Advances in Mechanical Engineering</source><year>2023</year><fpage>231</fpage><lpage>237</lpage><pub-id pub-id-type="doi">10.1007/978-981-99-2349-6_20</pub-id><pub-id pub-id-type="other">9789819923496</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Patel</surname><given-names>P</given-names> </name><name name-style="western"><surname>Jagdeo</surname><given-names>J</given-names> </name></person-group><article-title>An analysis of keloid patient questions on Reddit</article-title><source>Wound Repair Regen</source><year>2024</year><volume>32</volume><issue>2</issue><fpage>164</fpage><lpage>170</lpage><pub-id pub-id-type="doi">10.1111/wrr.13160</pub-id><pub-id pub-id-type="medline">38372454</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Campbell</surname><given-names>DJ</given-names> </name><name name-style="western"><surname>Estephan</surname><given-names>LE</given-names> </name><name name-style="western"><surname>Sina</surname><given-names>EM</given-names> 
</name><etal/></person-group><article-title>Evaluating ChatGPT responses on thyroid nodules for patient education</article-title><source>Thyroid</source><year>2024</year><month>03</month><volume>34</volume><issue>3</issue><fpage>371</fpage><lpage>377</lpage><pub-id pub-id-type="doi">10.1089/thy.2023.0491</pub-id><pub-id pub-id-type="medline">38010917</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gibson</surname><given-names>D</given-names> </name><name name-style="western"><surname>Jackson</surname><given-names>S</given-names> </name><name name-style="western"><surname>Shanmugasundaram</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Evaluating the efficacy of ChatGPT as a patient education tool in prostate cancer: multimetric assessment</article-title><source>J Med Internet Res</source><year>2024</year><month>08</month><day>14</day><volume>26</volume><fpage>e55939</fpage><pub-id pub-id-type="doi">10.2196/55939</pub-id><pub-id pub-id-type="medline">39141904</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cassidy</surname><given-names>JT</given-names> </name><name name-style="western"><surname>Baker</surname><given-names>JF</given-names> </name></person-group><article-title>Orthopaedic patient information on the world wide web: an essential review</article-title><source>J Bone Joint Surg Am</source><year>2016</year><month>02</month><day>17</day><volume>98</volume><issue>4</issue><fpage>325</fpage><lpage>338</lpage><pub-id pub-id-type="doi">10.2106/JBJS.N.01189</pub-id><pub-id pub-id-type="medline">26888683</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="web"><article-title>Readable</article-title><source>Our Readability Checker helps 
you to communicate clearly</source><access-date>2026-02-24</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://readable.com">https://readable.com</ext-link></comment></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Alkaissi</surname><given-names>H</given-names> </name><name name-style="western"><surname>McFarlane</surname><given-names>SI</given-names> </name></person-group><article-title>Artificial hallucinations in ChatGPT: implications in scientific writing</article-title><source>Cureus</source><year>2023</year><month>02</month><volume>15</volume><issue>2</issue><fpage>e35179</fpage><pub-id pub-id-type="doi">10.7759/cureus.35179</pub-id><pub-id pub-id-type="medline">36811129</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>AlGhamdi</surname><given-names>KM</given-names> </name><name name-style="western"><surname>Almohideb</surname><given-names>MA</given-names> </name></person-group><article-title>Internet use by dermatology outpatients to search for health information</article-title><source>Int J Dermatol</source><year>2011</year><month>03</month><volume>50</volume><issue>3</issue><fpage>292</fpage><lpage>299</lpage><pub-id pub-id-type="doi">10.1111/j.1365-4632.2010.04705.x</pub-id><pub-id pub-id-type="medline">21342162</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>Y</given-names> </name></person-group><article-title>Consumers&#x2019; evaluation of web-based health information quality: meta-analysis</article-title><source>J Med Internet 
Res</source><year>2022</year><month>04</month><day>28</day><volume>24</volume><issue>4</issue><fpage>e36463</fpage><pub-id pub-id-type="doi">10.2196/36463</pub-id><pub-id pub-id-type="medline">35482390</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Monib</surname><given-names>WK</given-names> </name><name name-style="western"><surname>Qazi</surname><given-names>A</given-names> </name><name name-style="western"><surname>Mahmud</surname><given-names>MM</given-names> </name></person-group><article-title>Exploring learners&#x2019; experiences and perceptions of ChatGPT as a learning tool in higher education</article-title><source>Educ Inf Technol</source><year>2025</year><month>01</month><volume>30</volume><issue>1</issue><fpage>917</fpage><lpage>939</lpage><pub-id pub-id-type="doi">10.1007/s10639-024-13065-4</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jin</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>H</given-names> </name><name name-style="western"><surname>Zhao</surname><given-names>B</given-names> </name><name name-style="western"><surname>Pan</surname><given-names>W</given-names> </name></person-group><article-title>ChatGPT and mycosis- a new weapon in the knowledge battlefield</article-title><source>BMC Infect Dis</source><year>2023</year><month>10</month><day>27</day><volume>23</volume><issue>1</issue><fpage>731</fpage><pub-id pub-id-type="doi">10.1186/s12879-023-08724-9</pub-id><pub-id pub-id-type="medline">37891532</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sanli</surname><given-names>AN</given-names> 
</name><name name-style="western"><surname>Tekcan Sanli</surname><given-names>DE</given-names> </name><name name-style="western"><surname>Karabulut</surname><given-names>A</given-names> </name></person-group><article-title>Can American Board of Surgery in Training Examinations be passed by large language models? Comparative assessment of Gemini, Copilot, and ChatGPT</article-title><source>Am Surg</source><year>2025</year><month>11</month><volume>91</volume><issue>11</issue><fpage>1923</fpage><lpage>1929</lpage><pub-id pub-id-type="doi">10.1177/00031348251341956</pub-id><pub-id pub-id-type="medline">40353502</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gill</surname><given-names>GS</given-names> </name><name name-style="western"><surname>Tsai</surname><given-names>J</given-names> </name><name name-style="western"><surname>Moxam</surname><given-names>J</given-names> </name><name name-style="western"><surname>Sanghvi</surname><given-names>HA</given-names> </name><name name-style="western"><surname>Gupta</surname><given-names>S</given-names> </name></person-group><article-title>Comparison of Gemini Advanced and ChatGPT 4.0&#x2019;s performances on the Ophthalmology Resident Ophthalmic Knowledge Assessment Program (OKAP) examination review question banks</article-title><source>Cureus</source><year>2024</year><month>09</month><volume>16</volume><issue>9</issue><fpage>e69612</fpage><pub-id pub-id-type="doi">10.7759/cureus.69612</pub-id><pub-id pub-id-type="medline">39421095</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tan</surname><given-names>YZ</given-names> </name><name name-style="western"><surname>Nah</surname><given-names>SA</given-names> </name><name 
name-style="western"><surname>Saw</surname><given-names>SN</given-names> </name><name name-style="western"><surname>Rajandram</surname><given-names>R</given-names> </name><name name-style="western"><surname>Ong</surname><given-names>TA</given-names> </name></person-group><article-title>Evaluating the performance of artificial intelligence chatbots in answering urology questions derived from guidelines or board examinations: a systematic review</article-title><source>Urol Sci</source><year>2025</year><fpage>10</fpage><pub-id pub-id-type="doi">10.1097/us9.0000000000000089</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lim</surname><given-names>B</given-names> </name><name name-style="western"><surname>Seth</surname><given-names>I</given-names> </name><name name-style="western"><surname>Kah</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Using generative artificial intelligence tools in cosmetic surgery: a study on rhinoplasty, facelifts, and blepharoplasty procedures</article-title><source>J Clin Med</source><year>2023</year><month>10</month><day>14</day><volume>12</volume><issue>20</issue><fpage>6524</fpage><pub-id pub-id-type="doi">10.3390/jcm12206524</pub-id><pub-id pub-id-type="medline">37892665</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Incerti Parenti</surname><given-names>S</given-names> </name><name name-style="western"><surname>Bartolucci</surname><given-names>ML</given-names> </name><name name-style="western"><surname>Biondi</surname><given-names>E</given-names> </name><etal/></person-group><article-title>Online patient education in obstructive sleep apnea: ChatGPT versus Google search</article-title><source>Healthcare 
(Basel)</source><year>2024</year><month>09</month><day>5</day><volume>12</volume><issue>17</issue><fpage>1781</fpage><pub-id pub-id-type="doi">10.3390/healthcare12171781</pub-id><pub-id pub-id-type="medline">39273804</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Toiv</surname><given-names>A</given-names> </name><name name-style="western"><surname>Saleh</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Ishak</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Digesting digital health: a study of appropriateness and readability of ChatGPT-generated gastroenterological information</article-title><source>Clin Transl Gastroenterol</source><year>2024</year><month>11</month><day>1</day><volume>15</volume><issue>11</issue><fpage>e00765</fpage><pub-id pub-id-type="doi">10.14309/ctg.0000000000000765</pub-id><pub-id pub-id-type="medline">39212302</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="report"><person-group person-group-type="author"><name name-style="western"><surname>Veronique</surname><given-names>I</given-names> </name></person-group><article-title>Report on the condition of education 2024</article-title><year>2024</year><publisher-name>Institute of Education Sciences</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://nces.ed.gov/pubs2024/2024144.pdf">https://nces.ed.gov/pubs2024/2024144.pdf</ext-link></comment></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Oei</surname><given-names>F</given-names> </name><name name-style="western"><surname>Putra</surname><given-names>IB</given-names> </name><name name-style="western"><surname>Jusuf</surname><given-names>NK</given-names> 
</name></person-group><article-title>The relationship between skin color and keloid</article-title><source>Bali Med J</source><year>2021</year><volume>10</volume><issue>2</issue><fpage>835</fpage><lpage>838</lpage><pub-id pub-id-type="doi">10.15562/bmj.v10i2.2619</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>DeTemple</surname><given-names>DE</given-names> </name><name name-style="western"><surname>Meine</surname><given-names>TC</given-names> </name></person-group><article-title>Comparison of the readability of ChatGPT and Bard in medical communication: a meta-analysis</article-title><source>BMC Med Inform Decis Mak</source><year>2025</year><month>09</month><day>1</day><volume>25</volume><issue>1</issue><fpage>325</fpage><pub-id pub-id-type="doi">10.1186/s12911-025-03035-2</pub-id><pub-id pub-id-type="medline">40890707</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Whittaker</surname><given-names>P</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>M</given-names> </name></person-group><article-title>Quality and readability of chatbot responses to patient questions: a systematic cross-sectional meta-synthesis</article-title><source>Health Informatics J</source><year>2025</year><volume>31</volume><issue>4</issue><fpage>14604582251388879</fpage><pub-id pub-id-type="doi">10.1177/14604582251388879</pub-id><pub-id pub-id-type="medline">41106853</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>M</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>Y</given-names> </name><name 
name-style="western"><surname>Li</surname><given-names>Z</given-names> </name><etal/></person-group><article-title>From quantity to quality: boosting LLM performance with self-guided data selection for instruction tuning</article-title><conf-name>Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics</conf-name><conf-date>Jun 16-21, 2024</conf-date><conf-loc>Mexico City, Mexico</conf-loc><fpage>7602</fpage><lpage>7635</lpage><pub-id pub-id-type="doi">10.18653/v1/2024.naacl-long.421</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Swanson</surname><given-names>K</given-names> </name><name name-style="western"><surname>He</surname><given-names>S</given-names> </name><name name-style="western"><surname>Calvano</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Biomedical text readability after hypernym substitution with fine-tuned large language models</article-title><source>PLOS Digit Health</source><year>2024</year><month>04</month><volume>3</volume><issue>4</issue><fpage>e0000489</fpage><pub-id pub-id-type="doi">10.1371/journal.pdig.0000489</pub-id><pub-id pub-id-type="medline">38625843</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Daulat</surname><given-names>SR</given-names> </name><name name-style="western"><surname>Dholaria</surname><given-names>N</given-names> </name><name name-style="western"><surname>Burnet</surname><given-names>G</given-names> </name><etal/></person-group><article-title>Prompt engineering and follow-up questioning improves the readability of spine surgery questions in large language models</article-title><source>World 
Neurosurg</source><year>2025</year><month>11</month><volume>203</volume><fpage>124423</fpage><pub-id pub-id-type="doi">10.1016/j.wneu.2025.124423</pub-id><pub-id pub-id-type="medline">40889596</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Aljohani</surname><given-names>M</given-names> </name><name name-style="western"><surname>Hou</surname><given-names>J</given-names> </name><name name-style="western"><surname>Kommu</surname><given-names>S</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>X</given-names> </name></person-group><article-title>A comprehensive survey on the trustworthiness of large language models in healthcare</article-title><conf-name>Proceedings of the 2025 Findings of the Association for Computational Linguistics EMNLP&#x2019;25</conf-name><conf-date>Nov 4-9, 2025</conf-date><conf-loc>Suzhou, China</conf-loc><fpage>6720</fpage><lpage>6748</lpage><pub-id pub-id-type="doi">10.18653/v1/2025.findings-emnlp.356</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Kim</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Jeong</surname><given-names>H</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Medical hallucination in foundation models and their impact on healthcare</article-title><source>Health Systems and Quality Improvement</source><comment>Preprint posted online on 2025</comment><pub-id pub-id-type="doi">10.1101/2025.02.28.25323115</pub-id></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Zhang</surname><given-names>W</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>J</given-names> </name></person-group><article-title>Hallucination mitigation for retrieval-augmented large language models: a review</article-title><source>Mathematics</source><year>2025</year><volume>13</volume><issue>5</issue><fpage>856</fpage><pub-id pub-id-type="doi">10.3390/math13050856</pub-id></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bhattacharya</surname><given-names>R</given-names> </name></person-group><article-title>Strategies to mitigate hallucinations in large language models</article-title><source>AMA</source><year>2024</year><volume>10</volume><issue>1</issue><fpage>62</fpage><pub-id pub-id-type="doi">10.69554/NXXB8234</pub-id></nlm-citation></ref><ref id="ref49"><label>49</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wo&#x0142;k</surname><given-names>K</given-names> </name></person-group><article-title>Evaluating retrieval-augmented generation variants for clinical decision support: hallucination mitigation and secure on-premises deployment</article-title><source>Electronics (Basel)</source><year>2025</year><volume>14</volume><issue>21</issue><fpage>4227</fpage><pub-id pub-id-type="doi">10.3390/electronics14214227</pub-id></nlm-citation></ref><ref id="ref50"><label>50</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Agrawal</surname><given-names>G</given-names> </name><name name-style="western"><surname>Kumarage</surname><given-names>T</given-names> </name><name name-style="western"><surname>Alghamdi</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>H</given-names> 
</name></person-group><article-title>Can knowledge graphs reduce hallucinations in LLMs?: a survey</article-title><conf-name>Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics</conf-name><conf-date>Jun 16-21, 2024</conf-date><conf-loc>Mexico City, Mexico</conf-loc><fpage>3947</fpage><lpage>3960</lpage><pub-id pub-id-type="doi">10.18653/v1/2024.naacl-long.219</pub-id></nlm-citation></ref><ref id="ref51"><label>51</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Deng</surname><given-names>H</given-names> </name><name name-style="western"><surname>Ou</surname><given-names>J</given-names> </name><name name-style="western"><surname>Feng</surname><given-names>C</given-names> </name></person-group><article-title>Mitigating spatial hallucination in large language models for path planning via prompt engineering</article-title><source>Sci Rep</source><year>2025</year><volume>15</volume><issue>1</issue><fpage>8881</fpage><pub-id pub-id-type="doi">10.1038/s41598-025-93601-5</pub-id></nlm-citation></ref><ref id="ref52"><label>52</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Busch</surname><given-names>F</given-names> </name><name name-style="western"><surname>Hoffmann</surname><given-names>L</given-names> </name><name name-style="western"><surname>Rueger</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Current applications and challenges in large language models for patient care: a systematic review</article-title><source>Commun Med (Lond)</source><year>2025</year><month>01</month><day>21</day><volume>5</volume><issue>1</issue><fpage>26</fpage><pub-id pub-id-type="doi">10.1038/s43856-024-00717-2</pub-id><pub-id 
pub-id-type="medline">39838160</pub-id></nlm-citation></ref><ref id="ref53"><label>53</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Ferrag</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Tihanyi</surname><given-names>N</given-names> </name><name name-style="western"><surname>Debbah</surname><given-names>M</given-names> </name></person-group><article-title>From LLM reasoning to autonomous AI agents: a comprehensive review</article-title><source>arXiv</source><comment>Preprint posted online on Apr 28, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2504.19678</pub-id></nlm-citation></ref><ref id="ref54"><label>54</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>AlSaad</surname><given-names>R</given-names> </name><name name-style="western"><surname>Abd-Alrazaq</surname><given-names>A</given-names> </name><name name-style="western"><surname>Boughorbel</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Multimodal large language models in health care: applications, challenges, and future outlook</article-title><source>J Med Internet Res</source><year>2024</year><month>09</month><day>25</day><volume>26</volume><fpage>e59505</fpage><pub-id pub-id-type="doi">10.2196/59505</pub-id><pub-id pub-id-type="medline">39321458</pub-id></nlm-citation></ref><ref id="ref55"><label>55</label><nlm-citation citation-type="web"><article-title>Reddit user age, gender, &#x0026; demographics</article-title><source>Exploding Topics</source><year>2025</year><access-date>2025-11-26</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://explodingtopics.com/blog/reddit-users">https://explodingtopics.com/blog/reddit-users</ext-link></comment></nlm-citation></ref><ref id="ref56"><label>56</label><nlm-citation citation-type="book"><person-group 
person-group-type="author"><name name-style="western"><surname>Calloway</surname><given-names>C</given-names> </name></person-group><source>Why do different LLMs give different answers to the same question? Model uncertainty and variability in LLM-based intrusion detection systems ranking</source><year>2025</year><access-date>2024-05-20</access-date><publisher-name>Norfolk State University</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://digitalcommons.odu.edu/cgi/viewcontent.cgi?article=1123&#x0026;context=covacci-undergraduateresearch">https://digitalcommons.odu.edu/cgi/viewcontent.cgi?article=1123&#x0026;context=covacci-undergraduateresearch</ext-link></comment></nlm-citation></ref><ref id="ref57"><label>57</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Yona</surname><given-names>G</given-names> </name><name name-style="western"><surname>Aharoni</surname><given-names>R</given-names> </name><name name-style="western"><surname>Geva</surname><given-names>M</given-names> </name></person-group><article-title>Can large language models faithfully express their intrinsic uncertainty in words?</article-title><conf-name>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing</conf-name><conf-date>Nov 12-16, 2024</conf-date><conf-loc>Miami, FL</conf-loc><fpage>632</fpage><lpage>702</lpage><pub-id pub-id-type="doi">10.18653/v1/2024.emnlp-main.443</pub-id></nlm-citation></ref><ref id="ref58"><label>58</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Fu</surname><given-names>T</given-names> </name><name name-style="western"><surname>Conde</surname><given-names>J</given-names> </name><name name-style="western"><surname>Mart&#x00ED;nez</surname><given-names>G</given-names> </name><name name-style="western"><surname>Grandury</surname><given-names>M</given-names> </name><name 
name-style="western"><surname>Reviriego</surname><given-names>P</given-names> </name></person-group><article-title>Multiple choice questions: reasoning makes large language models (LLMs) more self-confident even when they are wrong</article-title><source>arXiv</source><comment>Preprint posted online on Jan 16, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2501.09775</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Patient Education Materials Assessment Tool for Artificial Intelligence for evaluating the understandability (8 items) and actionability (3 items) of artificial intelligence&#x2013;generated patient education text.</p><media xlink:href="medinform_v14i1e78838_app1.docx" xlink:title="DOCX File, 18 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>DISCERN-AI tool (7 core items) for assessing artificial intelligence&#x2013;generated treatment information quality, with 1-3 scoring for each item (relevance, source clarity, date transparency, balance, additional support, uncertainty acknowledgment, and overall quality).</p><media xlink:href="medinform_v14i1e78838_app2.docx" xlink:title="DOCX File, 21 KB"/></supplementary-material><supplementary-material id="app3"><label>Multimedia Appendix 3</label><p>Natural Language Assessment Tool for Artificial Intelligence assessment framework: 5 domains (accuracy, safety, appropriateness, actionability, and effectiveness).</p><media xlink:href="medinform_v14i1e78838_app3.docx" xlink:title="DOCX File, 24 KB"/></supplementary-material><supplementary-material id="app4"><label>Multimedia Appendix 4</label><p>Reference Evaluation for AI assessment criteria: 3-item tool for validating artificial intelligence&#x2013;generated references (real or fabricated, content support, and authoritative source status) on large language model reference hallucinations.</p><media 
xlink:href="medinform_v14i1e78838_app4.docx" xlink:title="DOCX File, 20 KB"/></supplementary-material><supplementary-material id="app5"><label>Multimedia Appendix 5</label><p>Supplementary tables for health care website&#x2013;derived scar or keloid questions: includes 38 unique questions from 3 medical websites (Table S1), intraclass correlation coefficient values for assessment tools (Table S2), and artificial intelligence output evaluation scores (Patient Education Materials Assessment Tool for Artificial Intelligence, DISCERN-AI, Global Quality Scale, Natural Language Assessment Tool for Artificial Intelligence, readability, and reference quality) for website questions (Tables S3-S6).</p><media xlink:href="medinform_v14i1e78838_app5.docx" xlink:title="DOCX File, 23 KB"/></supplementary-material><supplementary-material id="app6"><label>Multimedia Appendix 6</label><p>Supplementary tables for Reddit-derived scar or keloid questions: includes subcategory-specific artificial intelligence output evaluation scores for all 16 question groups (Patient Education Materials Assessment Tool for Artificial Intelligence, DISCERN-AI, or Global Quality Scale: Table S1; Natural Language Assessment Tool for Artificial Intelligence: Table S2; and readability metrics: Table S3) and subcategory-specific reference evaluation results (Reference Evaluation for AI) for 3250 total cited references (Table S4), plus overall aggregate scores for all assessments.</p><media xlink:href="medinform_v14i1e78838_app6.docx" xlink:title="DOCX File, 25 KB"/></supplementary-material></app-group></back></article>