<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id><journal-id journal-id-type="publisher-id">medinform</journal-id><journal-id journal-id-type="index">7</journal-id><journal-title>JMIR Medical Informatics</journal-title><abbrev-journal-title>JMIR Med Inform</abbrev-journal-title><issn pub-type="epub">2291-9694</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v14i1e86474</article-id><article-id pub-id-type="doi">10.2196/86474</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Quality of Clinical Notes Created by Ambient Listening Generative AI: Pragmatic Prospective Pilot Study</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Taylor</surname><given-names>Sandra L</given-names></name><degrees>PhD, MS</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Jost</surname><given-names>Melissa</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>MacDonald</surname><given-names>Scott</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Ren</surname><given-names>Yunyi</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Hilton</surname><given-names>Shelley</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Davenport</surname><given-names>Sadie</given-names></name><degrees>MLIS</degrees><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Aizenberg</surname><given-names>Debbie</given-names></name><degrees>MD, MBA</degrees><xref ref-type="aff" rid="aff6">6</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Hall</surname><given-names>Bruce</given-names></name><degrees>MD, MBA, PhD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Lyles</surname><given-names>Courtney R</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Adams</surname><given-names>Jason Y</given-names></name><degrees>MS, MD</degrees><xref ref-type="aff" rid="aff7">7</xref><xref ref-type="aff" rid="aff8">8</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Public Health Sciences, School of Medicine, University of California, Davis</institution><addr-line>4480 2nd Avenue, Suite 4152</addr-line><addr-line>Sacramento</addr-line><addr-line>CA</addr-line><country>United States</country></aff><aff id="aff2"><institution>Clinical and Translational Science Center, University of California, Davis</institution><addr-line>Sacramento</addr-line><addr-line>CA</addr-line><country>United States</country></aff><aff id="aff3"><institution>Department of Clinical Informatics, 
University of California, Davis</institution><addr-line>Sacramento</addr-line><addr-line>CA</addr-line><country>United States</country></aff><aff id="aff4"><institution>IT Clinical Applications, University of California, Davis</institution><addr-line>Sacramento</addr-line><addr-line>CA</addr-line><country>United States</country></aff><aff id="aff5"><institution>Blaisdell Medical Library, University of California, Davis</institution><addr-line>Sacramento</addr-line><addr-line>CA</addr-line><country>United States</country></aff><aff id="aff6"><institution>Department of Otolaryngology, Head and Neck Surgery, School of Medicine, University of California, Davis</institution><addr-line>Sacramento</addr-line><addr-line>CA</addr-line><country>United States</country></aff><aff id="aff7"><institution>Department of Internal Medicine, School of Medicine, University of California, Davis</institution><addr-line>Sacramento</addr-line><addr-line>CA</addr-line><country>United States</country></aff><aff id="aff8"><institution>IT Enterprise Analytics and Data Services, University of California, Davis</institution><addr-line>Sacramento</addr-line><addr-line>CA</addr-line><country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Benis</surname><given-names>Arriel</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Lisi</surname><given-names>Anthony J</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Chartash</surname><given-names>David</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Sandra L Taylor, PhD, MS, Department of Public Health Sciences, School of Medicine, University of California, Davis, 4480 2nd Avenue, Suite 4152, Sacramento, CA, 95817, United States, 1 916-734-4800; <email>sltaylor@health.ucdavis.edu</email></corresp></author-notes><pub-date 
pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>17</day><month>4</month><year>2026</year></pub-date><volume>14</volume><elocation-id>e86474</elocation-id><history><date date-type="received"><day>24</day><month>10</month><year>2025</year></date><date date-type="rev-recd"><day>29</day><month>01</month><year>2026</year></date><date date-type="accepted"><day>06</day><month>03</month><year>2026</year></date></history><copyright-statement>&#x00A9; Sandra L Taylor, Melissa Jost, Scott MacDonald, Yunyi Ren, Shelley Hilton, Sadie Davenport, Debbie Aizenberg, Bruce Hall, Courtney R Lyles, Jason Y Adams. Originally published in JMIR Medical Informatics (<ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org">https://medinform.jmir.org</ext-link>), 17.4.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org/">https://medinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://medinform.jmir.org/2026/1/e86474"/><abstract><sec><title>Background</title><p>Physicians routinely document specifics of patient encounters in clinic visit notes, a critical but potentially time-consuming task. 
Ambient listening artificial intelligence (AI) technology is being integrated into clinical workflows to reduce documentation burden by creating draft visit notes. While this technology is promising, it is not perfect, and the potential for patient harm needs to be understood and mitigated. We developed and piloted an efficient, standardized approach to evaluating AI-generated notes for safety concerns in ambulatory care visits.</p></sec><sec><title>Objective</title><p>The objective of this quality improvement project was to develop and pilot an efficient, standardized, and scalable approach to evaluating AI-generated notes for safety concerns in ambulatory care visits.</p></sec><sec sec-type="methods"><title>Methods</title><p>During a 2-month pilot (July to August 2024), 31 physicians across multiple specialties used an ambient listening AI scribe to assist with the creation of 7545 clinic notes. A novel survey instrument was developed to assess note quality, focusing on 4 error types: accidental inclusions, accidental omissions, hallucinations, and bias. Physicians evaluated 356 (4.7%) AI-generated notes. Where an error was present, physicians rated its severity based on its potential to cause patient harm if it was not corrected, on a 0 to 5 scale. Additionally, a vendor-reported metric on the percentage of note content edited by physicians was analyzed.</p></sec><sec sec-type="results"><title>Results</title><p>Of the 356 evaluated notes, accidental omissions were the most frequent error (n=64, 18%), followed by hallucinations (n=41, 11.5%) and accidental inclusions (n=33, 9.3%). Bias was rare (n=4, 1.1%). Most (119/142, 83.8%) errors were rated as mild to moderate (severity 1&#x2010;3), with only 19 (5.3%) notes containing errors rated as posing serious or imminent risk (severity 4&#x2010;5). 
Editing metrics across all AI-created notes showed a median of 9.0% (IQR 2.5%-21.9%) of AI-generated words were changed, with 14.9% (143/960) of notes left entirely unedited. Physician editing practices varied widely, with average percentages of AI-generated words changed ranging from 1.9% to 69.3% (median 9.0%, IQR 2.5%-21.9%).</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>AI-generated clinical notes were generally of high quality, with 94.7% (337/356) free from significant errors. However, because a small number contained errors that carried the risk of serious harm if not corrected, careful clinician review of notes remains imperative. Prior to deploying an AI scribe, organizations should pilot the technology and include an efficient review process to understand the nature and type of errors common at their organization. This pilot provides a scalable model for other health systems seeking to implement AI scribe technology responsibly.</p></sec></abstract><kwd-group><kwd>generative artificial intelligence</kwd><kwd>generative AI</kwd><kwd>artificial intelligence scribe</kwd><kwd>AI scribe</kwd><kwd>clinical documentation</kwd><kwd>ambient listening</kwd><kwd>quality of care</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Generative artificial intelligence (AI) is infiltrating and transforming many aspects of medicine, with specific emphasis on clinician-based tasks such as note creation and after-visit summaries, drafting responses to patient messages, and summarizing patient charts&#x2014;all of which could reduce the clinical workload [<xref ref-type="bibr" rid="ref1">1</xref>]. The specific uptake of AI scribes for note creation has some of the highest rates of implementation in the United States. 
These AI scribes use ambient listening generative AI technology to generate draft notes [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref3">3</xref>], capturing patient-physician conversations during encounters and converting them to documentation that is reviewed, modified, and approved by the physician. To date, the literature related to AI scribes has signaled improvements in time per note for physicians, with a potential for reducing workload and burnout [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>].</p><p>While capitalizing on the potential benefits of AI scribes, the potential for patient harm also needs to be understood and mitigated. Generative AI is not perfect, and errors embedded in a patient&#x2019;s medical record could impact care [<xref ref-type="bibr" rid="ref6">6</xref>]. Indeed, concerns have been raised about the use of generative AI to create physician responses to patient inquiries [<xref ref-type="bibr" rid="ref7">7</xref>], and more broadly, there is a need for rigorous validation of large language models in clinical practice [<xref ref-type="bibr" rid="ref8">8</xref>]. Real-world validation of generative AI is essential to avoiding patient harm [<xref ref-type="bibr" rid="ref4">4</xref>], but efficient assessment tools that capture the most critical errors are not yet widely available.</p><p>Therefore, as part of a quality improvement pilot program using an ambient listening AI scribe to assist in preparing clinic visit notes, the University of California, Davis Health (UCDH) developed a novel survey instrument to assess the quality of AI-generated notes and identify errors with potential to impact patient care and outcomes. 
The overall goal of the quality assessment was to ensure patient safety prior to widespread deployment of the AI scribe, with the following specific objectives: develop an efficient, scalable, and standardized approach to assess the quality of AI scribe&#x2013;generated clinical visit notes; identify the type and frequency of errors introduced by the AI scribe; quantify the severity of different errors and potential for patient harm; and develop a long-term monitoring approach.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Study Design</title><p>The UCDH system is an academic medical center with a 653-bed teaching hospital and the only level I adult and pediatric trauma center in inland Northern California, serving a 33-county region of about 6 million residents with more than 1.9 million patient encounters annually. UCDH conducted an ambulatory care pilot program evaluating use of an AI scribe tool to assist with preparation of clinic visit notes. The pilot program ran for 2 months (July to August 2024), during which the physicians in the pilot had the option of using the ambient listening tool. We first summarized the total number of notes created by the AI scribe technology during the pilot period, overall and by physician, followed by a secondary subanalysis of AI scribe note quality.</p></sec><sec id="s2-2"><title>Ethical Considerations</title><p>This study was determined by the University of California, Davis institutional review board to not be human subjects research and therefore was exempt from review (2367684&#x2010;1). Because this study was exempt from review and patient information was aggregated, study-specific informed consent and privacy protections were not necessary. 
Physicians were not compensated for participating in the pilot program.</p></sec><sec id="s2-3"><title>Note Quality Assessment</title><p>All physicians in the pilot agreed to assess the quality of draft notes produced by the AI scribe technology using a novel standardized assessment instrument for a subset of their clinic encounters. To minimize assessment burden, physicians were asked to evaluate 10 draft notes on 2 different days during a 3-week period. This number was selected to represent most notes in their outpatient practice on these selected days, and each clinician received regular reminders during the pilot to complete their assessments. Note types included history and physical notes and progress notes; note type was not captured as part of the assessment.</p><p>Detailed quality ratings by physicians in the pilot were captured using a novel standardized assessment instrument accessible via an online Qualtrics (Qualtrics, LLC) survey (Figure S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). The survey was developed in collaboration with our institution&#x2019;s AI Oversight Committee and included four components of quality deemed most relevant to patient safety, specifically whether the AI scribe note had the following: (1) accidental inclusions, (2) accidental omissions, (3) hallucinations, or (4) bias in the draft language (<xref ref-type="other" rid="box1">Textbox 1</xref>). These components were informed by the Physician Documentation Quality Instrument (PDQI-9) and known deficiencies in AI and large language model summarization tools [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref10">10</xref>]. On the survey, physicians reported if any of these types of errors were present. 
If so, the physician rated the severity of the error on a scale of 0 to 5, with 0 representing negligible risk to the patient, physician, or health system and 5 representing potential for serious and imminent risk of harm. Within this guidance, severity ratings were based on individual physicians&#x2019; clinical judgment in the context of the visit. <xref ref-type="table" rid="table1">Table 1</xref> provides examples of errors reported for each severity rating level.</p><boxed-text id="box1"><title> Components of note quality assessment.</title><p><bold>Accidental inclusion</bold></p><list list-type="bullet"><list-item><p>Does the draft note contain inaccurate, real information discussed in the visit that was accidentally included or misattributed by the AI?</p></list-item></list><p><bold>Accidental omission</bold></p><list list-type="bullet"><list-item><p>Does the draft omit information that you would have included in the note if you had written/dictated it without the AI?</p></list-item></list><p><bold>Hallucination</bold></p><list list-type="bullet"><list-item><p>Does the draft note contain inaccurate, undiscussed, hallucinated information that was made up by the AI?</p></list-item></list><p><bold>Bias</bold></p><list list-type="bullet"><list-item><p>Does the note appear to either include or omit information that might increase the risk that vulnerable populations/protected classes of patients are treated unfairly?</p></list-item></list></boxed-text><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Examples of errors in artificial intelligence (AI)&#x2013;generated notes at each severity rating level.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Severity rating</td><td align="left" valign="bottom">Examples of AI documentation mistakes</td></tr></thead><tbody><tr><td align="char" char="." 
valign="top">0</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Misattributed a medication dosage statement to the parent, although it was part of the clinician&#x2019;s discussion of risks and benefits</p></list-item><list-item><p>Inserted advice on substance abuse that the clinician did not discuss</p></list-item></list></td></tr><tr><td align="char" char="." valign="top">1</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Included irrelevant details (eg, ability to use computer)</p></list-item><list-item><p>Attributed a medication side effect comment to the patient when it was stated by the clinician</p></list-item><list-item><p>Missed noting supplement use and risk-benefit discussion</p></list-item><list-item><p>Missed noting spine clinic appointment (instead wrote referral would be placed)</p></list-item></list></td></tr><tr><td align="char" char="." valign="top">2</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Electrocardiogram results reported as echocardiogram results</p></list-item><list-item><p>Incorrect timeline for COVID-19 exposure</p></list-item><list-item><p>Ovarian adenoma incorrectly documented rather than ovarian thecoma or fibroma</p></list-item><list-item><p>Medication instructions simplified incorrectly (&#x201C;take for one month&#x201D; instead of conditional continuation)</p></list-item><list-item><p>Anxiety medication documented for the patient instead of the spouse</p></list-item><list-item><p>Misinterpretation of recommended blood test timing (&#x201C;have blood drawn in 1 week&#x201D; instead of 1 week before follow-up)</p></list-item></list></td></tr><tr><td align="char" char="." 
valign="top">3</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Included history of osteoporosis without supporting evidence</p></list-item><list-item><p>Attributed discontinuation of a medication stopped years ago to one stopped yesterday</p></list-item><list-item><p>Incorrectly stated that the patient was on methadone</p></list-item><list-item><p>Missed discussion on home safety and assisted living recommendation</p></list-item><list-item><p>Added the phrase &#x201C;increase dose if symptoms improve,&#x201D; which was contrary to the documented treatment plan</p></list-item></list></td></tr><tr><td align="char" char="." valign="top">4</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Incorrectly added directive to start aspirin for ankle pain, which was not the proposed plan</p></list-item><list-item><p>Incorrectly stated that the patient has diabetes</p></list-item><list-item><p>Did not include the patient&#x2019;s discussion of weight loss, fatigue, and medication change</p></list-item><list-item><p>Improper medication dose documented in plan</p></list-item></list></td></tr><tr><td align="char" char="." valign="top">5</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Attributed a history provided by the patient to an incorrect diagnosis</p></list-item><list-item><p>Incorrectly stated that the patient should continue current metformin dosing</p></list-item><list-item><p>Did not document computed tomography scan discussed by the clinician</p></list-item></list></td></tr></tbody></table></table-wrap><p>Using these surveys, we summarized the proportion of each error type during the pilot. Furthermore, we calculated a quality score as follows. For each component of quality, a score ranging from 0 to 1 was calculated as 1 &#x2013; (0.2 &#x00D7; severity) such that higher severity errors resulted in higher point deductions. 
The overall quality score was then calculated as the sum of the 4 component scores, yielding a score ranging from 0 (lowest quality) to 4 (highest quality). We also linked these quality ratings to patient characteristics for each note to report on the patient distribution in the sample.</p><p>The time between the clinic visit and the quality assessment review was calculated as the difference between the time that the AI scribe recording ended and when the physician started to review the note. Data for the quality assessment were collected separately from the AI scribe recording time data, which came from the vendor. We were not able to link the quality assessments and AI scribe data for 9 visits, and recording times were not available for 4 of the assessed notes.</p><p>For each note, the vendor calculated a measure of the percentage of the note that was retained, calculated as 1 &#x2013; (number of word-level edits / number of words in the final completed note). Word-level edits are defined as additions, substitutions, and deletions. The vendor provided the average percentage retained for each physician during the pilot period. Due to technical issues, this measure could only be obtained at the note level from the vendor for 960 notes. We converted the percentage retained metric to the percentage of words changed.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><p>A total of 31 physicians volunteered for the pilot program. Specialties of physicians in the pilot program included family practice (n=12, 38.7%); internal medicine (n=11, 35.5%); otolaryngology (n=3, 9.7%); pediatrics (n=2, 6.5%); and dermatology, obstetrics and gynecology, and endocrinology (n=1 each, 3.2%).</p><sec id="s3-1"><title>Quality Assessment</title><p>Over the pilot period, a total of 7545 notes were generated using the AI scribe by the 31 physicians in the pilot program. 
There was substantial variability in the number of notes generated across physicians, with a median (IQR) of 169.0 (42.5-384.5) notes, ranging from 10 to 809 (<xref ref-type="fig" rid="figure1">Figure 1</xref>).</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>(A) Total number of notes generated during pilot period by each physician and (B) average percentage of words changed in artificial intelligence (AI) scribe&#x2013;generated notes by each physician. Due to technical issues, the percentage of words changed was not recorded for some physicians.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e86474_fig01.png"/></fig><p>Physicians evaluated the quality of 356 (4.7%) of 7545 notes, with a median number of notes evaluated by a physician of 11.0 (IQR 8.5-13.0). Family medicine and internal medicine physicians evaluated most of the notes, 182 (51.1%) and 115 (32.3%), respectively, as these specialties comprised three-quarters (23/31, 74.2%) of the participating physicians. Entering these data for the quality assessments into Qualtrics required a median of only 38.0 (IQR 18.0-96.2) seconds. Physicians usually conducted the quality assessments on the same day as the clinic visit, yielding a median time between the clinic visit and the quality assessment of 3.9 (IQR 1.6-21.1) hours. 
Among these 356 notes that were scored by physicians, aggregated patient characteristics are summarized in Table S1 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>, demonstrating a wide representation among patients seen in the health care system.</p></sec><sec id="s3-2"><title>Error Frequency and Severity</title><p>Among the 356 notes, accidental omissions were the most prevalent quality concern, occurring in 18% (n=64) of the notes, followed by hallucinations (n=41, 11.5%) and accidental inclusions (n=33, 9.3%; <xref ref-type="table" rid="table2">Table 2</xref>). Bias was only reported in 4 notes (1.1%). In general, quality concerns tied to omissions, hallucinations, or accidental inclusions were rated as mild to moderate (severity ratings of 1&#x2010;3). Severity scores of 4 to 5, representing potentially serious or imminent risk of harm if draft errors were not corrected by clinicians, were infrequent&#x2014;with only 7 (2%) draft notes in this range for accidental omissions, 9 (2.5%) for hallucinations, 5 (1.4%) for accidental inclusions, and 2 (0.6%) for bias.</p><p>Finally, the combined note quality score (accounting for severity) was high, with a median quality score of 4.0 (IQR 3.8-4.0). Two-thirds (n=242, 68%) of the notes received a score of 4.0, indicating no quality concerns. A small number of notes (n=11, 3.1%) had an overall score less than 3. 
Note quality was similar across the specialties (<xref ref-type="table" rid="table3">Table 3</xref>; Table S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Summary of concern severity ratings for quality assessments of visit notes (N=356).</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Severity score</td><td align="left" valign="bottom">Inclusion, n (%)</td><td align="left" valign="bottom">Omission, n (%)</td><td align="left" valign="bottom">Hallucination, n (%)</td><td align="left" valign="bottom">Bias, n (%)</td></tr></thead><tbody><tr><td align="left" valign="top">5 (severe)</td><td align="left" valign="top">1 (0.3)</td><td align="left" valign="top">3 (0.8)</td><td align="left" valign="top">3 (0.8)</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td></tr><tr><td align="left" valign="top">4</td><td align="left" valign="top">4 (1.1)</td><td align="left" valign="top">4 (1.1)</td><td align="left" valign="top">6 (1.7)</td><td align="left" valign="top">2 (0.6)</td></tr><tr><td align="left" valign="top">3</td><td align="left" valign="top">9 (2.5)</td><td align="left" valign="top">12 (3.4)</td><td align="left" valign="top">14 (3.9)</td><td align="left" valign="top">1 (0.3)</td></tr><tr><td align="left" valign="top">2</td><td align="left" valign="top">7 (2.0)</td><td align="left" valign="top">17 (4.8)</td><td align="left" valign="top">10 (2.8)</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">1</td><td align="left" valign="top">12 (3.4)</td><td align="left" valign="top">28 (7.9)</td><td align="left" valign="top">8 (2.2)</td><td align="left" valign="top">1 (0.3)</td></tr><tr><td align="left" valign="top">0 (no concerns)</td><td align="left" valign="top">323 (90.7)</td><td align="left" valign="top">292 
(82.0)</td><td align="left" valign="top">315 (88.5)</td><td align="left" valign="top">352 (98.9)</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>No notes were identified with bias with this severity score.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Summaries of total scores from the note quality assessments and the percentage of each note edited, as reported by the vendor, by specialty.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Specialty<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td><td align="left" valign="bottom" colspan="2">Total score (n=356)</td><td align="left" valign="bottom" colspan="2">Percentage edited (n=960)</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Notes, n (%)</td><td align="left" valign="top">Median (IQR; range)</td><td align="left" valign="top">Notes, n (%)<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td><td align="left" valign="top">Median (IQR; range)</td></tr></thead><tbody><tr><td align="left" valign="top">Dermatology</td><td align="left" valign="top">11 (3.1)</td><td align="left" valign="top">4.00 (3.50-4.00; 2.60&#x2010;4.00)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup></td></tr><tr><td align="left" valign="top">Family practice</td><td align="left" valign="top">182 (51.1)</td><td align="left" valign="top">4.00 (3.80-4.00; 1.80&#x2010;4.00)</td><td align="left" valign="top">571 (59.5)</td><td align="left" valign="top">8.3 (2.4-17.3; 0.0&#x2010;63.5)</td></tr><tr><td align="left" valign="top">Internal medicine</td><td align="left" valign="top">115 (32.3)</td><td align="left" valign="top">4.00 (3.60-4.00; 1.40&#x2010;4.00)</td><td align="left" valign="top">364 (37.9)</td><td align="left" valign="top">9.5 (2.1-25.5; 
0.0&#x2010;100.0)</td></tr><tr><td align="left" valign="top">Obstetrics and gynecology</td><td align="left" valign="top">4 (1.1)</td><td align="left" valign="top">4.00 (3.80-4.00; 3.20&#x2010;4.00)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Otolaryngology</td><td align="left" valign="top">26 (7.3)</td><td align="left" valign="top">4.00 (4.00-4.00; 4.00&#x2010;4.00)</td><td align="left" valign="top">21 (2.2)</td><td align="left" valign="top">34.4 (21.2-46.8; 8.9&#x2010;91.2)</td></tr><tr><td align="left" valign="top">Pediatrics</td><td align="left" valign="top">18 (5.1)</td><td align="left" valign="top">3.80 (3.45-4.00; 3.00&#x2010;4.00)</td><td align="left" valign="top">4 (0.4)</td><td align="left" valign="top">22.4 (16.2-33.0; 10.7&#x2010;51.7)</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>No quality assessments were conducted by the endocrinologist.</p></fn><fn id="table3fn2"><p><sup>b</sup>Due to technical difficulties with the data from the vendor, the percentage edited was only obtained at the note level for a portion of all notes and was not available for any notes for dermatology, obstetrics and gynecology, or endocrinology.</p></fn><fn id="table3fn3"><p><sup>c</sup>Percentage edited was not available for any artificial intelligence&#x2013;generated notes for this specialty.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-3"><title>Percentage Edited</title><p>Of the 7545 AI-created notes, the vendor metric of percentage of words retained was reported for 960 (12.7%) notes. Most of the AI-drafted note content was left unedited by physicians, with a median 9.0% (IQR 2.5%-21.9%; <xref ref-type="fig" rid="figure2">Figure 2</xref>) of AI-generated words changed. Notably, 14.9% (143/960) of the AI-generated notes were not edited at all. 
Individual editing practices varied substantially by physician, with the average percentage of words changed by physician ranging from 1.9% to 69.3% (<xref ref-type="fig" rid="figure1">Figure 1</xref>). At the note level, the percentage edited was similar for family practice (median 8.3%, IQR 2.4%-17.3%) and internal medicine (median 9.5%, IQR 2.1%-25.5%) but higher for otolaryngology (median 34.4%, IQR 21.2%-46.8%) and pediatrics (median 22.4%, IQR 16.2%-33.0%; <xref ref-type="table" rid="table3">Table 3</xref>).</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Distribution of percentage of words changed in artificial intelligence&#x2013;generated clinic visit notes. The x-axis indicates the percentage of the note that was changed, and the y-axis indicates the number of notes with that level of editing.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e86474_fig02.png"/></fig></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><p>Consistent with other studies, we found that most (337/356, 94.7%) AI scribe platform&#x2013;generated notes were of high quality and free from significant errors [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref12">12</xref>]. However, accidental omissions were relatively common, identified in 18% (64/356) of the notes evaluated. Most errors were rated as low risk for patient harm; however, 5.3% (19/356) of AI-drafted notes contained an error deemed to pose a risk of serious harm if left uncorrected. Thus, careful clinician review of notes remains imperative.</p><p>Because we prioritized patient safety and specifically focused on identifying errors or bias in AI-generated notes, our evaluation tool to capture physician ratings was very brief, allowing rapid assessment of a large number of notes. This contrasts with the longer PDQI-9 instrument that evaluates overall document quality. 
Natural language processing metrics have also been used to evaluate note quality in previous studies [<xref ref-type="bibr" rid="ref13">13</xref>], but these metrics do not explicitly identify deviations that could impact patient safety. Our note quality assessment approach is therefore unique in rating the severity of risk posed to patients if the error is not corrected.</p><p>Several limitations were encountered in implementing and evaluating this program. First, this was a pilot program, and clinicians volunteered as early adopters of AI scribe technology within the health care system. Second, although the assessment rubric was short, most physicians in the pilot assessed fewer than the requested 20 notes, despite repeated outreach from the project team. Thus, our method of including participants in the pilot and their differential response rates could have biased our results. Third, because physicians reviewed notes from their own clinic visits, identification and rating of the severity of the errors could be biased due to the physician&#x2019;s individual perspective as well as the passage of time between the visit and note review. However, physicians tended to review notes the same day as the clinic visit, which would reduce recall bias. Finally, to achieve the objectives of our quality improvement project, we did not seek to develop a rigorously validated instrument. Severity ratings were subjective, based on individual physician&#x2019;s clinical judgment within the context of a particular patient, such that scores could vary among institutions and clinical departments, thus requiring local calibration. 
Future studies and improvements could include rigorous development of the instrument through multiple reviewer assessments of AI-generated notes relative to recorded transcripts, standardization of severity ratings, and external validation.</p><p>In the future, ongoing monitoring of the quality of AI-generated notes will be important to identify changes in the underlying algorithm or practices that could degrade note quality. While our assessment tool is time efficient, it still requires human review, which might impede its use for long-term monitoring. Importantly, we combined direct physicians&#x2019; ratings with vendor-reported metrics that are more easily obtainable, specifically the percentage of each note that was edited, as a secondary indicator of note quality. Using direct feedback and reporting from end users as well as automatically generated information from platforms will be important for a long-term monitoring approach. Given the occurrence of errors in AI-generated notes, monitoring trends in both the percentage of notes that are heavily edited and the percentage of notes that are not edited will facilitate early identification of changes in physician vigilance in note review. Finally, as our health care system moves toward broader implementation of an AI scribe platform, we are investigating the standardization of quality monitoring by correlating vendor metrics with physician ratings. Given that physician behavior can change over time when adopting AI tools into routine practice, and the performance of the tools themselves may drift, our health system has prioritized long-term monitoring of quality and safety.</p><p>In summary, we have outlined a pilot program that might be a generalizable model for other health care systems. Our focused priorities on quality and safety of AI-generated notes, while also maintaining feasibility in the evaluation, were essential for piloting and will likely remain so for broad-scale implementation. 
Furthermore, we continually engage with the AI vendor to inform their automated metrics that align with our ongoing evaluation priorities. Moving forward, we also plan to implement these approaches and our findings into clinician training in the use of AI technology. With the promise of AI, particularly tools aimed at reducing physician workload and burnout, there is a pathway for rapid deployment while maintaining high standards of patient care.</p></sec></body><back><notes><sec><title>Funding</title><p>Funding was provided by the National Center for Advancing Translational Sciences, National Institutes of Health, through grant UL1 TR001860.</p></sec><sec><title>Data Availability</title><p>The datasets generated or analyzed during this study are available from the corresponding author on reasonable request.</p></sec></notes><fn-group><fn fn-type="con"><p>Conceptualization: MJ, SM, CRL, JYA</p><p>Data curation: SH</p><p>Formal analysis: SLT, YR</p><p>Methodology: SLT, MJ, SM, JYA</p><p>Project administration: MJ, DA, BH</p><p>Resources: MJ, DA, BH</p><p>Supervision: SLT</p><p>Visualization: SLT, YR</p><p>Writing&#x2014;original draft: SLT, CRL, JYA</p><p>Writing&#x2014;review and editing: SLT, MJ, SM, SD, DA, BH, CRL, JYA</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">PDQI-9</term><def><p>Physician Documentation Quality Instrument</p></def></def-item><def-item><term id="abb3">UCDH</term><def><p>University of California, Davis Health</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>C</given-names> </name><name name-style="western"><surname>Vogt</surname><given-names>KA</given-names> 
</name><name name-style="western"><surname>Kumar</surname><given-names>S</given-names> </name></person-group><article-title>Prospects for AI clinical summarization to reduce the burden of patient chart review</article-title><source>Front Digit Health</source><year>2024</year><volume>6</volume><fpage>1475092</fpage><pub-id pub-id-type="doi">10.3389/fdgth.2024.1475092</pub-id><pub-id pub-id-type="medline">39575412</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tierney</surname><given-names>AA</given-names> </name><name name-style="western"><surname>Gayre</surname><given-names>G</given-names> </name><name name-style="western"><surname>Hoberman</surname><given-names>B</given-names> </name><etal/></person-group><article-title>Ambient artificial intelligence scribes to alleviate the burden of clinical documentation</article-title><source>NEJM Catal Innov Care Deliv</source><year>2024</year><month>02</month><day>21</day><volume>5</volume><issue>3</issue><pub-id pub-id-type="doi">10.1056/CAT.23.0404</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shah</surname><given-names>SJ</given-names> </name><name name-style="western"><surname>Devon-Sand</surname><given-names>A</given-names> </name><name name-style="western"><surname>Ma</surname><given-names>SP</given-names> </name><etal/></person-group><article-title>Ambient artificial intelligence scribes: physician burnout and perspectives on usability and documentation burden</article-title><source>J Am Med Inform Assoc</source><year>2025</year><month>02</month><day>1</day><volume>32</volume><issue>2</issue><fpage>375</fpage><lpage>380</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocae295</pub-id><pub-id pub-id-type="medline">39657021</pub-id></nlm-citation></ref><ref 
id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Balloch</surname><given-names>J</given-names> </name><name name-style="western"><surname>Sridharan</surname><given-names>S</given-names> </name><name name-style="western"><surname>Oldham</surname><given-names>G</given-names> </name><etal/></person-group><article-title>Use of an ambient artificial intelligence tool to improve quality of clinical documentation</article-title><source>Future Healthc J</source><year>2024</year><month>09</month><volume>11</volume><issue>3</issue><fpage>100157</fpage><pub-id pub-id-type="doi">10.1016/j.fhj.2024.100157</pub-id><pub-id pub-id-type="medline">39371531</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Albrecht</surname><given-names>M</given-names> </name><name name-style="western"><surname>Shanks</surname><given-names>D</given-names> </name><name name-style="western"><surname>Shah</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Enhancing clinical documentation with ambient artificial intelligence: a quality improvement survey assessing clinician perspectives on work burden, burnout, and job satisfaction</article-title><source>JAMIA Open</source><year>2025</year><month>02</month><day>21</day><volume>8</volume><issue>1</issue><fpage>ooaf013</fpage><pub-id pub-id-type="doi">10.1093/jamiaopen/ooaf013</pub-id><pub-id pub-id-type="medline">39991073</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mess</surname><given-names>SA</given-names> </name><name name-style="western"><surname>Mackey</surname><given-names>AJ</given-names> </name><name 
name-style="western"><surname>Yarowsky</surname><given-names>DE</given-names> </name></person-group><article-title>Artificial intelligence scribe and large language model technology in healthcare documentation: advantages, limitations, and recommendations</article-title><source>Plast Reconstr Surg Glob Open</source><year>2025</year><month>01</month><day>16</day><volume>13</volume><issue>1</issue><fpage>e6450</fpage><pub-id pub-id-type="doi">10.1097/GOX.0000000000006450</pub-id><pub-id pub-id-type="medline">39823022</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>S</given-names> </name><name name-style="western"><surname>Guevara</surname><given-names>M</given-names> </name><name name-style="western"><surname>Moningi</surname><given-names>S</given-names> </name><etal/></person-group><article-title>The effect of using a large language model to respond to patient messages</article-title><source>Lancet Digit Health</source><year>2024</year><month>06</month><volume>6</volume><issue>6</issue><fpage>e379</fpage><lpage>e381</lpage><pub-id pub-id-type="doi">10.1016/S2589-7500(24)00060-8</pub-id><pub-id pub-id-type="medline">38664108</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>de Hond</surname><given-names>A</given-names> </name><name name-style="western"><surname>Leeuwenberg</surname><given-names>T</given-names> </name><name name-style="western"><surname>Bartels</surname><given-names>R</given-names> </name><etal/></person-group><article-title>From text to treatment: the crucial role of validation for generative large language models in health care</article-title><source>Lancet Digit 
Health</source><year>2024</year><month>07</month><volume>6</volume><issue>7</issue><fpage>e441</fpage><lpage>e443</lpage><pub-id pub-id-type="doi">10.1016/S2589-7500(24)00111-0</pub-id><pub-id pub-id-type="medline">38906607</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Stetson</surname><given-names>PD</given-names> </name><name name-style="western"><surname>Bakken</surname><given-names>S</given-names> </name><name name-style="western"><surname>Wrenn</surname><given-names>JO</given-names> </name><name name-style="western"><surname>Siegler</surname><given-names>EL</given-names> </name></person-group><article-title>Assessing electronic note quality using the Physician Documentation Quality Instrument (PDQI-9)</article-title><source>Appl Clin Inform</source><year>2012</year><volume>3</volume><issue>2</issue><fpage>164</fpage><lpage>174</lpage><pub-id pub-id-type="doi">10.4338/aci-2011-11-ra-0070</pub-id><pub-id pub-id-type="medline">22577483</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gerke</surname><given-names>S</given-names> </name><name name-style="western"><surname>Simon</surname><given-names>DA</given-names> </name><name name-style="western"><surname>Roman</surname><given-names>BR</given-names> </name></person-group><article-title>Liability risks of ambient clinical workflows with artificial intelligence for clinicians, hospitals, and manufacturers</article-title><source>JCO Oncol Pract</source><year>2026</year><month>03</month><volume>22</volume><issue>3</issue><fpage>357</fpage><lpage>361</lpage><pub-id pub-id-type="doi">10.1200/OP-24-01060</pub-id><pub-id pub-id-type="medline">40749149</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Palm</surname><given-names>E</given-names> </name><name name-style="western"><surname>Manikantan</surname><given-names>A</given-names> </name><name name-style="western"><surname>Mahal</surname><given-names>H</given-names> </name><name name-style="western"><surname>Belwadi</surname><given-names>SS</given-names> </name><name name-style="western"><surname>Pepin</surname><given-names>ME</given-names> </name></person-group><article-title>Assessing the quality of AI-generated clinical notes: validated evaluation of a large language model ambient scribe</article-title><source>Front Artif Intell</source><year>2025</year><volume>8</volume><fpage>1691499</fpage><pub-id pub-id-type="doi">10.3389/frai.2025.1691499</pub-id><pub-id pub-id-type="medline">41199808</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cain</surname><given-names>CH</given-names> </name><name name-style="western"><surname>Davis</surname><given-names>AC</given-names> </name><name name-style="western"><surname>Broder</surname><given-names>B</given-names> </name><etal/></person-group><article-title>Quality assurance during the rapid implementation of an AI-assisted clinical documentation support tool</article-title><source>NEJM AI</source><year>2025</year><month>03</month><day>27</day><volume>2</volume><issue>4</issue><pub-id pub-id-type="doi">10.1056/AIcs2400977</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Gebauer</surname><given-names>S</given-names> </name></person-group><article-title>Benchmarking and datasets for ambient clinical documentation: a scoping review of existing frameworks and metrics for AI-assisted medical note 
generation</article-title><source>medRxiv</source><comment>Preprint posted online on Jan 29, 2025</comment><pub-id pub-id-type="doi">10.1101/2025.01.29.25320859</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Note quality survey user interface and error types by specialty.</p><media xlink:href="medinform_v14i1e86474_app1.docx" xlink:title="DOCX File, 191 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Patient demographics.</p><media xlink:href="medinform_v14i1e86474_app2.docx" xlink:title="DOCX File, 16 KB"/></supplementary-material></app-group></back></article>