<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id><journal-id journal-id-type="publisher-id">medinform</journal-id><journal-id journal-id-type="index">7</journal-id><journal-title>JMIR Medical Informatics</journal-title><abbrev-journal-title>JMIR Med Inform</abbrev-journal-title><issn pub-type="epub">2291-9694</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v13i1e68139</article-id><article-id pub-id-type="doi">10.2196/68139</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Transforming Informed Consent Generation Using Large Language Models: Mixed Methods Study</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Shi</surname><given-names>Qiming</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Luzuriaga</surname><given-names>Katherine</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Allison</surname><given-names>Jeroan J</given-names></name><degrees>MS, MD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Oztekin</surname><given-names>Asil</given-names></name><degrees>PhD</degrees><xref ref-type="aff" 
rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Faro</surname><given-names>Jamie M</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Lee</surname><given-names>Joy L</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Hafer</surname><given-names>Nathaniel</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>McManus</surname><given-names>Margaret</given-names></name><degrees>MPH</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Zai</surname><given-names>Adrian H</given-names></name><degrees>MD, PhD, MPH</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib></contrib-group><aff id="aff1"><institution>Center for Clinical and Translational Science, University of Massachusetts Chan Medical School</institution><addr-line>55 N Lake Ave</addr-line><addr-line>Worcester</addr-line><addr-line>MA</addr-line><country>United States</country></aff><aff id="aff2"><institution>Department of Population and Quantitative Health Sciences, University of Massachusetts Chan Medical School</institution><addr-line>Worcester</addr-line><addr-line>MA</addr-line><country>United States</country></aff><aff id="aff3"><institution>Manning School of Business, University of Massachusetts Lowell</institution><addr-line>Lowell</addr-line><addr-line>MA</addr-line><country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Lovis</surname><given-names>Christian</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name 
name-style="western"><surname>Trang</surname><given-names>Karen</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Elbattah</surname><given-names>Mahmoud</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Qiming Shi, MS, Center for Clinical and Translational Science, University of Massachusetts Chan Medical School, 55 N Lake Ave, Worcester, MA, 01655, United States, 1 508-856-1952; <email>qiming.shi@umassmed.edu</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>13</day><month>2</month><year>2025</year></pub-date><volume>13</volume><elocation-id>e68139</elocation-id><history><date date-type="received"><day>29</day><month>10</month><year>2024</year></date><date date-type="rev-recd"><day>02</day><month>01</month><year>2025</year></date><date date-type="accepted"><day>04</day><month>01</month><year>2025</year></date></history><copyright-statement>&#x00A9; Qiming Shi, Katherine Luzuriaga, Jeroan J Allison, Asil Oztekin, Jamie M Faro, Joy L Lee, Nathaniel Hafer, Margaret McManus, Adrian H Zai. Originally published in JMIR Medical Informatics (<ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org">https://medinform.jmir.org</ext-link>), 13.2.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org/">https://medinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://medinform.jmir.org/2025/1/e68139"/><abstract><sec><title>Background</title><p>Informed consent forms (ICFs) for clinical trials have become increasingly complex, often hindering participant comprehension and engagement due to legal jargon and lengthy content. The recent advances in large language models (LLMs) present an opportunity to streamline the ICF creation process while improving readability, understandability, and actionability.</p></sec><sec><title>Objectives</title><p>This study aims to evaluate the performance of the Mistral 8x22B LLM in generating ICFs with improved readability, understandability, and actionability. Specifically, we evaluate the model&#x2019;s effectiveness in generating ICFs that are readable, understandable, and actionable while maintaining the accuracy and completeness.</p></sec><sec sec-type="methods"><title>Methods</title><p>We processed 4 clinical trial protocols from the institutional review board of UMass Chan Medical School using the Mistral 8x22B model to generate key information sections of ICFs. A multidisciplinary team of 8 evaluators, including clinical researchers and health informaticians, assessed the generated ICFs against human-generated counterparts for completeness, accuracy, readability, understandability, and actionability. Readability, Understandability, and Actionability of Key Information indicators, which include 18 binary-scored items, were used to evaluate these aspects, with higher scores indicating greater accessibility, comprehensibility, and actionability of the information. 
Statistical analysis, including Wilcoxon rank sum tests and intraclass correlation coefficient calculations, was used to compare outputs.</p></sec><sec sec-type="results"><title>Results</title><p>LLM-generated ICFs demonstrated comparable performance to human-generated versions across key sections, with no significant differences in accuracy and completeness (<italic>P</italic>&#x003E;.10). The LLM outperformed human-generated ICFs in readability (Readability, Understandability, and Actionability of Key Information score of 76.39% vs 66.67%; Flesch-Kincaid grade level of 7.95 vs 8.38) and understandability (90.63% vs 67.19%; <italic>P</italic>=.02). The LLM-generated content achieved a perfect score in actionability compared with the human-generated version (100% vs 0%; <italic>P</italic>&#x003C;.001). Intraclass correlation coefficient for evaluator consistency was high at 0.83 (95% CI 0.64-1.03), indicating good reliability across assessments.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>The Mistral 8x22B LLM showed promising capabilities in enhancing the readability, understandability, and actionability of ICFs without sacrificing accuracy or completeness. LLMs present a scalable, efficient solution for ICF generation, potentially enhancing participant comprehension and consent in clinical trials.</p></sec></abstract><kwd-group><kwd>informed consent form</kwd><kwd>ICF</kwd><kwd>large language models</kwd><kwd>LLMs</kwd><kwd>clinical trials</kwd><kwd>readability</kwd><kwd>health informatics</kwd><kwd>artificial intelligence</kwd><kwd>AI</kwd><kwd>AI in health care</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Ethical codes and regulations have been established globally to guide researchers in conducting studies involving human subjects. In the United States, the Belmont Report and the Common Rule are key frameworks for ensuring ethical research practices. 
The Common Rule, formally known as the &#x201C;Basic Department of Health and Human Services (HHS) Policy for the Protection of Human Research Subjects,&#x201D; requires that participants receive comprehensive information about the study&#x2019;s purpose, allowing them to make informed and autonomous decisions about their participation [<xref ref-type="bibr" rid="ref1">1</xref>]. This process of obtaining informed consent is fundamental to responsible conduct in research involving human subjects [<xref ref-type="bibr" rid="ref2">2</xref>]. However, in recent years, the inclusion of mandatory scientific content, legal jargon, and increasing length has turned the informed consent form (ICF) into a barrier to study participation [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>].</p><p>Although many institutional review boards (IRBs) require investigators to develop documents written at the eighth-grade reading level [<xref ref-type="bibr" rid="ref5">5</xref>], research has found that research ICFs are frequently written at reading grade levels that far exceed readers&#x2019; abilities [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref5">5</xref>-<xref ref-type="bibr" rid="ref9">9</xref>].</p><p>In response to the increasing complexity and length of informed consent documentation, the Health and Human Services Office for Human Research Protections added a new requirement to the 2018 Common Rule, stipulating that ICFs must begin with &#x201C;a concise and focused presentation of the key information that is most likely to assist prospective subjects in understanding the reasons why one might or might not want to participate in the research&#x201D; and that it &#x201C;must be organized and presented in a way that facilitates comprehension&#x201D; [<xref ref-type="bibr" rid="ref1">1</xref>]. 
The Secretary&#x2019;s Advisory Committee on Human Research Protections has recommended conducting empirical research to guide the writing of the new key information section in light of the new consent requirement, ensuring that its goals are effectively met [<xref ref-type="bibr" rid="ref10">10</xref>].</p><p>With the advancement of large language models (LLMs), a possible solution to improving ICF has emerged. LLMs show significant potential in health informatics, including tasks such as named entity extraction [<xref ref-type="bibr" rid="ref11">11</xref>], patient trial matching [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref13">13</xref>], biomedical reasoning and classification [<xref ref-type="bibr" rid="ref14">14</xref>], prediction of admissions [<xref ref-type="bibr" rid="ref15">15</xref>], automation of administrative tasks [<xref ref-type="bibr" rid="ref16">16</xref>], and so forth. Studies have also shown that LLMs can effectively enhance the documentation of risks, benefits, and alternatives for common surgical procedures [<xref ref-type="bibr" rid="ref17">17</xref>]. The integration of LLMs in clinical workflows could significantly reduce administrative burden by automating labor-intensive tasks such as ICF creation. However, for successful implementation, models must not only improve readability and actionability but also align with current regulatory requirements and ethical guidelines.</p><p>The ability of LLMs to generate complex clinical trial ICFs from a research protocol remains unexplored. This paper aims to evaluate the performance of the Mistral 8x22B LLM in generating the key information sections of ICFs with improved readability, understandability, and actionability. Specifically, our objectives are to assess the model&#x2019;s effectiveness in producing ICFs that meet readability standards, enhance understandability, and support actionable content while maintaining accuracy and completeness. 
Furthermore, we hypothesize that LLM-generated ICFs outperform human-generated counterparts in readability, understandability, and actionability, without compromising on the accuracy or completeness of information.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Study Design</title><p>We sourced 4 research protocols from the UMass Chan IRB, along with their corresponding ICFs. These protocols were then processed by our LLM model to generate artificial intelligence (AI)&#x2013;generated ICFs, resulting in a total of 8 ICFs&#x2014;4 human-generated and 4 AI-generated. Each research protocol, along with its respective human and AI-generated ICFs, was randomly assigned to evaluators for assessment. We had 8 evaluators in total, ensuring that each protocol set was reviewed twice by 2 different evaluators. A multidisciplinary team of 8 evaluators, including health informaticians, clinical researchers, and physicians, was assembled to review the outputs. Importantly, the evaluators were not investigators in the clinical trials whose ICFs they assessed and were not directly affiliated with the specific studies under review. Care was also taken to ensure that the evaluators and the investigators for each protocol were from different departments within the institution. Furthermore, none of the evaluators were members of the IRB that reviewed and approved the protocols. These measures were implemented to minimize potential bias and ensure objective evaluation.</p><p>Each protocol set was evaluated by 2 different reviewers, ensuring comprehensive assessment. The evaluation focused on key criteria: completeness, accuracy, readability, understandability, and actionability of the generated content. To mitigate potential evaluator bias, we ensured that each protocol was randomly assigned and evaluated by multiple individuals from different disciplines. 
This multidisciplinary approach, combined with random assignment, reduces the risk of personal bias and ensures a more comprehensive assessment of both LLM and human-generated ICFs.</p></sec><sec id="s2-2"><title>Study Protocols</title><p>The 4 protocols included in this study were selected to ensure diversity in study design, therapeutic areas, and patient populations. This approach was aimed at evaluating the generalizability of LLM-generated ICFs across varied research contexts. <xref ref-type="table" rid="table1">Table 1</xref> summarizes the key attributes of the protocols.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Summary of study protocols.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Study title</td><td align="left" valign="bottom">Study type</td><td align="left" valign="bottom">Domain</td></tr></thead><tbody><tr><td align="left" valign="top">Kangaroo Mother Care Study</td><td align="left" valign="top">Qualitative study</td><td align="left" valign="top">Neonatology</td></tr><tr><td align="left" valign="top">Characterization of Oral Microbiome in Patients With Viral Respiratory Illness</td><td align="left" valign="top">Observational cohort study</td><td align="left" valign="top">Infectious Diseases, Microbiome</td></tr><tr><td align="left" valign="top">RADx Tech COVID-19 Test Us Study</td><td align="left" valign="top">Platform trial</td><td align="left" valign="top">Infectious Diseases, Diagnostics</td></tr><tr><td align="left" valign="top">Healthy at Home Pilot</td><td align="left" valign="top">Pilot feasibility trial</td><td align="left" valign="top">Pulmonology, Digital Health</td></tr></tbody></table></table-wrap></sec><sec id="s2-3"><title>LLM Model</title><p>We chose the Mistral 8x22B model, the latest offering from Mistral [<xref ref-type="bibr" rid="ref18">18</xref>], for several compelling reasons:</p><list list-type="order"><list-item><p>Large Context 
Window: With a 64K token context window, this model can manage extensive research protocols. It is ideal for accurately recalling information from large documents such as clinical trial research protocols.</p></list-item><list-item><p>Multilingual Fluency: The Mistral 8x22B excels in multiple languages, aligning with our objective of using LLMs to create ICFs that ensure fair recruitment and serve underrepresented populations. Producing ICFs in various languages, such as Spanish, is highly advantageous.</p></list-item><list-item><p>Open-Source License: The Mistral 8x22B is available under an Apache 2.0 open-source license [<xref ref-type="bibr" rid="ref19">19</xref>], allowing unrestricted deployment. This flexibility is beneficial as we roll out the final product.</p></list-item></list></sec><sec id="s2-4"><title>ICF Key Information Section Integration</title><p>We downloaded ICF templates from various institutions, including the University of California San Francisco, Yale University, Duke University, New York University, the University of Pennsylvania, Johns Hopkins University, Partners HealthCare, Stanford University, Vanderbilt University, and the UMass Chan Medical School. We then consolidated the key information section instructions provided by these institutions into a comprehensive format. To reinforce this format, we made modifications following the Readability, Understandability, and Actionability of Key Information (RUAKI) indicator, ensuring that our consolidated key information sections create more accessible information. The final version of this format serves as the key information section instruction input for the LLM model.</p></sec><sec id="s2-5"><title>Prompt Engineering</title><p>To create the ICF key information content, we used Mistral artificial intelligence (AI) in conjunction with prompt engineering guidance developed by the Research Informatics Core as part of a human-in-the-loop process. 
This team included the chief research information officer, 2 clinical data scientists, and an IRB officer. The prompt creation followed a backward design instructional approach [<xref ref-type="bibr" rid="ref20">20</xref>]. The consolidated key information section instructions were used to design the prompts. The data scientists then took this guidance and crafted prompts to align with these instructions.</p><p>We used a Least-to-Most approach to guide the AI through the process of creating the consent forms. This step-by-step approach ensured that the AI received small, manageable instructions at each stage, helping it produce more accurate and reliable outputs. By breaking down tasks into smaller steps rather than overwhelming the AI with multiple instructions at once, we reduced confusion and enhanced the quality of the AI-generated forms. After designing and developing the key information section, the output was rated and reviewed by the Research Informatics Core team. Based on their feedback, the prompts were edited to enhance the model&#x2019;s performance.</p><p>We started by creating the chatbot prompt detailed in Supplementary 1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> to extract relevant information for each key section from the research protocol. Next, as described in Supplementary 2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, we refined the output using RUAKI indicators. In Supplementary 3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, we adjusted the content to achieve Flesch-Kincaid grade levels below 8. 
Finally, in Supplementary 4 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, we formatted the output to align with our preferred forms, again guided by RUAKI indicators.</p></sec><sec id="s2-6"><title>Measurements of Accuracy and Completeness</title><p>To evaluate accuracy and completeness, we developed a scoring system based on recommendations from LeapFrog (VTech Group), The Joint Commission, the American College of Surgeons, and relevant available literature (<xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>) [<xref ref-type="bibr" rid="ref21">21</xref>-<xref ref-type="bibr" rid="ref24">24</xref>]. Key information sections, including study purpose, duration and procedures, risks and discomforts, benefits, and alternatives, were assessed as complete, incomplete, absent, or incorrect, with corresponding scores of 3, 2, 1, and 0, respectively.</p></sec><sec id="s2-7"><title>Measurements of Readability, Understandability, and Actionability</title><p>We used the RUAKI indicator to evaluate readability, understandability, and actionability presented in the key information section of ICFs [<xref ref-type="bibr" rid="ref25">25</xref>]. This indicator consists of 18 items, each assessed with a binary rating of &#x201C;yes&#x201D; (scored as 1) or &#x201C;no&#x201D; (scored as 0) (<xref ref-type="table" rid="table2">Table 2</xref>). To determine the final score, we sum the number of &#x201C;yes&#x201D; responses, divide by the total number of items (18), and multiply by 100 to yield a percentage score. The section score for readability, understandability, and actionability was derived by dividing the final score of each section by the total number of relevant items. 
A higher percentage indicates that the key information is more accessible, comprehensible, and actionable.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Readability, understandability, and actionability of key information evaluation criteria.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom" colspan="2">Category and item number</td><td align="left" valign="bottom">Description</td><td align="left" valign="bottom">Rating</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="4">Readability</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">1</td><td align="left" valign="top">Active voice: uses active verbs (eg, will use) rather than passive verbs (eg, will be used) all or most of the time, more than 90% of the time.</td><td align="left" valign="top">Yes=1/No=0</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">2</td><td align="left" valign="top">Word choice: avoids scientific jargon (eg, hypertension). 
Uses words readers are familiar with (eg, high blood pressure) all or most of the time, more than 90% of the time.</td><td align="left" valign="top">Yes=1/No=0</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">3</td><td align="left" valign="top">Topic definition: provides a definition of the main disease or topic the study is about.</td><td align="left" valign="top">Yes=1/No=0</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">4</td><td align="left" valign="top">Numbers: avoids mathematical calculations including comparison of numeric probability of risk.</td><td align="left" valign="top">Yes=1/No=0</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">5</td><td align="left" valign="top">Eighth grade or below: reading grade level calculated in Microsoft Word is Flesch-Kincaid grade level 8.9 or below.</td><td align="left" valign="top">Yes=1/No=0</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">6</td><td align="left" valign="top">Headers: sections or chunks of information are labeled with headers. Headers clearly describe sections so that readers can scan and find information.</td><td align="left" valign="top">Yes=1/No=0</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">7</td><td align="left" valign="top">Font type and size: font type or style is easy to read. Font size is at least 11&#x2010;12 point.</td><td align="left" valign="top">Yes=1/No=0</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">8</td><td align="left" valign="top">White space: uses bulleted or numbered lists to increase white space on the page.</td><td align="left" valign="top">Yes=1/No=0</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">9</td><td align="left" valign="top">Image: contains at least 1 image that is related to the topic of the study. 
Not a logo.</td><td align="left" valign="top">Yes=1/No=0</td></tr><tr><td align="left" valign="top" colspan="4">Understandability</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">10</td><td align="left" valign="top">Purpose of the study: includes a statement that says, &#x201C;the purpose of the study is&#x2026;&#x201D; Purpose of the study is stated, rather than implied.</td><td align="left" valign="top">Yes=1/No=0</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">11</td><td align="left" valign="top">Main reason to join the study&#x2014;benefits: includes description or list of potential benefits to participants or others.</td><td align="left" valign="top">Yes=1/No=0</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">12</td><td align="left" valign="top">Main reasons not to join the study&#x2014;risks: includes description or list of potential side effects or risks to participants.</td><td align="left" valign="top">Yes=1/No=0</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">13</td><td align="left" valign="top">Information being collected: describes the information that will be collected from participants and about participants.</td><td align="left" valign="top">Yes=1/No=0</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">14</td><td align="left" valign="top">Study procedures: describes what participants will need to do AND how much time it will take.</td><td align="left" valign="top">Yes=1/No=0</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">15</td><td align="left" valign="top">Study is research: includes a statement that says, &#x201C;study is research&#x201D; or &#x201C;research study&#x201D; not just consenting to treatment.</td><td align="left" valign="top">Yes=1/No=0</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">16</td><td align="left" valign="top">Participation is voluntary: states that 
participation is voluntary and that participants have a choice to be in the study or not.</td><td align="left" valign="top">Yes=1/No=0</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">17</td><td align="left" valign="top">Costs and compensation: describes any financial payments (or costs) to study participants.</td><td align="left" valign="top">Yes=1/No=0</td></tr><tr><td align="left" valign="top" colspan="4">Actionability</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">18</td><td align="left" valign="top">Consent process: describes the process by which the reader gives his or her consent by signing a document, verbal agreement, via computer, or other.</td><td align="left" valign="top">Yes=1/No=0</td></tr></tbody></table></table-wrap></sec><sec id="s2-8"><title>Statistical Analysis</title><p>We reported mean accuracy and completeness scores for human-generated and LLM-generated ICF key information sections. We compared the mean accuracy and completeness scores of human-generated and LLM-generated ICF key information sections using Wilcoxon rank sum tests. Furthermore, we compared the RUAKI indicators between the 2 groups using the Wilcoxon rank sum test. Moreover, we measured the intraclass correlation coefficient (ICC) to assess the consistency among raters. An ICC below 0.5 indicates low reliability, between 0.5 and 0.74 indicates moderate reliability, from 0.75 to 0.9 suggests good reliability, and above 0.9 signifies excellent reliability [<xref ref-type="bibr" rid="ref26">26</xref>].</p></sec><sec id="s2-9"><title>Ethical Considerations</title><p>This study qualifies as nonhuman subjects research under applicable institutional and regulatory guidelines, as it exclusively involved evaluators who are also coauthors of this work. No external participants were involved, and no identifiable private information was collected, analyzed, or shared. 
Consequently, this work did not require review or approval from an IRB.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><p>The accuracy and completeness of the LLM- and human-generated outputs were comparable across key sections of the ICFs (<xref ref-type="table" rid="table3">Table 3</xref>). Both the LLM and human outputs achieved similar scores for conveying the study purpose (2.88 vs 2.63), with no significant difference (<italic>P</italic>=.16). For the duration and procedures, the scores were also close (2.5 vs 2.38), with no statistically significant difference (<italic>P</italic>=.56). The LLM slightly outperformed the human output in explaining the risks and discomforts (2.63 vs 2.38), but again, this difference was not statistically significant (<italic>P</italic>=.32). In terms of benefits, the LLM achieved a perfect score of 3.0 compared with the human output&#x2019;s 2.57, although this difference approached but did not reach statistical significance (<italic>P</italic>=.10). Both the LLM and human outputs were identical in discussing alternatives, scoring 2.75 (<italic>P</italic>&#x2265;.99). For the overall impression, the LLM scored 2.63 compared with the human output&#x2019;s 2.31, with no statistically significant difference (<italic>P</italic>=.32). 
Overall, both outputs displayed comparable levels of performance across these key sections.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Mean accuracy and completeness scores for human and large language model evaluations across key informed consent sections.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">LLM output, mean score (SD)</td><td align="left" valign="bottom">Human output, mean score (SD)</td><td align="left" valign="bottom">Wilcoxon rank sum tests, <italic>P</italic> value</td></tr></thead><tbody><tr><td align="left" valign="top">Study purpose</td><td align="left" valign="top">2.88 (0.35)</td><td align="left" valign="top">2.63 (0.52)</td><td align="left" valign="top">.16</td></tr><tr><td align="left" valign="top">Duration and procedures</td><td align="left" valign="top">2.5 (0.53)</td><td align="left" valign="top">2.38 (0.52)</td><td align="left" valign="top">.56</td></tr><tr><td align="left" valign="top">Risks and discomforts</td><td align="left" valign="top">2.63 (0.52)</td><td align="left" valign="top">2.38 (0.52)</td><td align="left" valign="top">.32</td></tr><tr><td align="left" valign="top">Benefits</td><td align="left" valign="top">3 (0)</td><td align="left" valign="top">2.57 (0.79)</td><td align="left" valign="top">.10</td></tr><tr><td align="left" valign="top">Alternatives</td><td align="left" valign="top">2.75 (0.46)</td><td align="left" valign="top">2.75 (0.46)</td><td align="left" valign="top">&#x2265;.99</td></tr><tr><td align="left" valign="top">Overall impression</td><td align="left" valign="top">2.63 (0.52)</td><td align="left" valign="top">2.31 (0.59)</td><td align="left" valign="top">.32</td></tr></tbody></table></table-wrap><p>The comparison of Mean RUAKI scores for ICF key information generated by LLMs versus human output reveals that the LLM consistently outperforms human-generated content in critical areas 
(<xref ref-type="fig" rid="figure1">Figure 1</xref>). Although both the LLM and human outputs achieved relatively high readability scores, with the LLM slightly ahead (76.39% vs 66.67%), this difference was not statistically significant (<italic>P</italic>=.26). The LLM demonstrated significantly better understandability, scoring 90.63% compared with the human score of 67.19%, with a statistically significant <italic>P</italic> value of .015. Moreover, the LLM consistently included an actionable next step at the end of the document, a crucial element that the human output failed to provide, as evidenced by the LLM&#x2019;s perfect actionability score of 100% compared with 0% for the human output. Overall, the LLM&#x2019;s content achieved a significantly higher combined score (84.03% vs 61.82%), with a statistically significant <italic>P</italic> value of .008, demonstrating that LLM-generated text is generally more effective in producing ICF key information sections that are not only easier to read but also more understandable and actionable for participants.</p><p>While both the LLM- and human-generated ICFs exhibited similar grade levels, the LLM generated content at a slightly lower grade level (7.95 vs 8.375), indicating that it is easier to read and better aligned with the recommended reading level for general audiences. However, this difference was not statistically significant (<italic>P</italic>=.77), suggesting comparable readability between the two. Nonetheless, the LLM&#x2019;s content remains closer to the target readability level, providing a subtle advantage in ensuring accessibility for a wider audience.</p><p>The ICC score for the average ratings across the raters was found to be 0.83 (95% CI 0.64-1.03). 
According to the general interpretation guidelines for ICC values, this score indicates good reliability.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Comparison of AI- and human-generated informed consent form performance: mean Readability, Understandability, and Actionability of Key Information scores with CIs and Wilcoxon signed rank test <italic>P</italic> values for readability, understandability, actionability, and Flesch-Kincaid grade level. AI: artificial intelligence.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e68139_fig01.png"/></fig></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Results</title><p>This study evaluated the performance of the Mistral 8x22B LLM in generating key information sections for ICFs in clinical trials. The comparison between LLM-generated content and human-generated ICFs revealed that LLMs demonstrate considerable potential for improving the efficiency, readability, and actionability of ICFs, while maintaining comparable accuracy and completeness across most assessed categories.</p><sec id="s4-1-1"><title>Accuracy and Completeness</title><p>The LLM-generated ICFs achieved comparable performance to human-generated content across most areas. Both LLM and human outputs were similar in conveying the purpose of the study. They also performed equally well in describing the duration and procedures. While both outputs scored identically in discussing alternatives, the LLM performed better in explaining the benefits. The overall impression score favored the LLM slightly. These results suggest that while the LLM performs similarly to humans in most areas, further refinement in prompt engineering may be required to improve its performance in more complex sections, such as alternatives. 
With additional fine-tuning, LLMs could potentially match or exceed the quality of human-generated ICFs across all categories.</p></sec><sec id="s4-1-2"><title>Readability</title><p>The LLM outperformed human-generated ICFs in readability, as demonstrated by higher RUAKI scores. Both LLM- and human-generated ICFs exhibited good readability according to the Flesch-Kincaid grade level, but the LLM achieved a lower average grade level, reflecting superior readability and closer alignment with institutional requirements for eighth-grade reading level content. This highlights the LLM&#x2019;s strength in tackling one of the primary challenges of ICF creation: producing documents that are both comprehensive and easily understood by a general audience. Given that many ICFs exceed the recommended reading level, the LLM&#x2019;s consistent ability to generate readable content is a significant advantage, ensuring accessibility without sacrificing detail.</p></sec><sec id="s4-1-3"><title>Understandability and Actionability</title><p>The LLM significantly outperformed human-generated ICFs in both understandability and actionability, as reflected in the higher RUAKI scores. The LLM&#x2019;s output was not only more comprehensible but also consistently included actionable next steps, a critical component that was often missing from the human-generated content. The perfect actionability score of the LLM-generated ICFs suggests that these models can enhance participant comprehension and facilitate informed decision-making. These findings demonstrate the potential of LLMs to create ICFs that are not only easier to read but also more effective in guiding participants through the consent process.</p></sec><sec id="s4-1-4"><title>Rater Consistency</title><p>The ICC of 0.83 indicates a good level of agreement among raters, reflecting the reliability of the evaluation process. 
The narrow CI further supports the robustness of these ratings, ensuring the consistency and validity of the results across different protocols.</p></sec></sec><sec id="s4-2"><title>Lessons Learned</title><sec id="s4-2-1"><title>Lesson 1: The Importance of Precise Temperature Settings</title><p>For all LLMs, the model temperature parameter controls the diversity of responses. A higher temperature, such as 0.8, produces more varied answers, while a lower temperature, such as 0.2, results in more focused and deterministic outputs. In our experiments, we found that setting the temperature to 0 was the most effective choice for this task. Temperatures introduce a level of randomness that can lead to hallucinations&#x2014;unwanted deviations from the source material. Since our goal is to extract information directly from the research protocol and treat it as the sole source of truth, it is essential to minimize any creative output from the LLM. Although a temperature of 0.2 is already fairly focused [<xref ref-type="bibr" rid="ref27">27</xref>,<xref ref-type="bibr" rid="ref28">28</xref>], the need for absolute accuracy in clinical trial documents led us to set the temperature to 0, ensuring that the content remains strictly aligned with the provided data.</p></sec><sec id="s4-2-2"><title>Lesson 2: Addressing Readability Challenges With Cross-Model Few-Shot Prompting</title><p>It was observed that Mistral struggled to generate content at a Flesch-Kincaid grade level of 8 when prompted directly with instructions such as, &#x201C;The content should meet literacy standards, specifically an 8th-grade reading level or lower.&#x201D; To overcome this challenge, unlike the other zero-shot prompts used in this project, additional prompts and a technique known as &#x201C;few-shot training&#x201D; [<xref ref-type="bibr" rid="ref29">29</xref>] were introduced. 
This involved providing the model with examples of text at Flesch-Kincaid grade levels both below 8 and above 9, helping to guide the model in producing content at the desired reading level. These examples, generated using ChatGPT 4, were incorporated (as shown in Supplementary 3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>), enabling Mistral to produce content more consistently at the desired grade level of 8 or below. This approach, known as cross-model few-shot prompting, involves using one model to generate examples (or &#x201C;shots&#x201D;) that are subsequently fed into another model to enhance its performance on a specific task. It is essential to apply this step after any content-editing prompts are used, as further content edits could inadvertently raise the reading level above the target. Format editing should be the final step in the process, as any subsequent prompts could alter the formatting.</p></sec><sec id="s4-2-3"><title>Lesson 3: Least-to-Most Prompt Engineering</title><p>Effective prompt engineering involves breaking down tasks into manageable steps. When LLMs are given multiple instructions in a single prompt, they often struggle to follow all directions accurately. By adopting a Least-to-Most approach [<xref ref-type="bibr" rid="ref30">30</xref>]&#x2014;where each prompt contains a focused set of instructions and builds on the previous output&#x2014;we achieved more consistent and reliable results. This is analogous to how clinical workflows are built iteratively in electronic health record (EHR) systems to ensure accuracy in decision-making. Much like building clinical templates or order sets, prompt engineering ensures that each phase of content generation is guided to avoid ambiguity, ensuring accuracy and relevance to the context of informed consent. 
For example, if we were building a medication alert in an EHR, breaking the alert logic into separate steps&#x2014;from checking allergies to suggesting alternatives&#x2014;ensures clarity and avoids overwhelming the user. Similarly, breaking down the prompt for generating ICF sections helps the LLM focus on retrieving the right information from the protocol. This method involves 2 stages: first, decomposing a complex problem into a series of simpler subproblems and then sequentially solving these subproblems, with each solution informed by the answers to the previous ones. By guiding the LLM to work incrementally, we not only enhanced its accuracy but also ensured that human oversight remained integral to the process, leading to optimal outcomes and greater control over the final product. For instance, one reviewer noted that the LLM-generated procedures were missing some procedure information. This issue likely stemmed from the lengthy and poorly structured procedure section in the original research protocol. To address this, we designed a workflow in which we first asked the LLM to summarize the study&#x2019;s procedures and timeline. We then instructed the LLM to extract the necessary information from this summarized output. This step-by-step Least-to-Most approach allowed us to successfully extract the missing information and integrate it into the key information.</p></sec><sec id="s4-2-4"><title>Lesson 4: Real-World Application and Integration Challenges</title><p>Implementing LLMs in clinical workflows requires more than just improving readability or accuracy&#x2014;it necessitates seamless integration with existing clinical systems and processes, such as EHRs and IRB workflows. For LLMs to have a real-world impact, models need to be adaptable to diverse clinical environments and meet regulatory and ethical standards. 
One way to ensure this is by developing interfaces that allow researchers to fine-tune LLM outputs while ensuring compliance with clinical trial guidelines.</p></sec><sec id="s4-2-5"><title>Lesson 5: The Need for Detailed Source Material and Human Oversight</title><p>In one research protocol, a reviewer found that the human-generated ICF was more accurate in detailing risks and discomforts compared with the LLM-generated version. This discrepancy was evident in the omission of common discomforts associated with COVID-19 tests in the LLM-generated ICF, which occurred because this information was not included in the original research protocol. This underscores a key lesson: &#x201C;garbage in, garbage out.&#x201D; For LLMs to produce a comprehensive and accurate ICF, the original research protocol must be thorough and detailed. Furthermore, this finding highlights the importance of having a human-in-the-loop to review and refine the output from LLMs. While LLMs can significantly reduce the effort required to create an ICF&#x2014;potentially saving up to 90% of the work&#x2014;the final product still benefits from human oversight. For example, after generating a well-structured ICF key information section with the LLM, researchers can easily tweak the content to better suit their specific audience. One reviewer noted that the LLM-generated ICF had a more technical and clinical tone than the human-generated version. By having researchers customize the LLM-generated content, the ICF can be tailored to the study audience, while most of the heavy lifting has been accomplished by the LLM.</p></sec><sec id="s4-2-6"><title>Lesson 6: Ethical and Regulatory Considerations for LLM-Generated Content</title><p>With the increasing role of LLMs in generating participant-facing documents, ethical and regulatory concerns must be addressed. Key considerations include ensuring that AI-generated ICFs do not inadvertently introduce bias or misinformation. 
Furthermore, as LLMs take on more responsibility in clinical settings, regulatory bodies may need to establish guidelines to govern their use. These guidelines could include stipulations on the necessity of human oversight to verify that LLM-generated content is accurate, participant-friendly, and compliant with ethical standards for informed consent.</p></sec><sec id="s4-2-7"><title>Lesson 7: Techniques for Structured and Accurate Information Extraction</title><p>By using effective prompt engineering strategies [<xref ref-type="bibr" rid="ref31">31</xref>,<xref ref-type="bibr" rid="ref32">32</xref>] and crafting prompts that were precisely focused on extracting information from the research protocol, we were able to generate a well-structured and neatly formatted output. Key components&#x2014;such as the introduction, study purpose, procedures, risks and discomforts, benefits, alternatives, cost and compensation, and consent process&#x2014;were clearly delineated with headers, which improved the organization of the document and made it easier to locate specific information. When crafting the initial prompt to extract key information, we used a combination of techniques:</p><list list-type="order"><list-item><p>Delimiter usage: Delimiters like ### and [] were used to clearly define boundaries between different sections of the text.</p></list-item><list-item><p>Role-playing: Assigning the LLM a specific role, such as &#x201C;As a Clinical Trial Informed Consent Writer,&#x201D; provided contextual guidance, resulting in improved performance by making the model&#x2019;s responses more relevant and focused.</p></list-item></list></sec><sec id="s4-2-8"><title>Lesson 8: Addressing Practical Benefits and Cost-Savings</title><p>Beyond improving the quality of ICFs, LLMs offer the potential to reduce operational costs and administrative burdens in clinical trials. 
Automating the creation of ICFs can significantly cut down the time spent on document preparation while maintaining compliance with regulatory standards. By quantifying these savings&#x2014;such as estimating the reduction in hours spent on ICF creation&#x2014;future studies could further demonstrate the tangible benefits of incorporating LLMs into clinical workflows.</p></sec><sec id="s4-2-9"><title>Lesson 9: Anticipating Future Advances</title><p>As of this publication, GPT-4o mini [<xref ref-type="bibr" rid="ref33">33</xref>], Mistral Large 2 [<xref ref-type="bibr" rid="ref34">34</xref>], and Meta Llama 3.1 [<xref ref-type="bibr" rid="ref35">35</xref>] have been released, each featuring an expanded context window of 128k tokens, making them ideal for these tasks. However, they were not available during the development phase. While 64k tokens are sufficient for handling most clinical research protocols, for more extensive content, these new models would be preferable. That said, the prompts are compatible with all of these models.</p></sec></sec><sec id="s4-3"><title>Implications for Practice</title><p>The use of LLMs to generate ICFs offers considerable potential for streamlining the informed consent process. By producing more readable, understandable, and actionable content, LLMs can enhance participant comprehension and engagement, potentially improving recruitment and retention in clinical trials. Furthermore, the time savings associated with automated ICF generation can reduce the workload on researchers while ensuring that ICFs remain aligned with regulatory standards for readability and content clarity.</p></sec><sec id="s4-4"><title>Limitations and Future Directions</title><p>This study&#x2019;s findings must be interpreted in light of certain limitations. The LLM&#x2019;s performance is closely tied to the quality, clarity, and completeness of the input research protocol. 
Ambiguities or inconsistencies in the research protocols can hinder the model&#x2019;s ability to capture all relevant details to generate accurate and comprehensive ICFs. Future research should focus on improving the clarity of source materials and refining prompt-engineering strategies to optimize LLM performance, particularly in more complex sections such as study procedures.</p><p>To address these challenges with procedural details, we used a targeted prompt-engineering approach, which involved having the LLM first summarize the study&#x2019;s procedures and timeline, and then extract specific details from the summarized text. This method improved accuracy, but ongoing refinement of these strategies is needed to enhance the LLM&#x2019;s ability to process complex and lengthy sections more effectively.</p><p>Another limitation relates to the potential recognizability of LLM-generated text. Although evaluators were blinded to the source of the ICFs and presented with human- and LLM-generated documents in a randomized order, the distinctive textual style of LLM outputs&#x2014;characterized by active voice, well-organized structure, simplified language, and consistent adherence to readability guidelines&#x2014;may have inadvertently revealed their origin. This recognizability could introduce subconscious bias into the evaluation process. To address this in future studies, we plan to use text obfuscation techniques, such as paraphrasing or reformatting outputs, to minimize stylistic differences and ensure true blinding. This approach will help strengthen the validity of future comparisons.</p><p>Another important limitation of this study is the small sample size, which consisted of only 4 clinical trial protocols. While these protocols provided a useful test bed, the relatively small sample size limits the generalizability of the findings. 
Future studies should incorporate a broader range of clinical trials from diverse therapeutic areas, phases, and levels of complexity to fully validate the model&#x2019;s performance. Furthermore, the limited sample size may have contributed to some statistically nonsignificant results, such as those related to procedural details or study alternatives. A larger sample would provide greater statistical power, enabling the detection of smaller but practically significant differences between human- and LLM-generated ICFs.</p><p>A larger sample would also better capture the diversity of challenges involved in ICF creation, such as variations in regulatory requirements, medical procedure complexity, and considerations for vulnerable populations. Future research should aim to assess the LLM&#x2019;s robustness and adaptability across a wider array of clinical trial contexts.</p><p>The process of designing and refining prompts, as well as generating the AI-generated ICFs, required a moderate time investment during initial development. However, leveraging the existing prompts and lessons learned in this study would enable future users to complete the process more efficiently and at reduced cost, enhancing the scalability of this approach.</p><p>The 0% actionability score for human-generated ICFs reflects a structural issue rather than a methodological flaw. Most institutions do not include explicit actionable sections in the key information portions of their templates. Updating these templates to include actionable instructions would likely improve scores significantly. This highlights a strength of LLM-generated ICFs, which inherently include actionable elements, enhancing the clarity and use of consent forms.</p><p>While the LLM demonstrated strong performance, there were instances where it missed details related to the duration and procedural elements of the study. 
This likely stems from 2 primary challenges: ambiguous or inconsistent presentation of these sections in the research protocols and the verbosity of the text, which can hinder the LLM&#x2019;s ability to process details efficiently. Our targeted approach of summarizing and then extracting procedural details helped address this issue, but further enhancements are needed to ensure that the LLM can consistently handle such challenges.</p><p>Looking ahead, our next goal is to automate the creation of entire ICFs directly from research protocols. This would significantly reduce the time and effort required for ICF development while maintaining consistency and quality. While this study highlights the potential for human oversight to address issues in LLM-generated content, our future research will aim to quantify the time and effort required for revisions to better assess the practical efficiency gains of integrating these models into clinical workflows. We also plan to explore the LLM&#x2019;s multilingual capabilities to generate ICFs in multiple languages, broadening the recruitment base and promoting diversity and equity in clinical trials. Ensuring that non&#x2013;English-speaking participants receive ICFs that are as readable and understandable as those in English is crucial for improving inclusivity and representation in research. Future work should also prioritize a thorough examination of ethical concerns, including potential biases in AI-generated content, the need for transparency in AI decision-making processes, and the legal implications of deploying LLMs in clinical trial workflows, to ensure that these tools are implemented responsibly and equitably.</p></sec><sec id="s4-5"><title>Conclusions</title><p>This study highlights the potential of LLMs to improve the efficiency and quality of ICF generation in clinical trials. 
While human oversight remains necessary to ensure accuracy in complex sections, and the findings are constrained by the small dataset and evaluation of a single LLM model, LLMs demonstrated potential advantages in producing more readable, understandable, and actionable ICF content. As LLM technology continues to evolve, it holds the promise of further enhancing the informed consent process by facilitating the creation of ICFs that are both participant-friendly and compliant with regulatory standards, thereby improving ethical conduct in clinical research.</p></sec></sec></body><back><ack><p>Research reported in this publication was supported by the National Center for Advancing Translational Sciences of the National Institutes of Health under award number UL1-TR001453. However, the content is solely the authors' responsibility and does not necessarily represent the official views of the National Institutes of Health.</p></ack><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">EHR</term><def><p>electronic health record</p></def></def-item><def-item><term id="abb3">ICC</term><def><p>intraclass correlation coefficient</p></def></def-item><def-item><term id="abb4">ICF</term><def><p>informed consent form</p></def></def-item><def-item><term id="abb5">IRB</term><def><p>institutional review board</p></def></def-item><def-item><term id="abb6">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb7">RUAKI</term><def><p>Readability, Understandability, and Actionability of Key Information</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="web"><article-title>Part 46: protection of human subjects</article-title><source>Code of Federal 
Regulations</source><year>2018</year><month>07</month><day>19</day><access-date>2025-01-21</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.ecfr.gov/on/2018-07-19/title-45/subtitle-A/subchapter-A/part-46">https://www.ecfr.gov/on/2018-07-19/title-45/subtitle-A/subchapter-A/part-46</ext-link></comment></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tamariz</surname><given-names>L</given-names> </name><name name-style="western"><surname>Palacio</surname><given-names>A</given-names> </name><name name-style="western"><surname>Robert</surname><given-names>M</given-names> </name><name name-style="western"><surname>Marcus</surname><given-names>EN</given-names> </name></person-group><article-title>Improving the informed consent process for research subjects with low literacy: a systematic review</article-title><source>J Gen Intern Med</source><year>2013</year><month>01</month><volume>28</volume><issue>1</issue><fpage>121</fpage><lpage>126</lpage><pub-id pub-id-type="doi">10.1007/s11606-012-2133-2</pub-id><pub-id pub-id-type="medline">22782275</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Emanuel</surname><given-names>EJ</given-names> </name><name name-style="western"><surname>Boyle</surname><given-names>CW</given-names> </name></person-group><article-title>Assessment of length and readability of informed consent documents for COVID-19 vaccine trials</article-title><source>JAMA Netw Open</source><year>2021</year><month>04</month><day>1</day><volume>4</volume><issue>4</issue><fpage>e2110843</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2021.10843</pub-id><pub-id pub-id-type="medline">33909052</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Grant</surname><given-names>SC</given-names> </name></person-group><article-title>Informed consent&#x2014;we can and should do better</article-title><source>JAMA Netw Open</source><year>2021</year><month>04</month><day>1</day><volume>4</volume><issue>4</issue><fpage>e2110848</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2021.10848</pub-id><pub-id pub-id-type="medline">33909058</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hadden</surname><given-names>KB</given-names> </name><name name-style="western"><surname>Prince</surname><given-names>LY</given-names> </name><name name-style="western"><surname>Moore</surname><given-names>TD</given-names> </name><name name-style="western"><surname>James</surname><given-names>LP</given-names> </name><name name-style="western"><surname>Holland</surname><given-names>JR</given-names> </name><name name-style="western"><surname>Trudeau</surname><given-names>CR</given-names> </name></person-group><article-title>Improving readability of informed consents for research at an academic medical institution</article-title><source>J Clin Trans Sci</source><year>2017</year><month>12</month><volume>1</volume><issue>6</issue><fpage>361</fpage><lpage>365</lpage><pub-id pub-id-type="doi">10.1017/cts.2017.312</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Larson</surname><given-names>E</given-names> </name><name name-style="western"><surname>Foe</surname><given-names>G</given-names> </name><name name-style="western"><surname>Lally</surname><given-names>R</given-names> </name></person-group><article-title>Reading level and length of written research consent 
forms</article-title><source>Clin Transl Sci</source><year>2015</year><month>08</month><volume>8</volume><issue>4</issue><fpage>355</fpage><lpage>356</lpage><pub-id pub-id-type="doi">10.1111/cts.12253</pub-id><pub-id pub-id-type="medline">25580939</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sherlock</surname><given-names>A</given-names> </name><name name-style="western"><surname>Brownie</surname><given-names>S</given-names> </name></person-group><article-title>Patients&#x2019; recollection and understanding of informed consent: a literature review</article-title><source>ANZ J Surg</source><year>2014</year><month>04</month><volume>84</volume><issue>4</issue><fpage>207</fpage><lpage>210</lpage><pub-id pub-id-type="doi">10.1111/ans.12555</pub-id><pub-id pub-id-type="medline">24812707</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Malik</surname><given-names>L</given-names> </name><name name-style="western"><surname>Kuo</surname><given-names>J</given-names> </name><name name-style="western"><surname>Yip</surname><given-names>D</given-names> </name><name name-style="western"><surname>Mejia</surname><given-names>A</given-names> </name></person-group><article-title>How well informed is the informed consent for cancer clinical trials?</article-title><source>Clin Trials</source><year>2014</year><month>12</month><volume>11</volume><issue>6</issue><fpage>686</fpage><lpage>688</lpage><pub-id pub-id-type="doi">10.1177/1740774514548734</pub-id><pub-id pub-id-type="medline">25135910</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zai</surname><given-names>AH</given-names> </name><name 
name-style="western"><surname>Faro</surname><given-names>JM</given-names> </name><name name-style="western"><surname>Allison</surname><given-names>J</given-names> </name></person-group><article-title>Unveiling readability challenges: an extensive analysis of consent document accessibility in clinical trials</article-title><source>J Clin Transl Sci</source><year>2024</year><volume>8</volume><issue>1</issue><fpage>e125</fpage><pub-id pub-id-type="doi">10.1017/cts.2024.595</pub-id><pub-id pub-id-type="medline">39345692</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="web"><person-group person-group-type="author"><collab>Office for Human Research Protections</collab></person-group><article-title>Attachment C&#x2014;new &#x201C;key information&#x201D; informed consent requirements</article-title><source>US Department of Health and Human Services</source><year>2018</year><access-date>2025-01-31</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.hhs.gov/ohrp/sachrp-committee/recommendations/attachment-c-november-13-2018/index.html">https://www.hhs.gov/ohrp/sachrp-committee/recommendations/attachment-c-november-13-2018/index.html</ext-link></comment></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Du</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Improving large language models for clinical named entity recognition via prompt engineering</article-title><source>J Am Med Inform Assoc</source><year>2024</year><month>09</month><day>1</day><volume>31</volume><issue>9</issue><fpage>1812</fpage><lpage>1820</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocad259</pub-id><pub-id 
pub-id-type="medline">38281112</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nievas</surname><given-names>M</given-names> </name><name name-style="western"><surname>Basu</surname><given-names>A</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Singh</surname><given-names>H</given-names> </name></person-group><article-title>Distilling large language models for matching patients to clinical trials</article-title><source>J Am Med Inform Assoc</source><year>2024</year><month>09</month><day>1</day><volume>31</volume><issue>9</issue><fpage>1953</fpage><lpage>1963</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocae073</pub-id><pub-id pub-id-type="medline">38641416</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Unlu</surname><given-names>O</given-names> </name><name name-style="western"><surname>Shin</surname><given-names>J</given-names> </name><name name-style="western"><surname>Mailly</surname><given-names>CJ</given-names> </name><etal/></person-group><article-title>Retrieval augmented generation enabled generative pre-trained transformer 4 (GPT-4) performance for clinical trial screening</article-title><source>medRxiv</source><comment>Preprint posted online on  Feb 8, 2024</comment><pub-id pub-id-type="doi">10.1101/2024.02.08.24302376</pub-id><pub-id pub-id-type="medline">38370719</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>S</given-names> </name><name name-style="western"><surname>Li</surname><given-names>Y</given-names> </name><name 
name-style="western"><surname>Lu</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Evaluating the ChatGPT family of models for biomedical reasoning and classification</article-title><source>J Am Med Inform Assoc</source><year>2024</year><month>04</month><day>3</day><volume>31</volume><issue>4</issue><fpage>940</fpage><lpage>948</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocad256</pub-id><pub-id pub-id-type="medline">38261400</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Glicksberg</surname><given-names>BS</given-names> </name><name name-style="western"><surname>Timsina</surname><given-names>P</given-names> </name><name name-style="western"><surname>Patel</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Evaluating the accuracy of a state-of-the-art large language model for prediction of admissions from the emergency room</article-title><source>J Am Med Inform Assoc</source><year>2024</year><month>09</month><day>1</day><volume>31</volume><issue>9</issue><fpage>1921</fpage><lpage>1928</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocae103</pub-id><pub-id pub-id-type="medline">38771093</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tripathi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Sukumaran</surname><given-names>R</given-names> </name><name name-style="western"><surname>Cook</surname><given-names>TS</given-names> </name></person-group><article-title>Efficient healthcare with large language models: optimizing clinical workflow and enhancing patient care</article-title><source>J Am Med Inform 
Assoc</source><year>2024</year><month>05</month><day>20</day><volume>31</volume><issue>6</issue><fpage>1436</fpage><lpage>1440</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocad258</pub-id><pub-id pub-id-type="medline">38273739</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Decker</surname><given-names>H</given-names> </name><name name-style="western"><surname>Trang</surname><given-names>K</given-names> </name><name name-style="western"><surname>Ramirez</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Large language model-based chatbot vs surgeon-generated informed consent documentation for common procedures</article-title><source>JAMA Netw Open</source><year>2023</year><month>10</month><day>2</day><volume>6</volume><issue>10</issue><fpage>e2336997</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2023.36997</pub-id><pub-id pub-id-type="medline">37812419</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="web"><source>Mistral AI</source><year>2024</year><access-date>2025-01-21</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://mistral.ai">https://mistral.ai</ext-link></comment></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="web"><article-title>Cheaper, better, faster, stronger</article-title><source>Mistral AI</source><year>2024</year><month>04</month><day>17</day><access-date>2025-01-22</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://mistral.ai/news/mixtral-8x22b">https://mistral.ai/news/mixtral-8x22b</ext-link></comment></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Wiggins</surname><given-names>G</given-names> </name><name 
name-style="western"><surname>McTighe</surname><given-names>J</given-names> </name></person-group><source>Understanding by Design</source><year>2005</year><edition>2</edition><publisher-name>Association for Supervision and Curriculum Development (ASCD)</publisher-name><pub-id pub-id-type="other">1416600353</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="web"><article-title>Quick Safety 21: informed consent: more than getting a signature</article-title><source>The Joint Commission</source><year>2023</year><access-date>2025-01-31</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.jointcommission.org/resources/news-and-multimedia/newsletters/newsletters/quick-safety/quick-safety--issue-21-informed--consent-more-than-getting-a-signature/informed-consent-more-than-getting-a-signature/">https://www.jointcommission.org/resources/news-and-multimedia/newsletters/newsletters/quick-safety/quick-safety--issue-21-informed--consent-more-than-getting-a-signature/informed-consent-more-than-getting-a-signature/</ext-link></comment></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="web"><article-title>Informed consent</article-title><source>Leapfrog Ratings</source><access-date>2025-01-31</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://ratings.leapfroggroup.org/measure/hospital/2023/informed-consent">https://ratings.leapfroggroup.org/measure/hospital/2023/informed-consent</ext-link></comment></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Samaan</surname><given-names>JS</given-names> </name><name name-style="western"><surname>Yeo</surname><given-names>YH</given-names> </name><name name-style="western"><surname>Rajeev</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Assessing the accuracy of responses by the 
language model ChatGPT to questions regarding bariatric surgery</article-title><source>Obes Surg</source><year>2023</year><month>06</month><volume>33</volume><issue>6</issue><fpage>1790</fpage><lpage>1796</lpage><pub-id pub-id-type="doi">10.1007/s11695-023-06603-5</pub-id><pub-id pub-id-type="medline">37106269</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="web"><article-title>Informed consent</article-title><source>American College of Surgeons</source><year>2023</year><month>07</month><access-date>2025-01-22</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.facs.org/for-patients/patient-resources/informed-consent">https://www.facs.org/for-patients/patient-resources/informed-consent</ext-link></comment></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kurtz-Rossi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Okonkwo</surname><given-names>IA</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Development of a new tool for writing research key information in plain language</article-title><source>Health Lit Res Pract</source><year>2024</year><month>01</month><volume>8</volume><issue>1</issue><fpage>e30</fpage><lpage>e37</lpage><pub-id pub-id-type="doi">10.3928/24748307-20240218-01</pub-id><pub-id pub-id-type="medline">38466225</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Koo</surname><given-names>TK</given-names> </name><name name-style="western"><surname>Li</surname><given-names>MY</given-names> </name></person-group><article-title>A guideline of selecting and reporting intraclass correlation coefficients for reliability 
research</article-title><source>J Chiropr Med</source><year>2016</year><month>06</month><volume>15</volume><issue>2</issue><fpage>155</fpage><lpage>163</lpage><pub-id pub-id-type="doi">10.1016/j.jcm.2016.02.012</pub-id><pub-id pub-id-type="medline">27330520</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="web"><article-title>Mistral AI API (002)</article-title><source>Mistral AI</source><year>2024</year><month>05</month><day>14</day><access-date>2025-01-21</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://docs.mistral.ai/api">https://docs.mistral.ai/api</ext-link></comment></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="web"><article-title>How should I set the temperature parameter?</article-title><source>OpenAI Platform</source><year>2024</year><month>08</month><day>6</day><access-date>2025-01-21</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://platform.openai.com/docs/guides/text-generation/how-should-i-set-the-temperature-parameter">https://platform.openai.com/docs/guides/text-generation/how-should-i-set-the-temperature-parameter</ext-link></comment></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Brown</surname><given-names>TB</given-names> </name><name name-style="western"><surname>Mann</surname><given-names>B</given-names> </name><name name-style="western"><surname>Ryder</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Language models are few-shot learners</article-title><source>arXiv</source><comment>Preprint posted online on  May 28, 2020</comment><pub-id pub-id-type="doi">10.48550/arXiv.2005.14165</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name 
name-style="western"><surname>Zhou</surname><given-names>D</given-names> </name><name name-style="western"><surname>Sch&#x00E4;rli</surname><given-names>N</given-names> </name><name name-style="western"><surname>Hou</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Least-to-most prompting enables complex reasoning in large language models</article-title><source>arXiv</source><comment>Preprint posted online on  May 21, 2022</comment><pub-id pub-id-type="doi">10.48550/arXiv.2205.10625</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="web"><article-title>Six strategies for getting better results</article-title><source>OpenAI Platform</source><access-date>2025-01-21</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://platform.openai.com/docs/guides/prompt-engineering/six-strategies-for-getting-better-results">https://platform.openai.com/docs/guides/prompt-engineering/six-strategies-for-getting-better-results</ext-link></comment></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="web"><article-title>Prompting capabilities</article-title><source>Mistral AI</source><access-date>2025-01-21</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://docs.mistral.ai/guides/prompting_capabilities">https://docs.mistral.ai/guides/prompting_capabilities</ext-link></comment></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="web"><article-title>GPT-4o mini: advancing cost-efficient intelligence</article-title><source>OpenAI</source><year>2024</year><access-date>2025-01-21</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence">https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence</ext-link></comment></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="web"><article-title>Mistral 
large 2</article-title><source>Mistral AI</source><year>2024</year><month>07</month><day>24</day><access-date>2025-01-21</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://mistral.ai/news/mistral-large-2407">https://mistral.ai/news/mistral-large-2407</ext-link></comment></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="web"><article-title>Introducing Llama 3.1: our most capable models to date</article-title><source>Meta</source><year>2024</year><access-date>2025-01-21</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://ai.meta.com/blog/meta-llama-3-1">https://ai.meta.com/blog/meta-llama-3-1</ext-link></comment></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Prompt details.</p><media xlink:href="medinform_v13i1e68139_app1.docx" xlink:title="DOCX File, 18 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Form for grading scale of informed consent key information sections.</p><media xlink:href="medinform_v13i1e68139_app2.docx" xlink:title="DOCX File, 14 KB"/></supplementary-material></app-group></back></article>