<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id><journal-id journal-id-type="publisher-id">medinform</journal-id><journal-id journal-id-type="index">7</journal-id><journal-title>JMIR Medical Informatics</journal-title><abbrev-journal-title>JMIR Med Inform</abbrev-journal-title><issn pub-type="epub">2291-9694</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v14i1e82545</article-id><article-id pub-id-type="doi">10.2196/82545</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Clinical Note Generation From Doctor-Patient Conversations Using Parameter-Efficient Fine-Tuning Large Language Models: Comparative Study</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Ahmed</surname><given-names>Saib</given-names></name><degrees>BSc, MSc</degrees><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author"><name name-style="western"><surname>Yousuf Sadeque</surname><given-names>Farig</given-names></name><degrees>BSc, PhD</degrees><xref ref-type="aff" rid="aff1"/></contrib></contrib-group><aff id="aff1"><institution>Department of Computer Science &#x0026; Engineering, BRAC University</institution><addr-line>Kha 224 Pragati Sarani, Merul Badda</addr-line><addr-line>Dhaka</addr-line><country>Bangladesh</country></aff><contrib-group><contrib contrib-type="editor"><name 
name-style="western"><surname>Coristine</surname><given-names>Andrew</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Gupta</surname><given-names>Anup</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Triep</surname><given-names>Karen</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Usuemerai</surname><given-names>Precious</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Wang</surname><given-names>Rui-qi</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Zhan</surname><given-names>Zaifu</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Saib Ahmed, BSc, MSc, Department of Computer Science &#x0026; Engineering, BRAC University, Kha 224 Pragati Sarani, Merul Badda, Dhaka, 1212, Bangladesh, 880 1796965173; <email>saibahmed7@gmail.com</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>3</day><month>6</month><year>2026</year></pub-date><volume>14</volume><elocation-id>e82545</elocation-id><history><date date-type="received"><day>17</day><month>08</month><year>2025</year></date><date date-type="rev-recd"><day>03</day><month>03</month><year>2026</year></date><date date-type="accepted"><day>13</day><month>03</month><year>2026</year></date></history><copyright-statement>&#x00A9; Saib Ahmed, Farig Yousuf Sadeque. Originally published in JMIR Medical Informatics (<ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org">https://medinform.jmir.org</ext-link>), 3.6.2026. 
</copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org/">https://medinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://medinform.jmir.org/2026/1/e82545"/><abstract><sec><title>Background</title><p>Clinical note documentation is a vital yet time-intensive task in health care. While advancements in natural language processing have transformed many domains, generating accurate summaries of doctor-patient conversations remains underexplored due to the limited availability of open-source datasets. Large language models (LLMs), with their training on vast datasets, present a promising solution to this challenge.</p></sec><sec><title>Objective</title><p>Precision in clinical summarization is crucial, as it directly impacts patient care and safety. This study aimed to evaluate the effectiveness of parameter-efficient, fine-tuned, decoder-only LLMs for clinical note generation from doctor-patient conversations. 
We focus on assessing medical accuracy, robustness, and the feasibility of parameter-efficient fine-tuning (PEFT) approaches under practical resource constraints.</p></sec><sec sec-type="methods"><title>Methods</title><p>We used the Medical Training Summarization Dialog dataset containing 1700 doctor-patient conversations paired with clinical notes. Several decoder-only LLMs, including Mistral, Meditron, and Llama, were fine-tuned using PEFT techniques to reduce computational and memory overhead. Evaluation was performed using standard automatic metrics, including the Recall-Oriented Understudy for Gisting Evaluation score and bidirectional encoder representations from transformers score, to assess content overlap and semantic similarity between generated and reference clinical notes. In addition, an expert physician assessed the LLM-generated notes for medical accuracy, completeness, concision, relevance, and clinical coherence and readability.</p></sec><sec sec-type="results"><title>Results</title><p>Model performance was evaluated using the Recall-Oriented Understudy for Gisting Evaluation score and bidirectional encoder representations from transformers scores, demonstrating that Meditron-7B and Llama3-8B achieved state-of-the-art results among open-source, parameter-efficient, fine-tuned models, with Mistral-7B also performing competitively. The findings indicate that decoder-only LLMs, particularly Llama variants, outperform traditional models. Moreover, fine-tuning with higher quantization has the potential to further enhance performance. Human expert evaluation further indicated that Llama3-8B and Mistral-7B produced clinically coherent and accurate summaries, with Meditron-7B and Llama3-3B also performing reliably across evaluation criteria. 
The findings suggest that higher quantization during fine-tuning may improve efficiency without substantially compromising performance.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>This study underscores the potential of the PEFT of decoder-only LLMs to transform clinical workflows by streamlining medical documentation, thereby enabling health care professionals to dedicate more time to patient care. These models offer a scalable and resource-efficient alternative to traditional architectures and have the potential to streamline clinical documentation workflows.</p></sec></abstract><kwd-group><kwd>natural language processing</kwd><kwd>clinical natural language processing</kwd><kwd>clinical NLP</kwd><kwd>Dialogue2Note</kwd><kwd>transformer</kwd><kwd>decoder-only</kwd><kwd>Mistral</kwd><kwd>Llama</kwd><kwd>Meditron</kwd><kwd>summarization</kwd><kwd>Recall-Oriented Understudy for Gisting Evaluation</kwd><kwd>Recall-Oriented Understudy for Gisting Evaluation score</kwd><kwd>ROUGE score</kwd><kwd>bidirectional encoder representations from transformers</kwd><kwd>bidirectional encoder representations from transformers score</kwd><kwd>BERTScore</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><sec id="s1-1"><title>Background</title><p>Manually creating clinical notes has always been a time-consuming and exhausting task for health care providers. As health care systems grow increasingly complex and large-scale, the need for faster and more accurate documentation methods has become more pressing. Transformer architecture has brought significant advancements to various natural language processing (NLP) tasks, including text summarization&#x2014;a fundamental task in NLP. These advancements, driven by transformer-based large language models (LLMs) and the availability of large-scale datasets, have the potential to revolutionize health care systems. 
An NLP-powered system can analyze doctor-patient conversations, identify relevant clinical facts, structure the information, and generate coherent medical reports. By automating the generation of clinical notes, such systems provide timely insights and support to medical professionals during patient interactions. Real-time information retrieval ensures clinicians have immediate access to relevant medical data and patient histories, which can aid in making critical decisions. This, in turn, leads to more accurate diagnoses, personalized treatment strategies, and improved patient outcomes. Beyond individual patient interactions, the ability to analyze data at scale enables medical facilities to make data-driven decisions that enhance overall treatment quality, optimize resource use, and improve patient satisfaction. These innovations promise to streamline health care processes and elevate the standard of care.</p><p>The challenge lies in ensuring these automated notes are precise. Any mistakes or <italic>hallucinations</italic> in medical facts could have serious consequences. Summarizing clinical dialogues is tricky, but we tackled this by using decoder-only transformer models, which we found outperform traditional sequence-to-sequence models (such as Flan-T5-Large) on metrics such as Recall-Oriented Understudy for Gisting Evaluation (ROUGE) and bidirectional encoder representations from transformers (BERT) scores. 
Using the Medical Training Summarization Dialog (MTS-Dialog) dataset [<xref ref-type="bibr" rid="ref1">1</xref>] containing 1700 doctor-patient conversations and their summaries, we explored models such as Mistral [<xref ref-type="bibr" rid="ref2">2</xref>] and Llama [<xref ref-type="bibr" rid="ref3">3</xref>], with Llama3 [<xref ref-type="bibr" rid="ref4">4</xref>] emerging as the top performer, even beating the best results from the 2023 MEDIQA-Chat challenge [<xref ref-type="bibr" rid="ref5">5</xref>].</p></sec><sec id="s1-2"><title>Motivation</title><p>Manual note-taking can be time-consuming, diverting health care providers&#x2019; attention from patient care. On average, physicians dedicate approximately 52 to 102 minutes each day documenting clinical notes based on their patient interactions [<xref ref-type="bibr" rid="ref6">6</xref>]. Automatic clinical note generation can be a solution to this problem. It can reduce the burden of paperwork on health care providers and improve the accuracy of the medical records. This allows doctors to focus more on patient care rather than on paperwork. During the COVID-19 pandemic, face-to-face doctor visits were restricted. For that reason, health care systems experienced over a 100% surge in virtual urgent care appointments and more than a 4000% rise in virtual nonurgent care visits [<xref ref-type="bibr" rid="ref7">7</xref>]. Automatic clinical note generation can help us to overcome this kind of situation. Clinical notes can vary widely in terms of content, format, and quality. Automated systems can help to standardize documentation, improve data quality, and facilitate analysis. In addition, automated systems can extract valuable insights from clinical notes, enabling data-driven decision-making and improving patient care.</p></sec><sec id="s1-3"><title>Research Objective</title><p>Our research set out to fine-tune the LLMs to craft high-quality clinical notes that make a real difference. 
We aimed to find a model that balances speed and precision, fine-tuning it on MTS-Dialog to adapt to the dataset&#x2019;s unique demands. Our ambition was to go beyond the current leader, Flan-T5-Large, and establish a new benchmark for automated documentation. To measure our success, we compared our model&#x2019;s ROUGE and BERT scores against Flan-T5-Large, assessing its ability to summarize accurately while preserving every essential detail.</p></sec><sec id="s1-4"><title>Literature Review</title><p>The study by Ben Abacha et al [<xref ref-type="bibr" rid="ref1">1</xref>] introduced the MTS-Dialog dataset, pairing simulated doctor-patient dialogues with clinical notes. They tested transformer models such as Bidirectional and Auto-Regressive Transformers (BART) [<xref ref-type="bibr" rid="ref8">8</xref>] and Pegasus [<xref ref-type="bibr" rid="ref9">9</xref>], finding BART, especially when prefinetuned and guided, produced the most accurate notes. However, issues such as hallucinations and missing key details persisted, highlighting both the promise of automation in health care documentation and the need for better factual accuracy. The study by Ozler and Bethard [<xref ref-type="bibr" rid="ref10">10</xref>] explored LLMs for summarizing medical dialogues in the MEDIQA-Chat 2023 competition. Using models such as Clinical-T5 and Roberta-base on Medical Information Mart for Intensive Care datasets, they hit a peak accuracy of 72.3%. Limited hardware and dataset size posed challenges, but their work shows LLMs&#x2019; potential for medical documentation, with room for improvement through advanced models and techniques. The study by Sharma et al [<xref ref-type="bibr" rid="ref11">11</xref>] tackled the same competition, fine-tuning BART-large on datasets such as Medical Information Mart for Intensive Care-IV-Note and introducing an N-pass strategy to summarize long dialogues. 
Data augmentation (DA) with synthetic dialogues boosted results, though hallucinations remained an issue. Their research pushes clinical NLP forward, suggesting future integration of medical knowledge. The study by Wang et al [<xref ref-type="bibr" rid="ref12">12</xref>] used ChatGPT and BioMedLM in a doctor-patient loop system for MEDIQA-Chat 2023, excelling in dialogue generation and note summarization. While effective, gaps in medical knowledge and handling lengthy conversations need work, pointing to future refinements in segmentation and expertise. The study by Suri et al [<xref ref-type="bibr" rid="ref13">13</xref>] evaluated transformer models such as Bio-Bart and DialogLED for the same challenge. DialogLED-Large outperformed GPT-3, offering a cost-effective alternative despite limited training data. They recommend DA to improve reliability. The study by Tang et al [<xref ref-type="bibr" rid="ref14">14</xref>] fine-tuned BART and CONFIT while leveraging GPT-4 for MEDIQA-Chat 2023. GPT-4&#x2019;s natural outputs impressed human experts, though privacy concerns with external application programming interfaces surfaced. Their findings underscore LLMs&#x2019; role in streamlining clinical notes. The study by Srivastava [<xref ref-type="bibr" rid="ref15">15</xref>] tested local-sparse-global BART ensemble methods, finding that section-wise models outperformed single approaches for chart note summaries. Multilayer techniques and PubMed fine-tuning fell short, suggesting specialization as a key focus moving forward. The study by Milintsevich and Agarwal [<xref ref-type="bibr" rid="ref16">16</xref>] fine-tuned FLAN-T5 and LongT5, using multitask learning to cut hallucinations in clinical notes. While effective, adding clinical named entity recognition tags unexpectedly hurt quality, indicating a need for better augmentation strategies. 
The study by Zhang et al [<xref ref-type="bibr" rid="ref17">17</xref>] combined an SVM classifier with GPT-3 for summarization, with GPT-3.5 outperforming a fine-tuned Curie model. Their hybrid approach hints at the power of blending traditional and modern techniques. Finally, the study by Mathur et al [<xref ref-type="bibr" rid="ref18">18</xref>] used GPT-4 with in-context examples, topping MEDIQA 2023 rankings. Despite concise outputs, brevity and privacy risks remain challenges, marking a step forward in applying LLMs to health care. In other research, the study by Heilmeyer et al [<xref ref-type="bibr" rid="ref19">19</xref>] demonstrated that hospitals can use smaller, open-source LLMs running on their own computers to handle sensitive documentation, rather than relying on commercial systems. They found that a model specifically optimized for the local language (German) performed exceptionally well, generating medical reports that doctors rated as usable 93% of the time. The study by Savage et al [<xref ref-type="bibr" rid="ref20">20</xref>] showed that while basic fine-tuning is enough for simple medical checklist tasks, using the more advanced direct preference optimization method allows LLMs to handle complex clinical reasoning and triage by teaching them to distinguish between high-quality and poor responses.</p></sec></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Ethical Considerations</title><p>This study uses the MTS-Dialog dataset [<xref ref-type="bibr" rid="ref1">1</xref>], a publicly available collection of simulated doctor-patient conversations. As the dataset contains no protected health information or real patient interactions, this research does not qualify as human participants research. A qualified physician participated in this study to evaluate the clinical accuracy of the generated notes. 
This involvement was strictly in a professional capacity as a domain expert to assess text quality, rather than as a research participant. No personal data were collected from the evaluator.</p></sec><sec id="s2-2"><title>Main Dataset</title><sec id="s2-2-1"><title>Overview</title><p>There are around 1700 brief doctor-patient conversations paired with clinical notes and summaries in the MTS-Dialog dataset. The main aim behind creating this dataset is to help researchers create tools that automatically summarize doctor-patient conversations and generate clinical notes [<xref ref-type="bibr" rid="ref1">1</xref>]. The training set comprised 1201 pairs for model training. The validation set comprised 100 pairs for fine-tuning. The test sets comprised 2 sets of 200 pairs each: test set 1 (used in MEDIQA-Chat 2023, task A) and test set 2 (used in MEDIQA-Sum 2023, tasks A and B). MEDIQA-Chat 2023 tasks encompassed task A: predict section headers (eg, HISTORY of PRESENT ILLNESS) and content from short dialogues, task B: generate complete clinical notes from full conversations, and task C: create synthetic dialogues from clinical notes. MEDIQA-Sum 2023 tasks encompassed task A: generate clinical note summaries from dialogues and task B: produce section-specific summaries (eg, ASSESSMENT and PLAN).</p></sec><sec id="s2-2-2"><title>Section-Header Categories</title><p>The MTS-Dialog dataset is divided into 20 categories of section headers: fam/sochx, genhx, pastmedicalhx, cc, pastsurgical, allergy, ros, medications, assessment, exam, diagnosis, disposition, plan, edcourse, immunizations, imaging, gynhx, procedures, other_history, and labs. 
The statistics of the dataset shared in the paper [<xref ref-type="bibr" rid="ref1">1</xref>] can be found in <xref ref-type="table" rid="table1">Table 1</xref>.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Descriptive statistics of the Medical Training Summarization Dialog dataset.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Count</td><td align="left" valign="bottom">Values, mean (SD)</td><td align="left" valign="bottom">Values, maximum</td><td align="left" valign="bottom">25th percentile</td><td align="left" valign="bottom">50th percentile</td><td align="left" valign="bottom">75th percentile</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="7">Dialogue</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Turns</td><td align="left" valign="top">15,969</td><td align="left" valign="top">9 (8.72)</td><td align="left" valign="top">103</td><td align="left" valign="top">4</td><td align="left" valign="top">6</td><td align="left" valign="top">12</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Sentences</td><td align="left" valign="top">18,406</td><td align="left" valign="top">11 (13.03)</td><td align="left" valign="top">136</td><td align="left" valign="top">4</td><td align="left" valign="top">7</td><td align="left" valign="top">14</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Words</td><td align="left" valign="top">241,685</td><td align="left" valign="top">142 (144.03)</td><td align="left" valign="top">1951</td><td align="left" valign="top">48</td><td align="left" valign="top">88</td><td align="left" valign="top">176</td></tr><tr><td align="left" valign="top" colspan="7">Summary</td></tr><tr><td 
align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Sentences</td><td align="left" valign="top">5870</td><td align="left" valign="top">3 (4.35)</td><td align="left" valign="top">3</td><td align="left" valign="top">1</td><td align="left" valign="top">2</td><td align="left" valign="top">4</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Words</td><td align="left" valign="top">81,299</td><td align="left" valign="top">48 (72.02)</td><td align="left" valign="top">48</td><td align="left" valign="top">6</td><td align="left" valign="top">18</td><td align="left" valign="top">55</td></tr></tbody></table></table-wrap></sec><sec id="s2-2-3"><title>Data Quality</title><p>The MTS-Dialog dataset underwent a thorough 3-step process to ensure its quality. First, only those with medical backgrounds, such as former medical scribes, were selected to serve as annotators. Second, during the early stages of their work, each annotator received one-on-one feedback from an experienced trainer to help refine their skills. Finally, after the dataset was completed, an independent validation process took place. This separate review used a grading rubric to assess how well the annotated conversations followed the guidelines and how relevant the content was to the original clinical notes. Minor corrections, such as fixing typos or filling in missing information, were made during this stage to make sure the final dataset was even more accurate than the initial version [<xref ref-type="bibr" rid="ref5">5</xref>].</p></sec><sec id="s2-2-4"><title>Comparison With Real Data</title><p>The MTS-Dialog dataset includes both real medical notes and simulated conversations that mirror doctor-patient interactions, helping to avoid any breaches of confidentiality. 
To understand the impact of relying heavily on synthetic data, a blind review was conducted to compare the MTS-Dialog data with real conversations. Distinguishing between the simulated and real data in the dataset is a challenging task. While statistical analysis shows that the simulated conversations have fewer speech errors and pauses, medical experts noted that the dialogues generally feel authentic. In some cases, the clarity, directness, and ease of understanding, even with sudden shifts in topics, caused synthetic data to be mistaken for real interactions. On the other hand, actual data, known for their honesty and minimal speech flaws, were often confused for simulated content due to their polished nature. This difficulty highlights the dataset&#x2019;s value as a foundation for training and evaluating models in practical, real-world settings.</p><p>Back-translation augmentation stands out as a valuable method. It involves converting the original text into another language and then translating it back into the original language. This process introduces natural linguistic variations while preserving the core meaning, thus expanding the training dataset and helping models generalize better to unseen data. To reduce translation errors, French and Spanish were chosen because their vocabulary is similar to English, and they have high-performing translation models [<xref ref-type="bibr" rid="ref21">21</xref>]. 
It was implemented using the following three steps: (1) translation: the original text is translated from its source language (English) to a target language (French and Spanish) using a machine translation model, (2) back-translation: the translated text is then translated back to the original language (English) using another machine translation model, and (3) augmentation: the back-translated text is added to the original training dataset, creating a larger and more diverse corpus.</p><p>After DA, the dataset includes 3.6k pairs of medical dialogues and their corresponding summaries, generated from an initial 1201 training pairs through back-translation using French and Spanish, as outlined in the study [<xref ref-type="bibr" rid="ref1">1</xref>]. Back-translation can significantly increase the size of a training dataset. Theoretically, it can improve the performance of the summarization model. By exposing models to different linguistic variations, back-translation can help them generalize better to unseen data. In addition, by increasing the diversity of training data, back-translation helps prevent overfitting, which is a common problem in NLP.</p></sec></sec><sec id="s2-3"><title>Fine-Tuning Techniques</title><p>In most cases, the graphics processing unit (GPU) hardly has enough memory to fine-tune any decoder-only LLM. To overcome this problem, we used the parameter-efficient fine-tuning (PEFT) technique proposed in the study by Houlsby et al [<xref ref-type="bibr" rid="ref22">22</xref>]. The authors in the paper proposed a parameter-efficient transfer learning technique for NLP that introduces small trainable adapter modules into every layer of a pretrained model with frozen original model weights. The technique minimizes trainable parameters, leading to efficient and scalable fine-tuning across various tasks. We also used 8-bit quantization, where the original pretrained weights of the model are quantized to 8-bit and kept fixed during fine-tuning. 
This method is known as quantized low-rank adapter (QLoRA) [<xref ref-type="bibr" rid="ref23">23</xref>].</p></sec><sec id="s2-4"><title>Research Methodology</title><sec id="s2-4-1"><title>Overview</title><p>This research introduces a new task in text generation. To solve this, a text-generation model must be created to generate clinical notes from the conversation between doctor and patient (<xref ref-type="other" rid="box1">Textbox 1</xref>). A training-inference diagram of this text-generation model is shown in <xref ref-type="fig" rid="figure1">Figure 1</xref>.</p><boxed-text id="box1"><title>Algorithm 1: training procedure for clinical note generation.</title><p><bold>Given dataset:</bold> <italic>C,N, where C={C<sub>0</sub>,C<sub>1</sub>,...,C<sub>i</sub>} [set of doctor-patient conversations] and N={N<sub>0</sub>,N<sub>1</sub>,...,N<sub>i</sub>} [set of clinical notes]</italic></p><p/><p><bold>Objective:</bold> learn a generative model <italic>F</italic> (<italic>C<sub>j</sub></italic>) such that it produces a valid <italic>N<sub>j</sub></italic>, where <italic>N<sub>j</sub></italic> &#x2209; {<italic>N<sub>0</sub></italic>,<italic>N<sub>1</sub></italic>,...,<italic>N<sub>i</sub></italic>} and <italic>N<sub>j</sub></italic> is not generated from {<italic>C<sub>0</sub></italic>,<italic>C<sub>1</sub></italic>,...,<italic>C<sub>i</sub></italic>}</p><p>A valid generated note <italic>N<sub>k</sub></italic> must follow the syntax of the target language, maintain clinical integrity, follow the semantics of the target language, and not be a hallucination.</p></boxed-text><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Training and inference diagram of the clinical note generation model. 
LLM: large language model; PEFT: parameter-efficient fine-tuning.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e82545_fig01.png"/></fig></sec><sec id="s2-4-2"><title>Data Preprocessing</title><p>The dataset was passed through the following preprocessing steps:</p><list list-type="bullet"><list-item><p>Removing unnecessary spaces: some unnecessary tags in the dataset were replaced with an empty string. Certain spaces and line gaps were eliminated for text processing and analysis to streamline the training.</p></list-item><list-item><p>Tokenization: tokenization is one of the most vital steps in this research. In this particular research, HuggingFace models were mostly used. For this reason, HuggingFace&#x2019;s AutoTokenizer class is used for tokenization.</p></list-item></list></sec><sec id="s2-4-3"><title>Evaluation Metrics</title><p>To ensure a robust assessment of the generated clinical notes, we used both automated quantitative metrics and qualitative human expert evaluation:</p><list list-type="bullet"><list-item><p>ROUGE: this metric measures the lexical overlap between the machine-generated notes and the reference summaries. We report ROUGE-1 (unigram overlap), ROUGE-2 (bigram overlap), and ROUGE-L (longest common subsequence) [<xref ref-type="bibr" rid="ref24">24</xref>].</p></list-item><list-item><p>BERTScore: this metric evaluates semantic similarity by leveraging contextual embeddings to compare the generated text with the reference text, capturing meaning beyond exact word matches [<xref ref-type="bibr" rid="ref25">25</xref>].</p></list-item><list-item><p>To validate clinical utility, a qualified physician evaluated a subset of the generated notes. 
The evaluation followed a 5-point Likert scale (where 1 is the lowest and 5 is the highest) across five critical clinical dimensions: (1) medical accuracy: verification of the correctness of clinical facts, (2) completeness: ensuring no vital information from the dialogue was omitted, (3) conciseness and relevance: assessing the model&#x2019;s ability to filter out nonessential information, (4) clinical coherence and readability: evaluating the logical flow and professional tone of the note, and (5) overall clinical quality: an aggregate score representing the general utility of the note in a professional setting.</p></list-item></list></sec><sec id="s2-4-4"><title>Training Setup</title><p>We used the online platform Kaggle to fine-tune our models. NVIDIA Tesla T4 GPU was used for both the computationally intensive fine-tuning process and the inference phase. The hardware configuration was as follows: Tesla T4, CUDA version: 12.4, and available RAM: 29 GB.</p></sec><sec id="s2-4-5"><title>Fine-Tuning the Llama and Mistral Variants</title><p>At first, we tried different sequence-to-sequence models with DA techniques to beat the current state-of-the-art (SOTA) model, but the result was not satisfactory. Next, several SOTA decoder-only models, such as variants of Mistral and Llama, were evaluated. The &#x201C;Meta-Llama-3-8B&#x201D; model, an updated version of the Llama family with 8B parameters, outperformed the SOTA Flan-T5-Large model in the ROUGE and BERT metrics, while the &#x201C;Mistral-7B-v0.3&#x201D; outperformed it in the BERT metric only. A smaller variant of the Llama family &#x201C;Llama-3.2-3B&#x201D; was also fine-tuned. Due to low hardware resources, full model fine-tuning was not feasible. We have also used &#x201C;Meditron-7B.&#x201D; It is a suite of open-source medical language models with 7B and 70B parameters, addressing gaps in medical artificial intelligence [<xref ref-type="bibr" rid="ref26">26</xref>]. 
Built on Llama-2, the models are pretrained on a curated medical corpus, including PubMed articles and guidelines, using NVIDIA&#x2019;s Megatron-LM framework. We used the 7B variant of the model in this research. The decoder-only models were fine-tuned using PEFT [<xref ref-type="bibr" rid="ref22">22</xref>] with the low-rank adaptation (LoRA) [<xref ref-type="bibr" rid="ref27">27</xref>] method. Eight-bit quantization was used while loading the models. This combination of quantization and LoRA is known as QLoRA [<xref ref-type="bibr" rid="ref23">23</xref>]. In this approach, the model&#x2019;s original pretrained weights are converted to an 8-bit format and remain unchanged during fine-tuning. A small number of trainable parameters, called low-rank adapters, are added during the fine-tuning process [<xref ref-type="bibr" rid="ref27">27</xref>]. These adapters are trained to adjust the pretrained model for the specific task during fine-tuning using the 32-bit floating-point format. During computations, such as forward and backward passes in training or during inference, the 8-bit quantized weights are converted back to 32-bit floating-point numbers. After fine-tuning, the model includes the original weights in their 8-bit form, along with the low-rank adapters in their higher-precision format. A significant percentage of this research and analysis is devoted to fine-tuning. For this procedure, the MTS-Dialog dataset was used. The Meta-Llama-3-8B, Llama-3.2-3B, Meditron-7B, and Mistral-7B-v0.3 LLMs were fine-tuned. All the models were loaded with 8-bit quantization, and the LoRA hyperparameter configuration described in <xref ref-type="table" rid="table2">Table 2</xref> was maintained for all of them. The prompt that was used as a prefix to summarize the doctor-patient dialogue to generate clinical notes was as follows:</p><disp-quote><p>Summarize the following patient-doctor dialogue.
Include all medically relevant information, including family history, diagnosis, past medical (and surgical) history, immunizations, lab results, and known allergies.</p></disp-quote><p>This primary prompt was used to generate all quantitative performance metrics reported in <xref ref-type="table" rid="table3">Table 3</xref>. For the evaluation of our fine-tuned models, we calculated performance metrics independently for test set 1 and test set 2. These independent scores were subsequently averaged to ensure comparability with baseline models reported in the literature. Furthermore, we explicitly confirm that there is no data overlap between the training set and these test sets.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Hyperparameter settings used for fine-tuning the large language models, including parameters for training arguments and generation. Detailed configuration of the training environment and model parameters used to adapt Mistral, Llama, and Meditron variants for clinical note generation.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Hyperparameters</td><td align="left" valign="bottom">Settings</td></tr></thead><tbody><tr><td align="left" valign="top">per_device_train_batch_size</td><td align="left" valign="top">1</td></tr><tr><td align="left" valign="top">logging_steps</td><td align="left" valign="top">100</td></tr><tr><td align="left" valign="top">warmup_steps</td><td align="left" valign="top">0.03</td></tr><tr><td align="left" valign="top">save_strategy</td><td align="left" valign="top">epoch</td></tr><tr><td align="left" valign="top">group_by_length</td><td align="left" valign="top">True</td></tr><tr><td align="left" valign="top">lr_scheduler_type</td><td align="left" valign="top">constant</td></tr><tr><td align="left" valign="top">max_seq_length</td><td align="left" valign="top">512</td></tr><tr><td align="left" valign="top">lora_alpha</td><td align="left" 
valign="top">16</td></tr><tr><td align="left" valign="top">lora_dropout</td><td align="left" valign="top">0.1</td></tr><tr><td align="left" valign="top">LoRA attention dimension (rank), r</td><td align="left" valign="top">64</td></tr><tr><td align="left" valign="top">target_modules</td><td align="left" valign="top">q_proj, k_proj, v_proj, o_proj, gate_proj, up_proj, down_proj, lm_head</td></tr><tr><td align="left" valign="top">bias</td><td align="left" valign="top">none</td></tr><tr><td align="left" valign="top">task_type</td><td align="left" valign="top">CAUSAL_LM</td></tr><tr><td align="left" valign="top">max_new_tokens</td><td align="left" valign="top">512</td></tr><tr><td align="left" valign="top">do_sample</td><td align="left" valign="top">True</td></tr><tr><td align="left" valign="top">temperature</td><td align="left" valign="top">0.8</td></tr><tr><td align="left" valign="top">pad_token_id</td><td align="left" valign="top">tokenizer.eos_token_id</td></tr></tbody></table></table-wrap><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Performance metrics for fine-tuned large language models on the clinical note generation task, measured by Recall-Oriented Understudy for Gisting Evaluation (ROUGE) and bidirectional encoder representations from transformers score (BERTScore).</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Method</td><td align="left" valign="bottom">ROUGE-1</td><td align="left" valign="bottom">ROUGE-2</td><td align="left" valign="bottom">ROUGE-L</td><td align="left" valign="bottom">BERTScore-F1</td></tr></thead><tbody><tr><td align="left" valign="top">Llama-3.2-3B</td><td align="left" valign="top">0.3686</td><td align="left" valign="top">0.1517</td><td align="left" valign="top">0.2895</td><td align="left" valign="top">0.8901</td></tr><tr><td align="left" valign="top">Llama-3-8B</td><td align="left" valign="top"><italic>0.4574<sup><xref ref-type="table-fn"
rid="table3fn1">a</xref></sup></italic></td><td align="left" valign="top"><italic>0.2079</italic></td><td align="left" valign="top"><italic>0.3636</italic></td><td align="left" valign="top"><italic>0.9060</italic></td></tr><tr><td align="left" valign="top">Llama-3-8B (DA<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup>)</td><td align="left" valign="top">0.4131</td><td align="left" valign="top">0.1888</td><td align="left" valign="top">0.3410</td><td align="left" valign="top">0.8956</td></tr><tr><td align="left" valign="top">Mistral-7B</td><td align="left" valign="top">0.3889</td><td align="left" valign="top">0.1431</td><td align="left" valign="top">0.3184</td><td align="left" valign="top">0.8933</td></tr><tr><td align="left" valign="top">Mistral-7B (DA)</td><td align="left" valign="top">0.3679</td><td align="left" valign="top">0.1112</td><td align="left" valign="top">0.3014</td><td align="left" valign="top">0.8898</td></tr><tr><td align="left" valign="top">Meditron-7B</td><td align="left" valign="top"><italic>0.4560</italic></td><td align="left" valign="top">0.1931</td><td align="left" valign="top"><italic>0.3667</italic></td><td align="left" valign="top"><italic>0.9056</italic></td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>Best scores are italicized.</p></fn><fn id="table3fn2"><p><sup>b</sup>DA: data augmentation.</p></fn></table-wrap-foot></table-wrap><p>While training, the following &#x201C;TrainingArguments&#x201D; class&#x2019;s hyperparameter configuration was used, shown in <xref ref-type="table" rid="table2">Table 2</xref>. Given the severe computational and memory constraints of fine-tuning LLMs on a single Tesla T4 GPU, conducting an exhaustive algorithmic hyperparameter search (such as a grid search) was computationally prohibitive. Instead, we used an empirical, manual tuning strategy. We iteratively tested a targeted subset of hyperparameter combinations based on established best practices for PEFT. 
For example, we evaluated LoRA &#x03B1; values of 8, 16, and 32, observing empirically that an alpha of 16 provided the most stable validation loss and convergence speed for our dataset without triggering out-of-memory errors. The &#x201C;generate()&#x201D; method of the HuggingFace Transformers library was used to generate clinical notes from the fine-tuned LLMs. The hyperparameters used for generating clinical notes are also shown in <xref ref-type="table" rid="table2">Table 2</xref>. These same hyperparameters were maintained for all the models that were used in this research.</p><p>We used a maximum sequence length of 512 tokens due to hardware memory constraints. Because the dataset contains dialogues exceeding this length, truncation was applied to fit the inputs within this limit.</p></sec></sec></sec><sec id="s3" sec-type="results"><title>Results</title><p>The results obtained after fine-tuning the Llama, Mistral, and Meditron variants are shown in <xref ref-type="table" rid="table3">Table 3</xref>. ROUGE [<xref ref-type="bibr" rid="ref24">24</xref>] and BERTScore [<xref ref-type="bibr" rid="ref25">25</xref>] were used as evaluation metrics. The performances with and without DA are displayed in the table. The fine-tuned Llama-3-8B and Meditron-7B models achieved comparable SOTA performance. Llama-3-8B attained a ROUGE-1 score of 0.4574 and BERTScore-F1 of 0.9060. Notably, Meditron-7B achieved a slightly higher ROUGE-L score (0.3667 vs 0.3636), suggesting strong performance in structural coherence, likely due to its domain-specific pretraining. Given the marginal differences in ROUGE-1 scores (0.0014 difference), we classify both models as top-tier performers among the open-source variants tested, rather than declaring a single definitive winner.</p><p>Llama-3-8B fine-tuned with DA performed worse than Llama-3-8B fine-tuned without DA. With DA, we get a ROUGE-1 score of 0.4131, a ROUGE-2 score of 0.1888, a ROUGE-L score of 0.3410, and a BERTScore-F1 score of 0.8956.
A smaller version of the Llama model, &#x201C;Llama-3.2-3B,&#x201D; was also fine-tuned, but the performance was not good enough, indicating that model parameter size remains a critical factor for this specific task. The model achieved a ROUGE-1 score of 0.3686 and a BERTScore-F1 of 0.8901. The fine-tuned Mistral model also delivered competitive results, achieving a BERTScore-F1 of 0.8933. However, in the case of ROUGE scores, it lagged behind Llama-3-8B. With DA, the performance of the Mistral-7B model decreased.</p><p>An expert physician evaluated the LLM-generated clinical notes using a 5-point Likert scale (1-5) across five dimensions: medical accuracy, completeness, conciseness and relevance, clinical coherence and readability, and overall clinical quality (average of all these criteria). Higher scores indicate better performance across all criteria. The human evaluation results are shown in <xref ref-type="table" rid="table4">Table 4</xref>.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Physician-rated clinical quality evaluation of the large language model&#x2013;generated notes. A qualitative assessment was conducted by a qualified domain expert to validate the clinical utility of the models.
Five key dimensions were evaluated using a 5-point Likert scale: medical accuracy, completeness, conciseness and relevance, clinical coherence and readability, and overall clinical quality.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Medical accuracy, mean (SD)</td><td align="left" valign="bottom">Completeness, mean (SD)</td><td align="left" valign="bottom">Conciseness and relevance, mean (SD)</td><td align="left" valign="bottom">Clinical coherence and readability, mean (SD)</td><td align="left" valign="bottom">Overall clinical quality, mean (SD)</td></tr></thead><tbody><tr><td align="left" valign="top">Llama-3.2-3B</td><td align="left" valign="top">4.29 (1.14)</td><td align="left" valign="top">4.38 (1.02)</td><td align="left" valign="top">4.43 (1.05)</td><td align="left" valign="top">4.65 (0.91)</td><td align="left" valign="top">4.44 (0.93)</td></tr><tr><td align="left" valign="top">Llama-3-8B</td><td align="left" valign="top">4.66 (0.86)</td><td align="left" valign="top">4.65 (0.89)</td><td align="left" valign="top">4.73 (0.82)</td><td align="left" valign="top">4.83 (0.74)</td><td align="left" valign="top">4.72 (0.75)</td></tr><tr><td align="left" valign="top">Mistral-7B</td><td align="left" valign="top">4.55 (0.68)</td><td align="left" valign="top">4.68 (0.60)</td><td align="left" valign="top">4.73 (0.60)</td><td align="left" valign="top">4.89 (0.42)</td><td align="left" valign="top">4.71 (0.51)</td></tr><tr><td align="left" valign="top">Meditron-7B</td><td align="left" valign="top">4.43 (0.90)</td><td align="left" valign="top">4.47 (0.90)</td><td align="left" valign="top">4.73 (0.55)</td><td align="left" valign="top">4.89 (0.35)</td><td align="left" valign="top">4.63 (0.52)</td></tr></tbody></table></table-wrap></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>The results of this study 
demonstrate that decoder-only transformer models, specifically Llama-3-8B and Meditron-7B, achieve SOTA performance among open-source, parameter-efficient, fine-tuned models in generating clinical notes from doctor-patient dialogues. Our findings indicate that PEFT with 8-bit quantization (QLoRA) allows these large models to perform effectively under hardware constraints while maintaining high medical accuracy. The results clearly show that DA did not enhance the performance of either model; both the Mistral and Llama variants performed better without it. A similar phenomenon has been noted and explained in a previous research paper [<xref ref-type="bibr" rid="ref28">28</xref>]. Notably, an expert physician&#x2019;s evaluation confirmed that these models produce clinically coherent and accurate summaries, with Llama-3-8B and Mistral-7B leading in overall clinical quality.</p></sec><sec id="s4-2"><title>Comparison With Prior Work</title><p>Our research establishes a new benchmark by surpassing the previous leader, Flan-T5-Large, in automatic evaluation metrics such as ROUGE and BERTScore. Unlike some entries in the MEDIQA-Chat 2023 competition that relied on DA to boost performance, our experiments showed that DA actually decreased performance for Mistral and Llama variants, aligning with findings in other specialized medical NLP research.</p></sec><sec id="s4-3"><title>Limitations</title><p><xref ref-type="table" rid="table5">Table 5</xref> shows the training and inference times for different models. Training time is measured in seconds/epoch. Inference for all fine-tuned models was run on a Tesla T4 GPU. <xref ref-type="table" rid="table5">Table 5</xref> shows that the inference time for Mistral-7B is shorter than that of Llama-3-8B but longer than that of Meditron-7B. However, for real-world applications, inference time needs to be improved further.
Training time for all models is quite long.</p><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>Computational efficiency and latency analysis per clinical note generation. This table compares the training duration (seconds/epoch) and average inference latency (seconds/note) across the four primary model architectures. Conducted on a Tesla T4 graphics processing unit, these metrics evaluate the feasibility of deploying decoder-only large language models in real-time clinical settings where documentation speed is critical for reducing physician burnout.</p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Training time</td><td align="left" valign="bottom">Average inference time</td></tr></thead><tbody><tr><td align="left" valign="top">Llama-3.2-3B</td><td align="left" valign="top">1406.6</td><td align="left" valign="top">12.49</td></tr><tr><td align="left" valign="top">Llama-3-8B</td><td align="left" valign="top">2588.9</td><td align="left" valign="top">13.41</td></tr><tr><td align="left" valign="top">Mistral-7B</td><td align="left" valign="top">2171.6</td><td align="left" valign="top">8.68</td></tr><tr><td align="left" valign="top">Meditron-7B</td><td align="left" valign="top">2566.06</td><td align="left" valign="top">6.27</td></tr></tbody></table></table-wrap><p>All the fine-tuned models had some gender biases. As the &#x201C;MTS-Dialog&#x201D; dataset is a short doctor-patient conversation dataset, sometimes it is quite difficult to understand the patient&#x2019;s gender from the conversation. In these situations, all of the fine-tuned models mostly assume the patient&#x2019;s gender as male. In a few cases, the model predicts the gender as female, while in the annotation, the patient is identified as male. In some cases in the dataset, the gender pronoun is used incorrectly.
We assume that much of the models&#x2019; pretraining data could be biased, which causes this type of problem. To overcome this, we used the following new prompt during inference:</p><disp-quote><p>Summarize the following patient-doctor dialogue. To ensure a comprehensive summary, follow these steps:</p><p>Gender identification: identify the patient&#x2019;s gender based on the context and use appropriate pronouns throughout the summary.</p><p>Medical history: summarize the patient&#x2019;s family history, past medical and surgical history, and known allergies. Ensure each detail is categorized.</p><p>Current visit: identify the main concerns discussed, including symptoms, lab results, and diagnosis.</p><p>Immunizations: list any relevant immunization history.</p><p>Plan and recommendations: note any treatment plan, medications prescribed, or follow-up actions recommended by the doctor. Ensure your summary flows logically, preserving the order of the conversation, while focusing on medically relevant details.</p></disp-quote><p>However, even after using this prompt, there was still no significant improvement. This gender-aware prompt was evaluated qualitatively but is not reflected in the baseline quantitative results.</p><p>Another major limitation of our qualitative human evaluation is the reliance on a single expert physician, which introduces potential evaluator bias and precludes the calculation of interrater reliability metrics such as Cohen &#x03BA;. Future studies should use multiple independent evaluators to mitigate individual bias.</p></sec><sec id="s4-4"><title>Future Work</title><p>The amount of data in the medical domain is very limited. It is also very hard to get access to this type of data because of physician-patient privilege. Most patients are not comfortable sharing their private data.
It is essential to develop more datasets, which will make it easier to create an automatic clinical note generation system. The dataset used in this research consists of short conversations between doctors and patients. A larger corpus of longer real-world doctor-patient conversations is needed in the future to improve the quality of clinical note generation.</p><p>Domain-specific pretrained decoder-only LLMs have substantially improved performance on domain-specific tasks in recent years. A pretrained model such as Code Llama is one example in the coding domain [<xref ref-type="bibr" rid="ref29">29</xref>]. Developing a decoder-only model that is pretrained on a large medical corpus might help with clinical note generation tasks.</p><p>In diverse health care settings, doctors and patients may converse in different languages. A cross-lingual summary can help bridge this language gap by automatically summarizing conversations in one language and translating the summary into another, making it accessible to a wider range of health care providers. Creating a cross-lingual clinical note generator could greatly impact this domain.</p><p>In this research, text data from doctor-patient conversations were used. Generating clinical notes directly from the spoken interactions between doctors and patients could provide a more accurate and practical solution.</p><p>The maximum sequence length was set to 512 tokens due to hardware memory constraints. This truncation represents a methodological limitation that may have affected the completeness of the generated notes for longer dialogues. Future implementations should consider sliding window approaches to handle lengthy conversations without information loss.</p><p>This study also showed that merely updating the prompt during inference does not solve the gender bias problem. We hypothesize that the following steps could overcome it.
The wrongly addressed gender in the dataset should be corrected manually in the reference note, and the patient&#x2019;s gender information should be included in the conversation. After that, the prompt should be updated by including the information to detect the correct gender of the patient and use this prompt to fine-tune the model. By doing all these tasks, it may be possible to solve the problem.</p></sec><sec id="s4-5"><title>Conclusions</title><p>In this research, we experimented with various decoder-only transformer architectures to fine-tune models for generating clinical notes by summarizing conversations between doctors and patients. The results demonstrate that decoder-only models such as Llama3 and Mistral outperform classical encoder-decoder models such as Flan-T5 and Pegasus in summarizing medical discussions. Larger models tend to deliver better results compared to smaller ones. By fine-tuning the Llama-3-8B and Meditron-7B models, SOTA performance among open-source, parameter-efficient models was achieved in terms of ROUGE-1 score and BERTScore. However, none of the models underwent full fine-tuning. Instead, PEFT methods were applied across the board. The study highlights the importance of domain-specific pretraining and high-quality annotated datasets. While our methods show promising results, there are still challenges to overcome, such as hardware limitations, gender bias, and the lack of diverse medical datasets. Ultimately, automating clinical documentation through decoder-only LLMs has the potential to enhance health care efficiency. 
However, given the unresolved gender bias observed in this study, these models are currently best suited as assistive tools in human-in-the-loop workflows rather than autonomous systems, requiring further research into bias mitigation before they can ensure fully accurate patient records.</p></sec></sec></body><back><ack><p>The authors would like to express their gratitude to Dr Mazidul Islam Nahid (MBBS, PGT [ENT], DLO [Course]; BMDC RN: 129195) of Sir Salimullah Medical College and Mitford Hospital for his essential contribution as the domain expert. His clinical expertise was instrumental in evaluating the generated notes across five critical dimensions: medical accuracy, completeness, conciseness and relevance, clinical coherence and readability, and overall clinical quality.</p><p>All authors declared that they had insufficient or no funding to support open-access publication of this manuscript, including from affiliated organizations or institutions, funding agencies, or other organizations. JMIR Publications provided article processing fee (APF) support for the publication of this article.</p><p>Generative artificial intelligence (ChatGPT and Gemini) was used solely for grammatical polishing and readability improvement, while the scientific concept and experimental design remain exclusively the work of the authors.</p></ack><notes><sec><title>Funding</title><p>No financial support was received for the research, authorship, and/or publication of this article.</p></sec><sec><title>Data Availability</title><p>The study used the Medical Training Summarization Dialog dataset, which is a publicly available collection of simulated doctor-patient conversations.
It can be accessed through the official MEDIQA-Chat 2023 shared task repository.</p></sec></notes><fn-group><fn fn-type="con"><p>Conceptualization: SA, FYS</p><p>Data curation: SA</p><p>Formal analysis: SA</p><p>Investigation: SA</p><p>Methodology: SA</p><p>Project administration: FYS</p><p>Software: SA</p><p>Supervision: FYS</p><p>Visualization: SA</p><p>Writing &#x2013; original draft: SA</p><p>Writing &#x2013; review &#x0026; editing: SA</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">BART</term><def><p>Bidirectional and Auto-Regressive Transformers</p></def></def-item><def-item><term id="abb2">BERT</term><def><p>bidirectional encoder representations from transformers</p></def></def-item><def-item><term id="abb3">DA</term><def><p>data augmentation</p></def></def-item><def-item><term id="abb4">GPU</term><def><p>graphics processing unit</p></def></def-item><def-item><term id="abb5">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb6">LoRA</term><def><p>low-rank adaptation</p></def></def-item><def-item><term id="abb7">MTS-Dialog</term><def><p>Medical Training Summarization Dialog</p></def></def-item><def-item><term id="abb8">NLP</term><def><p>natural language processing</p></def></def-item><def-item><term id="abb9">PEFT</term><def><p>parameter-efficient fine-tuning</p></def></def-item><def-item><term id="abb10">QLoRA</term><def><p>quantized low-rank adaptation</p></def></def-item><def-item><term id="abb11">ROUGE</term><def><p>Recall-Oriented Understudy for Gisting Evaluation</p></def></def-item><def-item><term id="abb12">SOTA</term><def><p>state-of-the-art</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Ben Abacha</surname><given-names>A</given-names> </name><name
name-style="western"><surname>Yim</surname><given-names>WW</given-names> </name><name name-style="western"><surname>Fan</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Lin</surname><given-names>T</given-names> </name></person-group><article-title>An empirical study of clinical note generation from doctor-patient encounters</article-title><source>Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics</source><year>2023</year><publisher-name>Association for Computational Linguistics</publisher-name><pub-id pub-id-type="doi">10.18653/v1/2023.eacl-main.168</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Jiang</surname><given-names>AQ</given-names> </name><name name-style="western"><surname>Sablayrolles</surname><given-names>A</given-names> </name><name name-style="western"><surname>Mensch</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Mistral 7B</article-title><source>arXiv</source><comment>Preprint posted online on  Oct 10, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2310.06825</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Touvron</surname><given-names>H</given-names> </name><name name-style="western"><surname>Lavril</surname><given-names>T</given-names> </name><name name-style="western"><surname>Izacard</surname><given-names>G</given-names> </name><etal/></person-group><article-title>LLaMA: open and efficient foundation language models</article-title><source>arXiv</source><comment>Preprint posted online on  Feb 27, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2302.13971</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="other"><person-group 
person-group-type="author"><name name-style="western"><surname>Grattafiori</surname><given-names>A</given-names> </name><name name-style="western"><surname>Dubey</surname><given-names>A</given-names> </name><name name-style="western"><surname>Jauhri</surname><given-names>A</given-names> </name><etal/></person-group><article-title>The Llama 3 herd of models</article-title><source>arXiv</source><comment>Preprint posted online on  Jul 31, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2407.21783</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Ben Abacha</surname><given-names>A</given-names> </name><name name-style="western"><surname>Yim</surname><given-names>WW</given-names> </name><name name-style="western"><surname>Adams</surname><given-names>G</given-names> </name><name name-style="western"><surname>Snider</surname><given-names>N</given-names> </name><name name-style="western"><surname>Yetisgen</surname><given-names>M</given-names> </name></person-group><article-title>Overview of the MEDIQA-Chat 2023 shared tasks on the summarization &#x0026; generation of doctor-patient conversations</article-title><source>Proceedings of the 5th Clinical Natural Language Processing Workshop</source><year>2023</year><publisher-name>Association for Computational Linguistics</publisher-name><fpage>503</fpage><lpage>513</lpage><pub-id pub-id-type="doi">10.18653/v1/2023.clinicalnlp-1.52</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hripcsak</surname><given-names>G</given-names> </name><name name-style="western"><surname>Vawdrey</surname><given-names>DK</given-names> </name><name name-style="western"><surname>Fred</surname><given-names>MR</given-names> </name><name 
name-style="western"><surname>Bostwick</surname><given-names>SB</given-names> </name></person-group><article-title>Use of electronic clinical documentation: time spent and team interactions</article-title><source>J Am Med Inform Assoc</source><year>2011</year><volume>18</volume><issue>2</issue><fpage>112</fpage><lpage>117</lpage><pub-id pub-id-type="doi">10.1136/jamia.2010.008441</pub-id><pub-id pub-id-type="medline">21292706</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mann</surname><given-names>DM</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>J</given-names> </name><name name-style="western"><surname>Chunara</surname><given-names>R</given-names> </name><name name-style="western"><surname>Testa</surname><given-names>PA</given-names> </name><name name-style="western"><surname>Nov</surname><given-names>O</given-names> </name></person-group><article-title>COVID-19 transforms health care through telemedicine: evidence from the field</article-title><source>J Am Med Inform Assoc</source><year>2020</year><month>07</month><day>1</day><volume>27</volume><issue>7</issue><fpage>1132</fpage><lpage>1135</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocaa072</pub-id><pub-id pub-id-type="medline">32324855</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Lewis</surname><given-names>M</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Goyal</surname><given-names>N</given-names> </name><etal/></person-group><article-title>BART: denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension</article-title><source>Proceedings of the 58th Annual Meeting of the 
Association for Computational Linguistics</source><year>2020</year><publisher-name>Association for Computational Linguistics</publisher-name><pub-id pub-id-type="doi">10.18653/v1/2020.acl-main.703</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Zhao</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Saleh</surname><given-names>M</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>P</given-names> </name></person-group><article-title>PEGASUS: pre-training with extracted gap-sentences for abstractive summarization</article-title><source>Proc Mach Learn Res</source><year>2020</year><volume>119</volume><fpage>11328</fpage><lpage>11339</lpage></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Ozler</surname><given-names>KB</given-names> </name><name name-style="western"><surname>Bethard</surname><given-names>S</given-names> </name></person-group><article-title>Clulab at MEDIQA-Chat 2023: summarization and classification of medical dialogues</article-title><source>Proceedings of the 5th Clinical Natural Language Processing Workshop</source><year>2023</year><publisher-name>Association for Computational Linguistics</publisher-name><fpage>144</fpage><lpage>149</lpage></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Sharma</surname><given-names>A</given-names> </name><name name-style="western"><surname>Feldman</surname><given-names>D</given-names> </name><name name-style="western"><surname>Jain</surname><given-names>A</given-names> </name></person-group><article-title>Team 
Cadence at MEDIQA-Chat 2023: generating, augmenting and summarizing clinical dialogue with large language models</article-title><source>Proceedings of the 5th Clinical Natural Language Processing Workshop</source><year>2023</year><publisher-name>Association for Computational Linguistics</publisher-name><fpage>228</fpage><lpage>235</lpage></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Yao</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Mitra</surname><given-names>A</given-names> </name><name name-style="western"><surname>Osebe</surname><given-names>S</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>H</given-names> </name></person-group><article-title>UMASS_BioNLP at MEDIQA-Chat 2023: can LLMs generate high-quality synthetic note-oriented doctor-patient conversations?</article-title><source>Proceedings of the 5th Clinical Natural Language Processing Workshop</source><year>2023</year><publisher-name>Association for Computational Linguistics</publisher-name><fpage>460</fpage><lpage>471</lpage></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Suri</surname><given-names>K</given-names> </name><name name-style="western"><surname>Saha</surname><given-names>S</given-names> </name><name name-style="western"><surname>Singh</surname><given-names>A</given-names> </name></person-group><article-title>HealthMavericks@MEDIQA-Chat 2023: benchmarking different transformer based models for clinical dialogue summarization</article-title><source>Proceedings of the 5th Clinical Natural Language Processing 
Workshop</source><year>2023</year><publisher-name>Association for Computational Linguistics</publisher-name><fpage>472</fpage><lpage>489</lpage></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Tang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Tran</surname><given-names>A</given-names> </name><name name-style="western"><surname>Tan</surname><given-names>J</given-names> </name><name name-style="western"><surname>Gerstein</surname><given-names>M</given-names> </name></person-group><article-title>GersteinLab at MEDIQA-Chat 2023: clinical note summarization from doctor-patient conversations through fine-tuning and in-context learning</article-title><source>arXiv</source><comment>Preprint posted online on May 8, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2305.05001</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Srivastava</surname><given-names>D</given-names> </name></person-group><article-title>IUTEAM1 at MEDIQA-Chat 2023: is simple fine tuning effective for multi layer summarization of clinical conversations?</article-title><source>arXiv</source><comment>Preprint posted online on Jun 7, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2306.04328</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Milintsevich</surname><given-names>K</given-names> </name><name name-style="western"><surname>Agarwal</surname><given-names>N</given-names> </name></person-group><article-title>Calvados at MEDIQA-Chat 2023: improving clinical note generation with multi-task instruction finetuning</article-title><source>Proceedings of the 5th Clinical Natural Language Processing 
Workshop</source><year>2023</year><publisher-name>Association for Computational Linguistics</publisher-name><fpage>529</fpage><lpage>535</lpage></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>B</given-names> </name><name name-style="western"><surname>Mishra</surname><given-names>R</given-names> </name><name name-style="western"><surname>Teodoro</surname><given-names>D</given-names> </name></person-group><article-title>DS4DH at MEDIQA-Chat 2023: leveraging SVM and GPT-3 prompt engineering for medical dialogue classification and summarization</article-title><source>Proceedings of the 5th Clinical Natural Language Processing Workshop</source><year>2023</year><publisher-name>Association for Computational Linguistics</publisher-name><fpage>536</fpage><lpage>545</lpage></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Mathur</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Rangreji</surname><given-names>S</given-names> </name><name name-style="western"><surname>Kapoor</surname><given-names>R</given-names> </name><name name-style="western"><surname>Palavalli</surname><given-names>M</given-names> </name><name name-style="western"><surname>Bertsch</surname><given-names>A</given-names> </name><name name-style="western"><surname>Gormley</surname><given-names>MR</given-names> </name></person-group><article-title>SummQA at MEDIQA-Chat 2023: in-context learning with GPT-4 for medical summarization</article-title><source>arXiv</source><comment>Preprint posted online on Jun 30, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2306.17384</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Heilmeyer</surname><given-names>F</given-names> </name><name name-style="western"><surname>B&#x00F6;hringer</surname><given-names>D</given-names> </name><name name-style="western"><surname>Reinhard</surname><given-names>T</given-names> </name><name name-style="western"><surname>Arens</surname><given-names>S</given-names> </name><name name-style="western"><surname>Lyssenko</surname><given-names>L</given-names> </name><name name-style="western"><surname>Haverkamp</surname><given-names>C</given-names> </name></person-group><article-title>Viability of open large language models for clinical documentation in German health care: real-world model evaluation study</article-title><source>JMIR Med Inform</source><year>2024</year><month>08</month><day>28</day><volume>12</volume><fpage>e59617</fpage><pub-id pub-id-type="doi">10.2196/59617</pub-id><pub-id pub-id-type="medline">39195570</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Savage</surname><given-names>T</given-names> </name><name name-style="western"><surname>Ma</surname><given-names>SP</given-names> </name><name name-style="western"><surname>Boukil</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Fine-tuning methods for large language models in clinical medicine by supervised fine-tuning and direct preference optimization: comparative evaluation</article-title><source>J Med Internet Res</source><year>2025</year><month>09</month><day>23</day><volume>27</volume><fpage>e76048</fpage><pub-id pub-id-type="doi">10.2196/76048</pub-id><pub-id pub-id-type="medline">40986888</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Tiedemann</surname><given-names>J</given-names> </name><name 
name-style="western"><surname>Thottingal</surname><given-names>S</given-names> </name></person-group><article-title>OPUS-MT &#x2013; building open translation services for the world</article-title><source>Proceedings of the 22nd Annual Conference of the European Association for Machine Translation</source><year>2020</year><publisher-name>European Association for Machine Translation</publisher-name><fpage>479</fpage><lpage>480</lpage></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Houlsby</surname><given-names>N</given-names> </name><name name-style="western"><surname>Giurgiu</surname><given-names>A</given-names> </name><name name-style="western"><surname>Jastrzebski</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Parameter-efficient transfer learning for NLP</article-title><source>Proc Mach Learn Res</source><year>2019</year><volume>97</volume><fpage>2790</fpage><lpage>2799</lpage></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Dettmers</surname><given-names>T</given-names> </name><name name-style="western"><surname>Holtzman</surname><given-names>A</given-names> </name><name name-style="western"><surname>Pagnoni</surname><given-names>A</given-names> </name><name name-style="western"><surname>Zettlemoyer</surname><given-names>L</given-names> </name></person-group><article-title>QLORA: efficient finetuning of quantized LLMs</article-title><source>NIPS &#x2019;23: Proceedings of the 37th International Conference on Neural Information Processing Systems</source><year>2023</year><publisher-name>Curran Associates Inc</publisher-name></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name 
name-style="western"><surname>Lin</surname><given-names>CY</given-names> </name></person-group><article-title>ROUGE: a package for automatic evaluation of summaries</article-title><source>Text Summarization Branches Out</source><year>2004</year><publisher-name>Association for Computational Linguistics</publisher-name><fpage>74</fpage><lpage>81</lpage></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>T</given-names> </name><name name-style="western"><surname>Kishore</surname><given-names>V</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>F</given-names> </name><name name-style="western"><surname>Weinberger</surname><given-names>KQ</given-names> </name><name name-style="western"><surname>Artzi</surname><given-names>Y</given-names> </name></person-group><article-title>BERTScore: evaluating text generation with BERT</article-title><source>arXiv</source><comment>Preprint posted online on Apr 19, 2019</comment><pub-id pub-id-type="doi">10.48550/arXiv.1904.09675</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Cano</surname><given-names>AH</given-names> </name><name name-style="western"><surname>Romanou</surname><given-names>A</given-names> </name><etal/></person-group><article-title>MEDITRON-70B: scaling medical pretraining for large language models</article-title><source>arXiv</source><comment>Preprint posted online on Nov 23, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2311.16079</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name 
name-style="western"><surname>Hu</surname><given-names>EJ</given-names> </name><name name-style="western"><surname>Shen</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wallis</surname><given-names>P</given-names> </name><etal/></person-group><article-title>LoRA: low-rank adaptation of large language models</article-title><source>arXiv</source><comment>Preprint posted online on Jun 17, 2021</comment><pub-id pub-id-type="doi">10.48550/arXiv.2106.09685</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Nair</surname><given-names>V</given-names> </name><name name-style="western"><surname>Katariya</surname><given-names>N</given-names> </name><name name-style="western"><surname>Amatriain</surname><given-names>X</given-names> </name><name name-style="western"><surname>Valmianski</surname><given-names>I</given-names> </name><name name-style="western"><surname>Kannan</surname><given-names>A</given-names> </name></person-group><article-title>Adding more data does not always help: a study in medical conversation summarization with PEGASUS</article-title><source>arXiv</source><comment>Preprint posted online on Nov 15, 2021</comment><pub-id pub-id-type="doi">10.48550/arXiv.2111.07564</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Rozi&#x00E8;re</surname><given-names>B</given-names> </name><name name-style="western"><surname>Gehring</surname><given-names>J</given-names> </name><name name-style="western"><surname>Gloeckle</surname><given-names>F</given-names> </name><etal/></person-group><article-title>Code Llama: open foundation models for code</article-title><source>arXiv</source><comment>Preprint posted online on Aug 24, 2023</comment><pub-id 
pub-id-type="doi">10.48550/arXiv.2308.12950</pub-id></nlm-citation></ref></ref-list></back></article>