<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id><journal-id journal-id-type="publisher-id">medinform</journal-id><journal-id journal-id-type="index">7</journal-id><journal-title>JMIR Medical Informatics</journal-title><abbrev-journal-title>JMIR Med Inform</abbrev-journal-title><issn pub-type="epub">2291-9694</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v13i1e71176</article-id><article-id pub-id-type="doi">10.2196/71176</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Language Models for Multilabel Document Classification of Surgical Concepts in Exploratory Laparotomy Operative Notes: Algorithm Development Study</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Balch</surname><given-names>Jeremy A</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Desaraju</surname><given-names>Sasank S</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Nolan</surname><given-names>Victoria J</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Vellanki</surname><given-names>Divya</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Buchanan</surname><given-names>Timothy R</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Brinkley</surname><given-names>Lindsey M</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Penev</surname><given-names>Yordan</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Bilgili</surname><given-names>Ahmet</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Patel</surname><given-names>Aashay</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Chatham</surname><given-names>Corinne E</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Vanderbilt</surname><given-names>David M</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Uddin</surname><given-names>Rayon</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Bihorac</surname><given-names>Azra</given-names></name><degrees>MS, MD</degrees><xref ref-type="aff" rid="aff4">4</xref><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Efron</surname><given-names>Philip</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Loftus</surname><given-names>Tyler J</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Rahman</surname><given-names>Protiva</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Shickel</surname><given-names>Benjamin</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff4">4</xref><xref ref-type="aff" rid="aff5">5</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Surgery, University of Florida College of Medicine</institution><addr-line>Gainesville</addr-line><addr-line>FL</addr-line><country>United States</country></aff><aff id="aff2"><institution>Department of Health Outcomes and Biomedical Informatics, University of Florida College of Medicine</institution><addr-line>Gainesville</addr-line><addr-line>FL</addr-line><country>United States</country></aff><aff id="aff3"><institution>University of Florida College of Medicine</institution><addr-line>Gainesville</addr-line><addr-line>FL</addr-line><country>United States</country></aff><aff id="aff4"><institution>Intelligent Clinical Care Center, University of Florida</institution><addr-line>Gainesville</addr-line><addr-line>FL</addr-line><country>United States</country></aff><aff id="aff5"><institution>Department of Medicine, University of Florida College of Medicine</institution><addr-line>1600 SW Archer Road, PO Box 100224</addr-line><addr-line>Gainesville</addr-line><addr-line>FL</addr-line><country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name 
name-style="western"><surname>Coristine</surname><given-names>Andrew</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Jain</surname><given-names>Aditi</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Choi</surname><given-names>Dong Hyun</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Ebrahim</surname><given-names>Mansoor Veliyathnadu</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Benjamin Shickel, PhD, Department of Medicine, University of Florida College of Medicine, 1600 SW Archer Road, PO Box 100224, Gainesville, FL, 32610, United States, 3522739958, 1 3522739221; <email>shickelb@ufl.edu</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>9</day><month>7</month><year>2025</year></pub-date><volume>13</volume><elocation-id>e71176</elocation-id><history><date date-type="received"><day>11</day><month>01</month><year>2025</year></date><date date-type="rev-recd"><day>15</day><month>04</month><year>2025</year></date><date date-type="accepted"><day>15</day><month>05</month><year>2025</year></date></history><copyright-statement>&#x00A9; Jeremy A Balch, Sasank S Desarju, Victoria J Nolan, Divya Vallanki, Timothy R Buchanan, Lindsey M Brinkley, Yordan Penev, Ahmet Bilgili, Aashay Patel, Corinne E Chatham, David M Vanderbilt, Rayon Uddin, Azra Bihorac, Philip Efron, Tyler J Loftus, Protiva Rahman, Benjamin Shickel. Originally published in JMIR Medical Informatics (<ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org">https://medinform.jmir.org</ext-link>), 9.7.2025. 
</copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org/">https://medinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://medinform.jmir.org/2025/1/e71176"/><abstract><sec><title>Background</title><p>Operative notes are frequently mined for surgical concepts in clinical care, research, quality improvement, and billing, often requiring hours of manual extraction. These notes are typically analyzed at the document level to determine the presence or absence of specific procedures or findings (eg, whether a hand-sewn anastomosis was performed or contamination occurred). Extracting several binary classification labels simultaneously is a multilabel classification problem. Traditional natural language processing approaches&#x2014;bag-of-words (BoW) and term frequency-inverse document frequency (tf-idf) with linear classifiers&#x2014;have been used previously for this task but are now being augmented or replaced by large language models (LLMs). 
However, few studies have examined their utility in surgery.</p></sec><sec><title>Objective</title><p>We developed and evaluated LLMs for the purpose of expediting data extraction from surgical notes.</p></sec><sec sec-type="methods"><title>Methods</title><p>A total of 388 exploratory laparotomy notes from a single institution were annotated for 21 concepts related to intraoperative findings, intraoperative techniques, and closure techniques. Annotation consistency was measured using the Cohen &#x03BA; statistic. Data were preprocessed to include only the description of the procedure. We compared the evolution of document classification technologies from BoW and tf-idf to encoder-only (Clinical-Longformer) and decoder-only (Llama 3) transformer models. Multilabel classification performance was evaluated with 5-fold cross-validation with <italic>F</italic><sub>1</sub>-score and hamming loss (HL). We experimented with and without context. Errors were assessed by manual review. Code and implementation instructions may be found on GitHub.</p></sec><sec sec-type="results"><title>Results</title><p>The prevalence of labels ranged from 0.05 (colostomy, ileostomy, active bleed from named vessel) to 0.50 (running fascial closure). Llama 3.3 was the overall best-performing model (micro <italic>F</italic><sub>1</sub>-score 0.88, 5-fold range: 0.88-0.89; HL 0.11, 5-fold range: 0.11-0.12). The BoW model (micro <italic>F</italic><sub>1</sub>-score 0.68, 5-fold range: 0.64-0.71; HL 0.14, 5-fold range: 0.13-0.16) and Clinical-Longformer (micro <italic>F</italic><sub>1</sub>-score 0.73, 5-fold range: 0.70-0.74; HL 0.11, 5-fold range: 0.10-0.12) had overall similar performance, with tf-idf models trailing (micro <italic>F</italic><sub>1</sub>-score 0.57, 5-fold range: 0.55-0.59; HL 0.27, 5-fold range: 0.25-0.29). 
<italic>F</italic><sub>1</sub>-scores varied across concepts in the Llama model, ranging from 0.30 (5-fold range: 0.23-0.39) for class III contamination to 0.92 (5-fold range: 0.84-0.98) for bowel resection. Context enhanced Llama&#x2019;s performance, adding an average of 0.16 improvement to the <italic>F</italic><sub>1</sub>-scores. Error analysis demonstrated semantic nuances and edge cases within operative notes, particularly when patients had references to prior operations in their operative notes or simultaneous operations with other surgical services.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Off-the-shelf autoregressive LLMs outperformed fine-tuned, encoder-only transformers and traditional natural language processing techniques in classifying operative notes. Multilabel classification with LLMs may streamline retrospective reviews in surgery, though further refinements are required prior to reliable use in research and quality improvement.</p></sec></abstract><kwd-group><kwd>chart review</kwd><kwd>generative large language models</kwd><kwd>general surgery</kwd><kwd>natural language processing</kwd><kwd>exploratory laparotomy</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Operative notes represent the most thorough narrative of a surgical case in the electronic health record, containing information that is largely inaccessible outside of manual human review [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. This limitation impedes retrospective studies on surgical technique and intraoperative findings that impact outcomes, as well as the ability to perform prospective validation and real-time implementation of decision-support systems. 
Natural language processing (NLP) and large language models (LLMs) may offer a streamlined approach to information extraction for clinical workflow, education, research, performance improvement, and billing purposes [<xref ref-type="bibr" rid="ref3">3</xref>].</p><p>The terms and phrases used to characterize surgical techniques and intraoperative findings often contain complex dependencies that span multiple sentences and are best understood in the context of an entire operative note. Furthermore, in retrospective reviews focused on patient outcomes, operative notes serve as a vehicle to identify study participants, with subsequent attention to downstream outcomes often represented in structured data (ie, mortality, surgical site infection, or anastomotic leaks defined by the <italic>International Classification of Disease</italic> codes) [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>].</p><p>For this reason, we frame our problem in terms of a multilabel document classification task [<xref ref-type="bibr" rid="ref6">6</xref>] where operative notes take on a series of binary labels as to whether or not a certain intraoperative finding (eg, bleeding and contamination) or technique (eg, bowel resection, hand-sewn anastomosis, and style of fascial closure) occurred during the case. Traditional NLP methods, using word frequencies, generally perform well on this task, though can fail to capture context and negation, a noted strength of the attention mechanism in LLMs [<xref ref-type="bibr" rid="ref7">7</xref>]. Several studies have investigated LLMs for text classification in clinical notes, though to our knowledge, few studies have examined multilabel classification, only one has used generative models, and none have done so in surgical specialties [<xref ref-type="bibr" rid="ref8">8</xref>-<xref ref-type="bibr" rid="ref12">12</xref>]. 
There is a similar paucity of publications using real-world data outside of curated datasets, which, in addition to representing idealized clinical documentation, are also conspicuously devoid of operative notes [<xref ref-type="bibr" rid="ref13">13</xref>-<xref ref-type="bibr" rid="ref15">15</xref>].</p><p>Generative LLMs may offer &#x201C;off-the-shelf&#x201D; abilities to capture the multidependency nature of intraoperative findings and surgical techniques. We hypothesize that generative LLMs can outperform fine-tuned encoder-only LLMs and traditional NLP methods in classifying operative notes as containing specific findings and techniques [<xref ref-type="bibr" rid="ref16">16</xref>].</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Data</title><p>Using the University of Florida Health Integrated Data Repository as an honest broker, we assessed 2 single-center, longitudinal electronic health record datasets for all adult patients admitted to a surgical service at University of Florida Health Gainesville and Jacksonville, both quaternary referral centers, between June 1, 2014, and August 22, 2022. We randomly selected 420 fully deidentified exploratory laparotomy operative reports using SQL queries. In total, 32 were found to be mislabeled as &#x201C;exploratory laparotomy,&#x201D; with no evidence that the abdominal cavity was entered, and so were excluded, leaving 388 notes. As our scope was limited to the operative notes themselves, no surgical outcome, operative metadata, or sociodemographic data were collected.</p></sec><sec id="s2-2"><title>Ethical Considerations</title><p>This study was approved by the University of Florida Institutional Review Board and Privacy Office (IRB#201600262) as an exempt study with a waiver of informed consent. All data used in this study were deidentified. No compensation was provided. 
This study was performed in accordance with the TRIPOD+LLM (Transparent Reporting of a Multivariable Prediction Model for Individual Prognosis or Diagnosis+Large Language Model) reporting guideline [<xref ref-type="bibr" rid="ref17">17</xref>].</p></sec><sec id="s2-3"><title>Data Preparation</title><p>The project workflow is shown in <xref ref-type="fig" rid="figure1">Figure 1</xref>. A team of 8 annotators, consisting of medical students (TRB, LMB, YP, A Bilgili, AP, CEC, RU, and DMV) and one surgical resident (JAB), were trained on the project&#x2019;s objectives and annotation software. A detailed annotation manual is provided with definitions, categories, and illustrative examples (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). An annotated operative note is shown in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>. Emphasis was placed on achieving a high level of consistency, with the goal of reaching a Cohen &#x03BA; coefficient of above 0.8 for interrater reliability [<xref ref-type="bibr" rid="ref18">18</xref>]. The first author (JAB) served as the ground truth. A total of 20 operative notes were set aside for annotator training and were reviewed by all annotators. Following training, annotators participated in regular discussions to address any challenges and were reviewed by the first author. Annotations were performed with Label Studio (version 1.8.2; HumanSignal).</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Workflow schema. Exploratory laparotomy notes are first extracted and annotated. After preprocessing, they are passed to 4 machine-learning models for multilabel document classification. Models are compared using several performance metrics. 
Finally, error analysis is performed and all annotation, preprocessing, prompts, and model architectures are modified as necessary on training data to optimize the <italic>F</italic><sub>1</sub>-score prior to evaluation on test data. AUPRC: area under the precision-recall curve; AUROC: area under the receiver operating characteristic curve; BoW: bag-of-words; CL: Clinical-Longformer; PPV: positive-predictive value; SN: sensitivity; SP: specificity; tf-idf: term frequency-inverse document frequency.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e71176_fig01.png"/></fig></sec><sec id="s2-4"><title>Labels</title><p>Notes were annotated for structure, intraoperative findings, and surgical techniques. Whole text spans were highlighted based on note structure: patient or staff or anesthesia personnel information; procedures performed; pre- and postoperative diagnoses; intraoperative findings; indication or history; description of the procedure; ins, outs, and specimens; disposition; and complications. Intraoperative findings included: contamination (class I, II, III, and IV) as defined in the peer-reviewed literature [<xref ref-type="bibr" rid="ref19">19</xref>]; and bleeding, differentiating between active bleed from a named vessel and active bleed from a solid organ. Whole-document labels were performed for: bowel resection, primary repair of enterotomies, colostomy formation, ileostomy formation, hand-sewn anastomosis, stapled anastomosis, placement of mesh, fascia closure techniques (running or continuous, interrupted, and left open), and skin closure techniques (full, Prevena, partial, and left open). For the training set, Cohen &#x03BA; across individual labels ranged from 0.39 to 1.0 with a mean and median agreement of 0.67 (SD 0.33) and 0.77 (IQR 0.52-1.0), respectively (Table S1 in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>). 
The &#x03BA; scores across all medical students are shown in Table S1 in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>. Because this was below our stated goal, additional training was provided with emphasis on these concepts, and each op note in the dataset was personally reviewed by the lead author. A total of 50 notes were annotated by each of the annotators, with the lead author annotating an additional 50.</p></sec><sec id="s2-5"><title>Data Splitting and Stratification for Class Imbalance</title><p>Standard techniques in multilabel classification tasks with label-specific class imbalances may result in datasets missing rare, positive labels [<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref21">21</xref>]. To account for this, we performed iterative stratification from scikit-multilearn, splitting the data into 5 folds of training (80%) and test (20%) sets [<xref ref-type="bibr" rid="ref20">20</xref>-<xref ref-type="bibr" rid="ref22">22</xref>]. The distributions of the labels in each train and test set are shown in Table S2 in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>.</p><p>Unlike other models, the Llama models were not fine-tuned on a hold-out training set. They were instead used to evaluate only the test set in each cross-validation fold.</p></sec><sec id="s2-6"><title>Models</title><p>We studied the traditional NLP multilabel document classification techniques with bag-of-words (BoW) and term frequency-inverse document frequency (tf-idf) approaches paired with logistic regression classifiers, as well as pretrained transformer models, the encoder-only Clinical-Longformer [<xref ref-type="bibr" rid="ref23">23</xref>], and the decoder-only Llama herd (Llama 3.1 -3b, 8b, 70b, 3.2, and 3.3) [<xref ref-type="bibr" rid="ref24">24</xref>].</p><p>BoW takes tokenized words and performs a classification task based on the frequency of the terms in a particular document. 
tf-idf applies a weight-based filter on the frequency of a term across a corpus of documents and evaluates the uniqueness of a word to a specific class.</p><p>Transformer-based models can leverage contextual information [<xref ref-type="bibr" rid="ref25">25</xref>]. Encoder models process the entire document by systematically masking input tokens and predicting their values. While encoder models typically excel at classification tasks, their utility is often limited by length, as most models cannot process more than 512 tokens at a time [<xref ref-type="bibr" rid="ref26">26</xref>]. Longformer models extend that range using both global and sliding-window attention mechanisms [<xref ref-type="bibr" rid="ref27">27</xref>]. Li et al [<xref ref-type="bibr" rid="ref23">23</xref>] fine-tuned a Clinical-Longformer model on clinical text from the Medical Information Mart for Intensive Care-III dataset [<xref ref-type="bibr" rid="ref15">15</xref>] with a context of 4096 tokens, which outperformed Bidirectional Encoder Representations from Transformers (BERT) [<xref ref-type="bibr" rid="ref28">28</xref>], ClinicalBERT [<xref ref-type="bibr" rid="ref29">29</xref>], and BioBERT [<xref ref-type="bibr" rid="ref30">30</xref>] on inference, question-answering, and classification tasks [<xref ref-type="bibr" rid="ref23">23</xref>]. Finally, autoregressive decoder&#x2013;only transformers estimate the probability distribution of the next token in a sequence based on the preceding tokens. As they are self-hosted, Llama models allow for the secure handling of sensitive patient information and for this reason, these models were selected for this study [<xref ref-type="bibr" rid="ref24">24</xref>]. The results shown below are from the best-performing Llama 3.3 model.</p></sec><sec id="s2-7"><title>Preprocessing</title><p>Notes were reduced to the &#x201C;description of procedure&#x201D; as other parts of the note may contain information from previous procedures that may bias the model. 
For the tf-idf and BoW models, all texts were converted to lowercase, and common stop words (eg, &#x201C;the,&#x201D; &#x201C;and,&#x201D; and &#x201C;is&#x201D;), punctuation, and numbers were removed. Stemming and lemmatization were performed to reduce words to their root forms (eg, &#x201C;maturing&#x201D; to &#x201C;mature&#x201D;). The text was then vectorized as combinations of unigrams, bigrams, trigrams, and 4-grams. We introduced padding to ensure that all sequences had a uniform length. The Clinical-Longformer and Llama models were tokenized using the Hugging Face autotokenizer [<xref ref-type="bibr" rid="ref31">31</xref>].</p></sec><sec id="s2-8"><title>Model Hyperparameters</title><p>For BoW and tf-idf, we used logistic regression as our classifier. Hyperparameter search within each fold of the training data revealed marginally increased performance with L2-regularization strength of 0.1 and 10 for BoW and tf-idf overall, respectively. No other hyperparameters were modified based on the results of the test set. In the Clinical-Longformer model, we weighted the binary cross-entropy loss for each label inversely proportional to its prevalence in the training set given class imbalance. The model was optimized for the micro <italic>F</italic><sub>1</sub>-score and trained for up to 500 epochs with early stopping, using a patience of 10 to prevent overfitting. The inference was run on an NVIDIA A100 8GB graphics processing unit in the University of Florida HiPerGator cluster. The Llama 3.3 model had the longest runtime, at 723 minutes.</p><p>A custom Python script was developed using the LlamaIndex framework for the Llama model [<xref ref-type="bibr" rid="ref32">32</xref>]. Each task was a modified version of the annotation instructions, and the model was prompted with the operative note, the context of the task, few-shot instructions, a question, and a desired response format (<xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>). 
A general context document was also provided and included brand names of mesh types, a description of types of skin closure, and other domain-specific knowledge that could aid in understanding patient notes and tasks (<xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>). Given the 5-fold cross-validation design, all notes appeared in at least one test set. As a result, prompts were adjusted based on the model&#x2019;s generated rationale for randomly selected errors on the whole dataset (eg, differentiating &#x201C;primary repair&#x201D; from &#x201C;anastomosis&#x201D; or clarifying the use of &#x201C;prolene&#x201D; in mesh vs suture contexts). Performance metrics were not evaluated during prompt tuning to avoid test set leakage.</p></sec><sec id="s2-9"><title>Model Evaluation</title><p>Overall performance was evaluated using the micro <italic>F</italic><sub>1</sub>-score, which calculates the harmonic mean of precision and recall across all classes, and hamming loss (HL), which measures the fraction of misclassified labels relative to the total ground truth labels (with 0 indicating perfect classification). The mean and range of scores over 5 folds were reported. Optimal cutoffs were determined by maximizing the <italic>F</italic><sub>1</sub>-score in 0.01 increments. Sensitivity, positive predictive value (PPV), specificity, area under the receiver operating curve, and area under the precision-recall curve were also reported. Individual label <italic>F</italic><sub>1</sub>-scores were calculated using the &#x201C;binary&#x201D; average.</p></sec><sec id="s2-10"><title>Error Analysis</title><p>A total of 5 false positive and 5 false negative labels with the highest predicted probabilities were reviewed for each label using the best-performing Clinical-Longformer and Llama model. 
Several annotation errors were encountered during each iteration which resulted in manual reannotation by the lead author, repeat BoW, tf-idf, and Clinical-Longformer model training, and rerunning of the evaluation pipeline. The reported metrics reflect the latest training and evaluation.</p></sec><sec id="s2-11"><title>Data Availability and Code</title><p>Code and implementation instructions may be found on GitHub [<xref ref-type="bibr" rid="ref33">33</xref>]. A toy dataset is provided using GPT-generated op notes and random labels.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Data</title><p>Of the 388 operative notes, note length ranged from 73 to 1713 words, with a mean of 500 (SD 291) words and a median of 421 (IQR 292-603) words. Most notes were composed by the Trauma and Acute Care Surgery Department (n=267, 68.8%), with the remaining notes in Transplant Surgery (n=83, 21.4%) and Urology (n=30, 7.7%) along with combination cases with Vascular Surgery (n=24, 6.2%), Cardiothoracic Surgery (n=16, 4.1%), and Neurosurgery (n=8, 2.1%). 
We noted the class imbalance in the labels, as shown in <xref ref-type="table" rid="table1">Table 1</xref>.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Prevalence of labels in the dataset.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom" colspan="2">Label</td><td align="left" valign="bottom">Prevalence</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="3">Intraoperative findings</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Active bleeding from the named vessel</td><td align="left" valign="top">0.05</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Active bleeding from solid organ</td><td align="left" valign="top">0.11</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Class I</td><td align="left" valign="top">0.34</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Class II</td><td align="left" valign="top">0.48</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Class III</td><td align="left" valign="top">0.16</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Class IV</td><td align="left" valign="top">0.14</td></tr><tr><td align="left" valign="top" colspan="3">Intraoperative techniques</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Bowel resection</td><td align="left" valign="top">0.30</td></tr><tr><td align="left" valign="top" 
colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Primary repair</td><td align="left" valign="top">0.05</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Serosal tear repair</td><td align="left" valign="top">0.05</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Colostomy</td><td align="left" valign="top">0.12</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Ileostomy</td><td align="left" valign="top">0.08</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Hand-sewn anastomosis</td><td align="left" valign="top">0.12</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Stapled anastomosis</td><td align="left" valign="top">0.15</td></tr><tr><td align="left" valign="top" colspan="3">Closure techniques</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Fascia closed (interrupted)</td><td align="left" valign="top">0.10</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Fascia closed (running or continuous)</td><td align="left" valign="top">0.50</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Fascia left open</td><td align="left" valign="top">0.32</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Skin closed (full with 
Prevena)</td><td align="left" valign="top">0.04</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Skin closed (full)</td><td align="left" valign="top">0.41</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Skin closed (partial)</td><td align="left" valign="top">0.05</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Skin left open</td><td align="left" valign="top">0.43</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Synthetic</td><td align="left" valign="top">0.06</td></tr></tbody></table></table-wrap></sec><sec id="s3-2"><title>Collective Performance Across All Labels</title><p>Overall mean micro <italic>F</italic><sub>1</sub>-scores, along with minimum and maximum score per fold, are shown in <xref ref-type="table" rid="table2">Table 2</xref>. BoW (0.68, 5-fold range: 0.64&#x2010;0.71) outperformed tf-idf (0.57, 5-fold range: 0.55-0.59) overall, with an increase of 0.1 in the micro <italic>F</italic><sub>1</sub>-score and a roughly two-fold decrease in HL. Comparing the encoder-only and decoder-only model architectures, Llama 3.3 (0.88, 5-fold range: 0.88-0.89) showed a substantial overall improvement in the micro <italic>F</italic><sub>1</sub>-score with equivalent HL to BoW and Clinical-Longformer.</p><p>We compared the Llama 3 series of models and observed a general trend of improved performance with increasing model size. An exception was Llama 3.2, which performed poorly&#x2014;consistent with prior reports of its reduced effectiveness on medical datasets [<xref ref-type="bibr" rid="ref34">34</xref>]. 
Results are presented in Figure S1 in <xref ref-type="supplementary-material" rid="app6">Multimedia Appendix 6</xref>.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Mean overall performance of models across all labels across all 5-folds<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup>.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Micro <italic>F</italic><sub>1</sub>-score, mean (range)</td><td align="left" valign="bottom">Hamming loss, mean (range)</td></tr></thead><tbody><tr><td align="left" valign="top">BoW<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" valign="top">0.68 (0.64-0.71)</td><td align="left" valign="top">0.14 (0.13-0.16)</td></tr><tr><td align="left" valign="top">tf-idf<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="top">0.57 (0.55-0.59)</td><td align="left" valign="top">0.27 (0.25-0.29)</td></tr><tr><td align="left" valign="top">Clinical-Longformer</td><td align="left" valign="top">0.73 (0.70-0.74)</td><td align="left" valign="top">0.11 (0.10-0.12)</td></tr><tr><td align="left" valign="top">Llama 3.3</td><td align="left" valign="top">0.88 (0.88-0.89)</td><td align="left" valign="top">0.12 (0.11-0.12)</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>Values in parentheses indicate the minimum and maximum performance.</p></fn><fn id="table2fn2"><p><sup>b</sup>BoW: bag-of-words.</p></fn><fn id="table2fn3"><p><sup>c</sup>tf-idf: term frequency-inverse document frequency.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-3"><title>Individual Label Performance</title><p><italic>F</italic><sub>1</sub>-scores with ranges for the individual labels are visualized in <xref ref-type="fig" rid="figure2">Figure 2</xref> and shown numerically in <xref ref-type="table" rid="table3">Tables 3</xref><xref 
ref-type="table" rid="table4"/>-<xref ref-type="table" rid="table5">5</xref>. Intraoperative bleeding was well categorized by the Llama model, while surgical wound class was often better served by Clinical-Longformer or BoW models (<xref ref-type="fig" rid="figure2">Figure 2A</xref>). For the intraoperative technique (<xref ref-type="fig" rid="figure2">Figure 2B</xref>), the Llama model was the highest performer, with the Clinical-Longformer and BoW models performing with overlapping <italic>F</italic><sub>1</sub>-scores. Intraoperative and skin and fascial closure techniques were best served by the generative model. We noted excellent performance for the Llama 3.3 model in several categories with <italic>F</italic><sub>1</sub>-scores &#x2265;0.8. Of note, there was surprisingly poor performance on the Prevena label across all models, given that the brand name should often cue a positive class. Interrupted fascial closure was also noticeably poor, despite how this is often specifically stated in the operative note.</p><p><xref ref-type="table" rid="table3">Tables 3</xref><xref ref-type="table" rid="table4"/>-<xref ref-type="table" rid="table5">5</xref> demonstrate numeric values of the <italic>F</italic><sub>1</sub>-scores alongside sensitivity and PPVs. The Llama model was again the best performing overall with the notable exception of class II and stapled anastomosis labels. While the PPV of Llama was overall better, it performed poorly in 2 skin closure tasks, class III contamination task, and stapled anastomosis task. Full metrics across all models and labels are shown in Table S3 in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p><italic>F</italic><sub>1</sub>-scores with error bars representing range over 5-fold cross-validation. 
BoW: bag-of-words; CL: Clinical-Longformer; tf-idf: term frequency-inverse document frequency.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v13i1e71176_fig02.png"/></fig><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Comparison of model performance across performance metrics for intraoperative findings.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom" colspan="6">Intraoperative findings</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Active bleeding from named vessel</td><td align="left" valign="top">Active bleeding from solid organ</td><td align="left" valign="top">Class I</td><td align="left" valign="top">Class II</td><td align="left" valign="top">Class III</td><td align="left" valign="top">Class IV</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="6"><italic>F</italic><sub>1</sub>-score</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>BoW<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td><td align="char" char="." valign="top">0.31</td><td align="char" char="." valign="top">0.64</td><td align="char" char="." valign="top">0.62</td><td align="char" char="." valign="top">0.62</td><td align="char" char="." valign="top">0.35</td><td align="char" char="." valign="top">0.42</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>tf-idf<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td><td align="char" char="." valign="top">0.36</td><td align="char" char="." valign="top">0.6</td><td align="char" char="." valign="top">0.66</td><td align="char" char="." valign="top">0.68</td><td align="char" char="." 
valign="top">0.41</td><td align="char" char="." valign="top">0.48</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>CL<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup></td><td align="char" char="." valign="top">0.44</td><td align="char" char="." valign="top">0.63</td><td align="char" char="." valign="top">0.73</td><td align="char" char="." valign="top">0.72</td><td align="char" char="." valign="top">0.46</td><td align="char" char="." valign="top">0.48</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Llama</td><td align="char" char="." valign="top">0.61</td><td align="char" char="." valign="top">0.84</td><td align="char" char="." valign="top">0.36</td><td align="char" char="." valign="top">0.75</td><td align="char" char="." valign="top">0.3</td><td align="char" char="." valign="top">0.48</td></tr><tr><td align="left" valign="top" colspan="7">SN<sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>BoW</td><td align="char" char="." valign="top">0.27</td><td align="char" char="." valign="top">0.61</td><td align="char" char="." valign="top">0.71</td><td align="char" char="." valign="top">0.66</td><td align="char" char="." valign="top">0.35</td><td align="char" char="." valign="top">0.43</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>tf-idf</td><td align="char" char="." valign="top">0.55</td><td align="char" char="." valign="top">0.8</td><td align="char" char="." valign="top">0.89</td><td align="char" char="." valign="top">0.86</td><td align="char" char="." valign="top">0.55</td><td align="char" char="." 
valign="top">0.59</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>CL</td><td align="char" char="." valign="top">0.55</td><td align="char" char="." valign="top">0.72</td><td align="char" char="." valign="top">0.77</td><td align="char" char="." valign="top">0.76</td><td align="char" char="." valign="top">0.54</td><td align="char" char="." valign="top">0.53</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Llama</td><td align="char" char="." valign="top">0.55</td><td align="char" char="." valign="top">0.96</td><td align="char" char="." valign="top">0.24</td><td align="char" char="." valign="top">0.9</td><td align="char" char="." valign="top">1</td><td align="char" char="." valign="top">0.95</td></tr><tr><td align="left" valign="top" colspan="7">PPV<sup><xref ref-type="table-fn" rid="table3fn5">e</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>BoW</td><td align="char" char="." valign="top">0.37</td><td align="char" char="." valign="top">0.71</td><td align="char" char="." valign="top">0.56</td><td align="char" char="." valign="top">0.59</td><td align="char" char="." valign="top">0.38</td><td align="char" char="." valign="top">0.46</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>tf-idf</td><td align="char" char="." valign="top">0.42</td><td align="char" char="." valign="top">0.51</td><td align="char" char="." valign="top">0.53</td><td align="char" char="." valign="top">0.56</td><td align="char" char="." valign="top">0.34</td><td align="char" char="." valign="top">0.41</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>CL</td><td align="char" char="." 
valign="top">0.51</td><td align="char" char="." valign="top">0.58</td><td align="char" char="." valign="top">0.69</td><td align="char" char="." valign="top">0.68</td><td align="char" char="." valign="top">0.41</td><td align="char" char="." valign="top">0.46</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Llama</td><td align="char" char="." valign="top">0.72</td><td align="char" char="." valign="top">0.77</td><td align="char" char="." valign="top">0.79</td><td align="char" char="." valign="top">0.65</td><td align="char" char="." valign="top">0.18</td><td align="char" char="." valign="top">0.33</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>BoW: bag-of-words.</p></fn><fn id="table3fn2"><p><sup>b</sup>tf-idf: term frequency-inverse document frequency.</p></fn><fn id="table3fn3"><p><sup>c</sup>CL: Clinical-Longformer.</p></fn><fn id="table3fn4"><p><sup>d</sup>SN: sensitivity.</p></fn><fn id="table3fn5"><p><sup>e</sup>PPV: positive predictive value.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Comparison of model performance across performance metrics for intraoperative techniques.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom" colspan="7">Intraoperative techniques</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Bowel resection</td><td align="left" valign="top">Primary repair</td><td align="left" valign="top">Serosal tear repair</td><td align="left" valign="top">Colostomy</td><td align="left" valign="top">Ileostomy</td><td align="left" valign="top">Hand-sewn anastomosis</td><td align="left" valign="top">Stapled anastomosis</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="8"><italic>F</italic><sub>1</sub>-score</td></tr><tr><td align="left" 
valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>BoW<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td><td align="char" char="." valign="top">0.86</td><td align="char" char="." valign="top">0.22</td><td align="char" char="." valign="top">0.42</td><td align="char" char="." valign="top">0.51</td><td align="char" char="." valign="top">0.45</td><td align="char" char="." valign="top">0.61</td><td align="char" char="." valign="top">0.65</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>tf-idf<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></td><td align="char" char="." valign="top">0.81</td><td align="char" char="." valign="top">0.25</td><td align="char" char="." valign="top">0.43</td><td align="char" char="." valign="top">0.65</td><td align="char" char="." valign="top">0.69</td><td align="char" char="." valign="top">0.59</td><td align="char" char="." valign="top">0.61</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>CL<sup><xref ref-type="table-fn" rid="table4fn3">c</xref></sup></td><td align="char" char="." valign="top">0.83</td><td align="char" char="." valign="top">0.37</td><td align="char" char="." valign="top">0.37</td><td align="char" char="." valign="top">0.76</td><td align="char" char="." valign="top">0.78</td><td align="char" char="." valign="top">0.65</td><td align="char" char="." valign="top">0.7</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Llama</td><td align="char" char="." valign="top">0.92</td><td align="char" char="." valign="top">0.5</td><td align="char" char="." valign="top">0.82</td><td align="char" char="." valign="top">0.7</td><td align="char" char="." valign="top">0.92</td><td align="char" char="." 
valign="top">0.71</td><td align="char" char="." valign="top">0.83</td></tr><tr><td align="left" valign="top" colspan="8">SN<sup><xref ref-type="table-fn" rid="table4fn4">d</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>BoW</td><td align="char" char="." valign="top">0.86</td><td align="char" char="." valign="top">0.23</td><td align="char" char="." valign="top">0.32</td><td align="char" char="." valign="top">0.42</td><td align="char" char="." valign="top">0.39</td><td align="char" char="." valign="top">0.58</td><td align="char" char="." valign="top">0.66</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>tf-idf</td><td align="char" char="." valign="top">0.93</td><td align="char" char="." valign="top">0.3</td><td align="char" char="." valign="top">0.39</td><td align="char" char="." valign="top">0.7</td><td align="char" char="." valign="top">0.7</td><td align="char" char="." valign="top">0.72</td><td align="char" char="." valign="top">0.8</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>CL</td><td align="char" char="." valign="top">0.87</td><td align="char" char="." valign="top">0.42</td><td align="char" char="." valign="top">0.31</td><td align="char" char="." valign="top">0.78</td><td align="char" char="." valign="top">0.79</td><td align="char" char="." valign="top">0.84</td><td align="char" char="." valign="top">0.77</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Llama</td><td align="char" char="." valign="top">0.95</td><td align="char" char="." valign="top">0.4</td><td align="char" char="." valign="top">0.95</td><td align="char" char="." 
valign="top">0.67</td><td align="char" char="." valign="top">0.87</td><td align="char" char="." valign="top">0.73</td><td align="char" char="." valign="top">0.85</td></tr><tr><td align="left" valign="top" colspan="8">PPV<sup><xref ref-type="table-fn" rid="table4fn5">e</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>BoW</td><td align="char" char="." valign="top">0.87</td><td align="char" char="." valign="top">0.22</td><td align="char" char="." valign="top">0.72</td><td align="char" char="." valign="top">0.73</td><td align="char" char="." valign="top">0.63</td><td align="char" char="." valign="top">0.68</td><td align="char" char="." valign="top">0.65</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>tf-idf</td><td align="char" char="." valign="top">0.72</td><td align="char" char="." valign="top">0.24</td><td align="char" char="." valign="top">0.65</td><td align="char" char="." valign="top">0.65</td><td align="char" char="." valign="top">0.75</td><td align="char" char="." valign="top">0.54</td><td align="char" char="." valign="top">0.5</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>CL</td><td align="char" char="." valign="top">0.8</td><td align="char" char="." valign="top">0.35</td><td align="char" char="." valign="top">0.6</td><td align="char" char="." valign="top">0.79</td><td align="char" char="." valign="top">0.83</td><td align="char" char="." valign="top">0.56</td><td align="char" char="." valign="top">0.65</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Llama</td><td align="char" char="." valign="top">0.9</td><td align="char" char="." valign="top">0.7</td><td align="char" char="." valign="top">0.73</td><td align="char" char="." 
valign="top">0.75</td><td align="char" char="." valign="top">1</td><td align="char" char="." valign="top">0.71</td><td align="char" char="." valign="top">0.83</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>BoW: bag-of-words.</p></fn><fn id="table4fn2"><p><sup>b</sup>tf-idf: term frequency-inverse document frequency.</p></fn><fn id="table4fn3"><p><sup>c</sup>CL: Clinical-Longformer.</p></fn><fn id="table4fn4"><p><sup>d</sup>SN: sensitivity.</p></fn><fn id="table4fn5"><p><sup>e</sup>PPV: positive predictive value.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>Comparison of model performance across performance metrics for closure and mesh techniques.</p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom" colspan="8">Closure and mesh techniques</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Fascia closed interrupted</td><td align="left" valign="top">Fascia closed continuous</td><td align="left" valign="top">Fascia left open</td><td align="left" valign="top">Skin closed (Prevena)</td><td align="left" valign="top">Skin closed (full)</td><td align="left" valign="top">Skin closed (partial)</td><td align="left" valign="top">Skin left open</td><td align="left" valign="top">Synthetic mesh</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="9"><italic>F</italic><sub>1</sub>-score</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>BoW<sup><xref ref-type="table-fn" rid="table5fn1">a</xref></sup></td><td align="char" char="." valign="top">0.34</td><td align="char" char="." valign="top">0.86</td><td align="char" char="." valign="top">0.82</td><td align="char" char="." valign="top">0.15</td><td align="char" char="." valign="top">0.78</td><td align="char" char="." 
valign="top">0.2</td><td align="char" char="." valign="top">0.77</td><td align="char" char="." valign="top">0.55</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>tf-idf<sup><xref ref-type="table-fn" rid="table5fn2">b</xref></sup></td><td align="char" char="." valign="top">0.42</td><td align="char" char="." valign="top">0.82</td><td align="char" char="." valign="top">0.78</td><td align="char" char="." valign="top">0.07</td><td align="char" char="." valign="top">0.74</td><td align="char" char="." valign="top">0.12</td><td align="char" char="." valign="top">0.76</td><td align="char" char="." valign="top">0.6</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>CL<sup><xref ref-type="table-fn" rid="table5fn3">c</xref></sup></td><td align="char" char="." valign="top">0.31</td><td align="char" char="." valign="top">0.88</td><td align="char" char="." valign="top">0.84</td><td align="char" char="." valign="top">0.13</td><td align="char" char="." valign="top">0.81</td><td align="char" char="." valign="top">0.23</td><td align="char" char="." valign="top">0.8</td><td align="char" char="." valign="top">0.81</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Llama</td><td align="char" char="." valign="top">0.6</td><td align="char" char="." valign="top">0.89</td><td align="char" char="." valign="top">0.92</td><td align="char" char="." valign="top">0.61</td><td align="char" char="." valign="top">0.89</td><td align="char" char="." valign="top">0.47</td><td align="char" char="." valign="top">0.9</td><td align="char" char="." 
valign="top">0.71</td></tr><tr><td align="left" valign="top" colspan="9">SN<sup><xref ref-type="table-fn" rid="table5fn4">d</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>BoW</td><td align="char" char="." valign="top">0.36</td><td align="char" char="." valign="top">0.9</td><td align="char" char="." valign="top">0.88</td><td align="char" char="." valign="top">0.2</td><td align="char" char="." valign="top">0.84</td><td align="char" char="." valign="top">0.19</td><td align="char" char="." valign="top">0.83</td><td align="char" char="." valign="top">0.51</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>tf-idf</td><td align="char" char="." valign="top">0.55</td><td align="char" char="." valign="top">0.96</td><td align="char" char="." valign="top">0.94</td><td align="char" char="." valign="top">0.1</td><td align="char" char="." valign="top">0.92</td><td align="char" char="." valign="top">0.09</td><td align="char" char="." valign="top">0.91</td><td align="char" char="." valign="top">0.6</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>CL</td><td align="char" char="." valign="top">0.27</td><td align="char" char="." valign="top">0.95</td><td align="char" char="." valign="top">0.89</td><td align="char" char="." valign="top">0.1</td><td align="char" char="." valign="top">0.91</td><td align="char" char="." valign="top">0.18</td><td align="char" char="." valign="top">0.81</td><td align="char" char="." valign="top">0.87</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Llama</td><td align="char" char="." valign="top">0.7</td><td align="char" char="." valign="top">0.83</td><td align="char" char="." valign="top">0.92</td><td align="char" char="." 
valign="top">0.51</td><td align="char" char="." valign="top">0.88</td><td align="char" char="." valign="top">0.74</td><td align="char" char="." valign="top">0.94</td><td align="char" char="." valign="top">0.87</td></tr><tr><td align="left" valign="top" colspan="9">PPV<sup><xref ref-type="table-fn" rid="table5fn5">e</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>BoW</td><td align="char" char="." valign="top">0.46</td><td align="char" char="." valign="top">0.83</td><td align="char" char="." valign="top">0.77</td><td align="char" char="." valign="top">0.12</td><td align="char" char="." valign="top">0.74</td><td align="char" char="." valign="top">0.23</td><td align="char" char="." valign="top">0.73</td><td align="char" char="." valign="top">0.7</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>tf-idf</td><td align="char" char="." valign="top">0.4</td><td align="char" char="." valign="top">0.71</td><td align="char" char="." valign="top">0.67</td><td align="char" char="." valign="top">0.05</td><td align="char" char="." valign="top">0.63</td><td align="char" char="." valign="top">0.25</td><td align="char" char="." valign="top">0.66</td><td align="char" char="." valign="top">0.68</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>CL</td><td align="char" char="." valign="top">0.4</td><td align="char" char="." valign="top">0.83</td><td align="char" char="." valign="top">0.8</td><td align="char" char="." valign="top">0.2</td><td align="char" char="." valign="top">0.73</td><td align="char" char="." valign="top">0.33</td><td align="char" char="." valign="top">0.81</td><td align="char" char="." 
valign="top">0.8</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Llama</td><td align="char" char="." valign="top">0.56</td><td align="char" char="." valign="top">0.95</td><td align="char" char="." valign="top">0.92</td><td align="char" char="." valign="top">0.8</td><td align="char" char="." valign="top">0.91</td><td align="char" char="." valign="top">0.37</td><td align="char" char="." valign="top">0.87</td><td align="char" char="." valign="top">0.62</td></tr></tbody></table><table-wrap-foot><fn id="table5fn1"><p><sup>a</sup>BoW: bag-of-words.</p></fn><fn id="table5fn2"><p><sup>b</sup>tf-idf: term frequency-inverse document frequency.</p></fn><fn id="table5fn3"><p><sup>c</sup>CL: Clinical-Longformer.</p></fn><fn id="table5fn4"><p><sup>d</sup>SN: sensitivity.</p></fn><fn id="table5fn5"><p><sup>e</sup>PPV: positive predictive value.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-4"><title>Context</title><p>We evaluated performance on the Llama 3.1-70b model with and without the context document. The model performed better overall with the context, with an average improvement of 0.16 in the <italic>F</italic><sub>1</sub>-score (Figure S2 in <xref ref-type="supplementary-material" rid="app6">Multimedia Appendix 6</xref>). The context offered the greatest improvement in serosal tear repair (+0.4) and the context hurt model performance in class III (&#x2212;0.19) and stapled-anastomosis (&#x2212;0.08) labels.</p></sec><sec id="s3-5"><title>Error Analysis</title><p>A manual review of 5 false negatives and 5 false positives per label in the encoder-only and decoder-only models revealed several trends in errors, though often it was unclear why a model made a particular prediction. 
Overall, 88 annotations (0.01% of all annotations) were changed upon review, mostly in bowel resection (n=21), hand-sewn anastomosis (n=19), active bleed from solid organ (n=17), and serosal tear repair (n=13).</p><p>Examining the 3 overarching categories, for the encoder-only LLM, bleeds were often picked up, though generally any presence of bleeding was marked positive, regardless of its origins. For intraoperative techniques, false negative instances of bowel resection labels had a clear bowel resection performed in the case. False positives, however, occurred when previous bowel resections were mentioned in the operative report. This was especially true for take-back surgeries when the abdomen is left open because of the need for further surgery. For the ostomy concepts, the most common error was secondary to an ileostomy or colostomy take down (as opposed to creation) or a situation in which the bowel was left in discontinuity with discussion in the operative report of placing an ostomy later. For anastomosis, errors were often likely due to the presence of a stapled resection or the use of the stapler to create a common channel. For closure, fascial closure errors occurred in several cases where a thoracotomy was performed in the same operation as a laparotomy, resulting in the closure of one anatomic fascia and not another. Skin closure failures appeared to be confounded when multiple services operated on the same patient. Partial skin closures were underrepresented in the dataset and the model tended to predict partial closure on both full-closure and open skin with equal affinity.</p><p>For the decoder-only model, we had Llama provide explanations for its choice and the explanations along with the findings drove changes in prompting strategies. 
Performance on bleeding was overall excellent, however, &#x201C;oozing&#x201D; from an organ bed or resection was often assigned as &#x201C;active bleed,&#x201D; which our annotators and prompts were instructed to mark as negative. For intraoperative techniques, there were commonalities in errors with the Clinical-Longformer model, with prior bowel resections, ostomy takedowns, and instances where both stapled and hand-sewn anastomoses were performed in the same operation. Fascial closures were obscured by the presence of interrupted retention sutures. Several open skin closures were marked as both open and partial skin closures. For skin closure with Prevena and with the exception of some runs of Llama 3.3, the model appeared to simply not understand the instructions despite multiple prompting attempts.</p><p>Contamination was difficult to assess for both annotators and models and this information is not always clearly stated in operative reports. Identifying breaches in sterile techniques, purulent versus nonpurulent inflammation, and whether entry into a hollow organ resulted in spillage requires careful description. The generative model often assumed any entry to the abdominal cavity made for Class II or above, despite modifications to prompting techniques. Future studies will extract the attending surgeon attestation for ground truth labels of wound class, which may improve model performance.</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Results</title><p>Generative LLMs outperformed fine-tuned encoder-only LLMs and traditional NLP models in a multilabel classification task across the majority of labels. Overall <italic>F</italic><sub>1</sub>-scores ranged from 0.57 for tf-idf to 0.88 for Llama 3.3. 
On individual labels, we had <italic>F</italic><sub>1</sub>-scores of &#x2265;0.8 for multiple classes.</p><p>Retrospective analyses drive decision support, quality improvement initiatives, and billing workflows, yet they are limited not only by the intensive manual review process but also by the variable interrater reliability with human labeling [<xref ref-type="bibr" rid="ref35">35</xref>,<xref ref-type="bibr" rid="ref36">36</xref>]. To overcome these limitations, we frame operative concept identification as a multilabel document classification task and observe that the autoregressive Llama 3.3 model outperformed traditional NLP techniques, the Longformer encoder model, and previous versions of the Llama herd.</p><p>State-of-the-art clinical NLP tasks rely on transformer-based, foundational LLMs [<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref37">37</xref>,<xref ref-type="bibr" rid="ref38">38</xref>]. They have been used in the well-studied NLP tasks of medical questioning and answering [<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref39">39</xref>-<xref ref-type="bibr" rid="ref41">41</xref>], summarization [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref42">42</xref>], named-entity recognition [<xref ref-type="bibr" rid="ref30">30</xref>,<xref ref-type="bibr" rid="ref43">43</xref>-<xref ref-type="bibr" rid="ref46">46</xref>], and document classification [<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref47">47</xref>-<xref ref-type="bibr" rid="ref49">49</xref>]. Studies have largely focused on progress notes, histories and physicals, and discharge summaries, with an interest in the concepts of medications, diseases, and social determinants of health. 
There are fewer studies on operative notes and available research focuses on word embeddings for prediction tasks rather than individual entities [<xref ref-type="bibr" rid="ref50">50</xref>-<xref ref-type="bibr" rid="ref52">52</xref>]. Furthermore, even fewer works have been published using state-of-the-art transformers and foundational LLMs in surgery [<xref ref-type="bibr" rid="ref53">53</xref>]. This is to our detriment as surgeons, as LLMs are capable of zero-shot learning (the ability to perform tasks without prior examples) and, if performing reliably, may obviate the need for manual chart review in retrospective research [<xref ref-type="bibr" rid="ref38">38</xref>]. To our knowledge, this paper is the first to explore operative concepts using LLMs as a multilabel classification task in surgery.</p></sec><sec id="s4-2"><title>Comparison With Prior Work</title><p>Compared with other document classification tasks, our model compared well. A previous multilabel documentation task on chest x-ray reports showed that pretrained models had <italic>F</italic><sub>1</sub>-score ranges of 0.29 to 0.48 [<xref ref-type="bibr" rid="ref23">23</xref>]. Notably, traditional BoW-based approaches performed well across many classes. This is not surprising, as depending on the concept, the presence of a word or phrase in operative notes is often sufficient to identify it in text. tf-idf likely underperformed compared to BoW due to the dataset size: limited term frequencies and few documents may favor equal representations of words compared with weighted representations [<xref ref-type="bibr" rid="ref54">54</xref>]. For many tasks, context may simply not be important. For example, negation is less commonly used, as surgeons will typically describe what they did rather than explain what they did not. 
In terms of the <italic>F</italic><sub>1</sub>-score, the generative model offered the most benefit in identifying active bleeding, bowel resection, serosal tear repair, and closure techniques, which are highly context-dependent and rely on the integration of up to several sentences of information. Notably, Clinical-Longformer did not offer much benefit over the BoW model. This may be secondary to the fact that Medical Information Mart for Intensive Care-III does not contain comprehensive operative notes [<xref ref-type="bibr" rid="ref15">15</xref>].</p></sec><sec id="s4-3"><title>Limitations</title><p>This study has several limitations. First, exploratory laparotomies represent a difficult case for both human and machine understanding. These operations are, by definition, exploratory, often performed in an emergent setting, can require input from multiple surgical services, and present challenging traumatic and aberrant anatomy. Thus, the language may be less consistent than in elective procedures. Nevertheless, we chose basic operative concepts and a common procedure to start our investigation into multilabel document classification. Second, understanding operative reports requires highly technical knowledge. Training annotators, including those with clinical experience, presents challenges, and, despite regular review, there may be instances of inaccurate labeling. To maximize the number of notes, we did not perform a second round of interrater reliability testing, though each note was reviewed by the lead author. As with many other studies, this points to the potential for variability in human annotation, and granting consistency of model outputs may show the potential advantages of LLM augmentation for this task. Third, we acknowledge that the 5-fold training and testing mechanism may result in overly optimistic performance in the BoW, tf-idf, and Clinical-Longformer models. 
However, despite this, the untrained Llama model still outperformed the three other classifiers. Fourth, during prompt tuning, we evaluated a random limited subset of the data during exploration, raising the possibility of data leakage. However, we did not examine performance metrics during prompt tuning and focused on model reasoning rather than the label choice itself. Fifth, the distribution of predictions varied by label in BoW, tf-idf, and Clinical-Longformer, though many were left-skewed, suggesting low confidence. More data may improve the performance of these models.</p></sec><sec id="s4-4"><title>Conclusions</title><p>Given the performance of the off-the-shelf generative model, future studies will incorporate multiple labeled datasets from previous and ongoing retrospective studies at our institution with the goal of human-in-the-loop, streamlined extraction of operative concepts integrated into the research workflow. Future work in agentic retrieval augmented generation with hybrid approaches of keyword search and semantic matching may fit this purpose well [<xref ref-type="bibr" rid="ref55">55</xref>,<xref ref-type="bibr" rid="ref56">56</xref>]. We noted improvements in model performance using larger Llama models, a trend we expect to continue as more advanced models are released.</p><p>While the use of multilabel document classification may be used to reliably capture select operative concepts with LLMs, further investigation of edge cases and alternative model architectures, such as retrieval augmented generation, will be required prior to deployment for research and quality improvement purposes.</p></sec></sec></body><back><ack><p>Funding for JAB was provided by the National Institute of General Medicine (T32 GM008721-24). Funding for TJL was provided by the National Institute of General Medicine (R01 GM149657). 
Funding for BS and A Bihorac was provided by the National Institute of General Medicine (R01 GM110240).</p></ack><notes><sec><title>Data Availability</title><p>The datasets generated or analyzed during this study are not publicly available because they contain protected health information that could be used for the reidentification of participants. Data are available from the University of Florida Institutional Data Access or Ethics Committee for researchers who meet the criteria for access to confidential data and may require additional institutional review board approval.</p></sec></notes><fn-group><fn fn-type="con"><p>JAB, SSD, VJN, DV, BS, PR, and TJL contributed to the conceptualization and methodology of the study, including approaches to annotation, data splitting, and modeling. Annotations were conducted by JAB, SSD, VJN, DV, TRB, LMB, YP, A Bilgili, AP, CEC, DMV, and RU. JAB and SSD were responsible for data visualization. Funding for the project was acquired by BS, TJL, A Bihorac, and PE. Project administration was led by JAB. Supervision was provided by BS, PR, TJL, A Bihorac, and PE. 
The original draft of the manuscript was written by JAB, SSD, VJN, DV, BS, PR, and TJL, with all authors contributing to the review and editing of the final manuscript.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">BERT</term><def><p>Bidirectional Encoder Representations from Transformers</p></def></def-item><def-item><term id="abb2">BoW</term><def><p>bag-of-words</p></def></def-item><def-item><term id="abb3">HL</term><def><p>hamming loss</p></def></def-item><def-item><term id="abb4">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb5">NLP</term><def><p>natural language processing</p></def></def-item><def-item><term id="abb6">PPV</term><def><p>positive predictive value</p></def></def-item><def-item><term id="abb7">tf-idf</term><def><p>term frequency-inverse document frequency</p></def></def-item><def-item><term id="abb8">TRIPOD+LLM</term><def><p>Transparent Reporting of a Multivariable Prediction Model for Individual Prognosis or Diagnosis+Large Language Model</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Melton</surname><given-names>GB</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Arsoniadis</surname><given-names>E</given-names> </name><etal/></person-group><article-title>Analyzing operative note structure in development of a section header resource</article-title><source>Stud Health Technol Inform</source><year>2015</year><volume>216</volume><issue>821-6</issue><fpage>821</fpage><lpage>826</lpage><pub-id pub-id-type="medline">26262166</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="confproc"><person-group 
person-group-type="author"><name name-style="western"><surname>Maldonado</surname><given-names>R</given-names> </name><name name-style="western"><surname>Goodwin</surname><given-names>T</given-names> </name><name name-style="western"><surname>Harabagiu</surname><given-names>SM</given-names> </name><name name-style="western"><surname>Skinner</surname><given-names>MA</given-names> </name></person-group><article-title>The role of semantic and discourse information in learning the structure of surgical procedures</article-title><conf-name>2015 International Conference on Healthcare Informatics</conf-name><conf-date>Oct 21-23, 2015</conf-date><conf-loc>Dallas, TX, USA</conf-loc><pub-id pub-id-type="doi">10.1109/ICHI.2015.34</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zheng</surname><given-names>YX</given-names> </name><name name-style="western"><surname>Gan</surname><given-names>WS</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>ZF</given-names> </name><name name-style="western"><surname>Qi</surname><given-names>ZL</given-names> </name><name name-style="western"><surname>Liang</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>PS</given-names> </name></person-group><article-title>Large language models for medicine: a survey</article-title><source>Int J Mach Learn Cyber</source><year>2025</year><month>02</month><volume>16</volume><issue>2</issue><fpage>1015</fpage><lpage>1040</lpage><pub-id pub-id-type="doi">10.1007/s13042-024-02318-w</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Page</surname><given-names>MJ</given-names> </name><name name-style="western"><surname>McKenzie</surname><given-names>JE</given-names> </name><name 
name-style="western"><surname>Bossuyt</surname><given-names>PM</given-names> </name><etal/></person-group><article-title>The PRISMA 2020 statement: an updated guideline for reporting systematic reviews</article-title><source>BMJ</source><year>2021</year><month>03</month><day>29</day><volume>372</volume><fpage>n71</fpage><pub-id pub-id-type="doi">10.1136/bmj.n71</pub-id><pub-id pub-id-type="medline">33782057</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Siems</surname><given-names>A</given-names> </name><name name-style="western"><surname>Banks</surname><given-names>R</given-names> </name><name name-style="western"><surname>Holubkov</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Structured chart review: assessment of a structured chart review methodology</article-title><source>Hosp Pediatr</source><year>2020</year><month>01</month><volume>10</volume><issue>1</issue><fpage>61</fpage><lpage>69</lpage><pub-id pub-id-type="doi">10.1542/hpeds.2019-0225</pub-id><pub-id pub-id-type="medline">31879317</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Palanivinayagam</surname><given-names>A</given-names> </name><name name-style="western"><surname>El-Bayeh</surname><given-names>CZ</given-names> </name><name name-style="western"><surname>Dama&#x0161;evi&#x010D;ius</surname><given-names>R</given-names> </name></person-group><article-title>Twenty years of machine-learning-based text classification: a systematic review</article-title><source>Algorithms</source><year>2023</year><volume>16</volume><issue>5</issue><fpage>236</fpage><pub-id pub-id-type="doi">10.3390/a16050236</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Fields</surname><given-names>J</given-names> </name><name name-style="western"><surname>Chovanec</surname><given-names>K</given-names> </name><name name-style="western"><surname>Madiraju</surname><given-names>P</given-names> </name></person-group><article-title>A survey of text classification with transformers: how wide? How large? How long? How accurate? How expensive? How safe?</article-title><source>IEEE Access</source><year>2024</year><volume>12</volume><fpage>6518</fpage><lpage>6531</lpage><pub-id pub-id-type="doi">10.1109/ACCESS.2024.3349952</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Blanco</surname><given-names>A</given-names> </name><name name-style="western"><surname>Remmer</surname><given-names>S</given-names> </name><name name-style="western"><surname>P&#x00E9;rez</surname><given-names>A</given-names> </name><name name-style="western"><surname>Dalianis</surname><given-names>H</given-names> </name><name name-style="western"><surname>Casillas</surname><given-names>A</given-names> </name></person-group><article-title>Implementation of specialised attention mechanisms: ICD-10 classification of gastrointestinal discharge summaries in English, Spanish and Swedish</article-title><source>J Biomed Inform</source><year>2022</year><month>06</month><volume>130</volume><fpage>104050</fpage><pub-id pub-id-type="doi">10.1016/j.jbi.2022.104050</pub-id><pub-id pub-id-type="medline">35346854</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chaichulee</surname><given-names>S</given-names> </name><name name-style="western"><surname>Promchai</surname><given-names>C</given-names> </name><name 
name-style="western"><surname>Kaewkomon</surname><given-names>T</given-names> </name><name name-style="western"><surname>Kongkamol</surname><given-names>C</given-names> </name><name name-style="western"><surname>Ingviya</surname><given-names>T</given-names> </name><name name-style="western"><surname>Sangsupawanich</surname><given-names>P</given-names> </name></person-group><article-title>Multi-label classification of symptom terms from free-text bilingual adverse drug reaction reports using natural language processing</article-title><source>PLoS One</source><year>2022</year><volume>17</volume><issue>8</issue><fpage>e0270595</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0270595</pub-id><pub-id pub-id-type="medline">35925971</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Choi</surname><given-names>J</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Sivura</surname><given-names>A</given-names> </name><name name-style="western"><surname>Vendrow</surname><given-names>EB</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Spain</surname><given-names>DA</given-names> </name></person-group><article-title>TraumaICD bidirectional encoder representation from transformers: a natural language processing algorithm to extract injury International Classification of Diseases, 10th edition diagnosis code from free text</article-title><source>Ann Surg</source><year>2024</year><month>07</month><day>1</day><volume>280</volume><issue>1</issue><fpage>150</fpage><lpage>155</lpage><pub-id pub-id-type="doi">10.1097/SLA.0000000000006107</pub-id><pub-id pub-id-type="medline">37753654</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation 
citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Pedersen</surname><given-names>JS</given-names> </name><name name-style="western"><surname>Laursen</surname><given-names>MS</given-names> </name><name name-style="western"><surname>Soguero-Ruiz</surname><given-names>C</given-names> </name><name name-style="western"><surname>Savarimuthu</surname><given-names>TR</given-names> </name><name name-style="western"><surname>Hansen</surname><given-names>RS</given-names> </name><name name-style="western"><surname>Vinholt</surname><given-names>PJ</given-names> </name></person-group><article-title>Domain over size: clinical ELECTRA surpasses general BERT for bleeding site classification in the free text of electronic health records</article-title><conf-name>2022 IEEE-EMBS International Conference on Biomedical and Health Informatics (BHI)</conf-name><conf-date>Sep 27-30, 2022</conf-date><conf-loc>Ioannina, Greece</conf-loc><pub-id pub-id-type="doi">10.1109/BHI56158.2022.9926955</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>De Santis</surname><given-names>E</given-names> </name><name name-style="western"><surname>Martino</surname><given-names>A</given-names> </name><name name-style="western"><surname>Ronci</surname><given-names>F</given-names> </name><name name-style="western"><surname>Rizzi</surname><given-names>A</given-names> </name></person-group><article-title>From bag-of-words to transformers: a comparative study for text classification in healthcare discussions in social media</article-title><source>IEEE Trans Emerg Top Comput Intell</source><year>2025</year><volume>9</volume><issue>1</issue><fpage>1063</fpage><lpage>1077</lpage><pub-id pub-id-type="doi">10.1109/TETCI.2024.3423444</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Bedi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Orr-Ewing</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Testing and evaluation of health care applications of large language models: a systematic review</article-title><source>JAMA</source><year>2025</year><month>01</month><day>28</day><volume>333</volume><issue>4</issue><fpage>319</fpage><lpage>328</lpage><pub-id pub-id-type="doi">10.1001/jama.2024.21700</pub-id><pub-id pub-id-type="medline">39405325</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wornow</surname><given-names>M</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Thapa</surname><given-names>R</given-names> </name><etal/></person-group><article-title>The shaky foundations of large language models and foundation models for electronic health records</article-title><source>NPJ Digital Med</source><year>2023</year><month>07</month><day>29</day><volume>6</volume><issue>1</issue><fpage>135</fpage><pub-id pub-id-type="doi">10.1038/s41746-023-00879-8</pub-id><pub-id pub-id-type="medline">37516790</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Johnson</surname><given-names>AEW</given-names> </name><name name-style="western"><surname>Pollard</surname><given-names>TJ</given-names> </name><name name-style="western"><surname>Shen</surname><given-names>L</given-names> </name><etal/></person-group><article-title>MIMIC-III, a freely accessible critical care database</article-title><source>Sci 
Data</source><year>2016</year><month>05</month><day>24</day><volume>3</volume><issue>1</issue><fpage>160035</fpage><pub-id pub-id-type="doi">10.1038/sdata.2016.35</pub-id><pub-id pub-id-type="medline">27219127</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Agrawal</surname><given-names>M</given-names> </name><name name-style="western"><surname>Hegselmann</surname><given-names>S</given-names> </name><name name-style="western"><surname>Lang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Sontag</surname><given-names>D</given-names> </name></person-group><article-title>Large language models are few-shot clinical information extractors</article-title><source>arXiv</source><comment>Preprint posted online on  May 25, 2022</comment><pub-id pub-id-type="doi">10.48550/arXiv.2205.12689</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Collins</surname><given-names>GS</given-names> </name><name name-style="western"><surname>Dhiman</surname><given-names>P</given-names> </name><name name-style="western"><surname>Andaur Navarro</surname><given-names>CL</given-names> </name><etal/></person-group><article-title>Protocol for development of a reporting guideline (TRIPOD-AI) and risk of bias tool (PROBAST-AI) for diagnostic and prognostic prediction model studies based on artificial intelligence</article-title><source>BMJ Open</source><year>2021</year><month>07</month><day>9</day><volume>11</volume><issue>7</issue><fpage>e048008</fpage><pub-id pub-id-type="doi">10.1136/bmjopen-2020-048008</pub-id><pub-id pub-id-type="medline">34244270</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>McHugh</surname><given-names>ML</given-names> </name></person-group><article-title>Interrater reliability: the kappa statistic</article-title><source>Biochem Med (Zagreb)</source><year>2012</year><volume>22</volume><issue>3</issue><fpage>276</fpage><lpage>282</lpage><pub-id pub-id-type="medline">23092060</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Dimick</surname><given-names>JB</given-names> </name><name name-style="western"><surname>Upchurch</surname><given-names>GR</given-names> </name><name name-style="western"><surname>Alam</surname><given-names>HB</given-names> </name><etal/></person-group><source>Greenfield&#x2019;s Surgery: Scientific Principles and Practice</source><year>2021</year><publisher-name>Lippincott Williams &#x0026; Wilkins</publisher-name><pub-id pub-id-type="other">1975143183</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Sechidis</surname><given-names>K</given-names> </name><name name-style="western"><surname>Tsoumakas</surname><given-names>G</given-names> </name><name name-style="western"><surname>Vlahavas</surname><given-names>I</given-names> </name></person-group><article-title>On the stratification of multi-label data</article-title><conf-name>Machine Learning and Knowledge Discovery in Databases: European Conference, ECML PKDD 2011</conf-name><conf-date>Sep 5-9, 2011</conf-date><conf-loc>Athens, Greece</conf-loc></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Szyma&#x0144;ski</surname><given-names>P</given-names> </name><name 
name-style="western"><surname>Kajdanowicz</surname><given-names>T</given-names> </name></person-group><article-title>A network perspective on stratification of multi-label data</article-title><conf-name>First International Workshop on Learning with Imbalanced Domains: Theory and Applications</conf-name><conf-date>Sep 22, 2017</conf-date><conf-loc>Skopje, Macedonia</conf-loc></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Szyma&#x0144;ski</surname><given-names>P</given-names> </name><name name-style="western"><surname>Kajdanowicz</surname><given-names>T</given-names> </name></person-group><article-title>A scikit-based python environment for performing multi-label classification</article-title><source>arXiv</source><comment>Preprint posted online on  Feb 5, 2017</comment><pub-id pub-id-type="doi">10.48550/arXiv.1702.01460</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wehbe</surname><given-names>RM</given-names> </name><name name-style="western"><surname>Ahmad</surname><given-names>FS</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Luo</surname><given-names>Y</given-names> </name></person-group><article-title>A comparative study of pretrained language models for long clinical text</article-title><source>J Am Med Inform Assoc</source><year>2023</year><month>01</month><day>18</day><volume>30</volume><issue>2</issue><fpage>340</fpage><lpage>347</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocac225</pub-id><pub-id pub-id-type="medline">36451266</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation 
citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Dubey</surname><given-names>A</given-names> </name><name name-style="western"><surname>Jauhri</surname><given-names>A</given-names> </name><name name-style="western"><surname>Pandey</surname><given-names>A</given-names> </name><etal/></person-group><article-title>The llama 3 herd of models</article-title><source>arXiv</source><comment>Preprint posted online on  Jul 31, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2407.21783</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Singhal</surname><given-names>K</given-names> </name><name name-style="western"><surname>Azizi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Tu</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Large language models encode clinical knowledge</article-title><source>Nat New Biol</source><year>2023</year><month>08</month><volume>620</volume><issue>7972</issue><fpage>172</fpage><lpage>180</lpage><pub-id pub-id-type="doi">10.1038/s41586-023-06291-2</pub-id><pub-id pub-id-type="medline">37438534</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gao</surname><given-names>S</given-names> </name><name name-style="western"><surname>Alawad</surname><given-names>M</given-names> </name><name name-style="western"><surname>Young</surname><given-names>MT</given-names> </name><etal/></person-group><article-title>Limitations of transformers on clinical text classification</article-title><source>IEEE J Biomed Health Inform</source><year>2021</year><month>09</month><volume>25</volume><issue>9</issue><fpage>3596</fpage><lpage>3607</lpage><pub-id 
pub-id-type="doi">10.1109/JBHI.2021.3062322</pub-id><pub-id pub-id-type="medline">33635801</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Beltagy</surname><given-names>I</given-names> </name><name name-style="western"><surname>Peters</surname><given-names>ME</given-names> </name><name name-style="western"><surname>Cohan</surname><given-names>A</given-names> </name></person-group><article-title>Longformer: the long-document transformer</article-title><source>arXiv</source><comment>Preprint posted online on  Apr 10, 2020</comment><pub-id pub-id-type="doi">10.48550/arXiv.2004.05150</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Devlin</surname><given-names>J</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>MW</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>K</given-names> </name><name name-style="western"><surname>Toutanova</surname><given-names>K</given-names> </name></person-group><article-title>BERT: pre-training of deep bidirectional transformers for language understanding</article-title><conf-name>2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies</conf-name><conf-date>Jun 2-7, 2019</conf-date><conf-loc>Minneapolis, Minnesota, United States</conf-loc></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Huang</surname><given-names>K</given-names> </name><name name-style="western"><surname>Altosaar</surname><given-names>J</given-names> </name><name name-style="western"><surname>Ranganath</surname><given-names>R</given-names> 
</name></person-group><article-title>ClinicalBERT: modeling clinical notes and predicting hospital readmission</article-title><source>arXiv</source><comment>Preprint posted online on  Apr 10, 2019</comment><pub-id pub-id-type="doi">10.48550/arXiv.1904.05342</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>J</given-names> </name><name name-style="western"><surname>Yoon</surname><given-names>W</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>S</given-names> </name><etal/></person-group><article-title>BioBERT: a pre-trained biomedical language representation model for biomedical text mining</article-title><source>Bioinformatics</source><year>2020</year><month>02</month><day>15</day><volume>36</volume><issue>4</issue><fpage>1234</fpage><lpage>1240</lpage><pub-id pub-id-type="doi">10.1093/bioinformatics/btz682</pub-id><pub-id pub-id-type="medline">31501885</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Wolf</surname><given-names>T</given-names> </name><name name-style="western"><surname>Debut</surname><given-names>L</given-names> </name><name name-style="western"><surname>Sanh</surname><given-names>V</given-names> </name><name name-style="western"><surname>Chaumond</surname><given-names>J</given-names> </name><name name-style="western"><surname>Delangue</surname><given-names>C</given-names> </name><name name-style="western"><surname>Moi</surname><given-names>A</given-names> </name><etal/></person-group><article-title>HuggingFace&#x2019;s transformers: state-of-the-art natural language processing</article-title><source>arXiv</source><comment>Preprint posted online on  Oct 9, 2019</comment><pub-id 
pub-id-type="doi">10.48550/arXiv.1910.03771</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>J</given-names> </name></person-group><article-title>LlamaIndex</article-title><source>GitHub</source><year>2022</year><access-date>2024-09-15</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/jerryjliu/llama_index">https://github.com/jerryjliu/llama_index</ext-link></comment></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Balch</surname><given-names>JA</given-names> </name><name name-style="western"><surname>Desaraju</surname><given-names>S</given-names> </name></person-group><article-title>Multi-label classification for operative notes</article-title><source>GitHub</source><year>2024</year><access-date>2024-09-15</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/balchja/op_note_multi_label_classification">https://github.com/balchja/op_note_multi_label_classification</ext-link></comment></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Pal</surname><given-names>A</given-names> </name></person-group><article-title>Performance comparison: llama-3 models in medical and healthcare AI domains</article-title><source>Hugging Face Repository</source><year>2024</year><access-date>2024-09-15</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://huggingface.co/blog/aaditya/llama3-in-medical-domain">https://huggingface.co/blog/aaditya/llama3-in-medical-domain</ext-link></comment></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Hanskamp-Sebregts</surname><given-names>M</given-names> </name><name name-style="western"><surname>Zegers</surname><given-names>M</given-names> </name><name name-style="western"><surname>Vincent</surname><given-names>C</given-names> </name><name name-style="western"><surname>van Gurp</surname><given-names>PJ</given-names> </name><name name-style="western"><surname>de Vet</surname><given-names>HCW</given-names> </name><name name-style="western"><surname>Wollersheim</surname><given-names>H</given-names> </name></person-group><article-title>Measurement of patient safety: a systematic review of the reliability and validity of adverse event detection with record review</article-title><source>BMJ Open</source><year>2016</year><month>08</month><day>22</day><volume>6</volume><issue>8</issue><fpage>e011078</fpage><pub-id pub-id-type="doi">10.1136/bmjopen-2016-011078</pub-id><pub-id pub-id-type="medline">27550650</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lilford</surname><given-names>R</given-names> </name><name name-style="western"><surname>Edwards</surname><given-names>A</given-names> </name><name name-style="western"><surname>Girling</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Inter-rater reliability of case-note audit: a systematic review</article-title><source>J Health Serv Res Policy</source><year>2007</year><month>07</month><volume>12</volume><issue>3</issue><fpage>173</fpage><lpage>180</lpage><pub-id pub-id-type="doi">10.1258/135581907781543012</pub-id><pub-id pub-id-type="medline">17716421</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Clusmann</surname><given-names>J</given-names> </name><name 
name-style="western"><surname>Kolbinger</surname><given-names>FR</given-names> </name><name name-style="western"><surname>Muti</surname><given-names>HS</given-names> </name><etal/></person-group><article-title>The future landscape of large language models in medicine</article-title><source>Commun Med (Lond)</source><year>2023</year><month>10</month><day>10</day><volume>3</volume><issue>1</issue><fpage>141</fpage><pub-id pub-id-type="doi">10.1038/s43856-023-00370-1</pub-id><pub-id pub-id-type="medline">37816837</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Thirunavukarasu</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Ting</surname><given-names>DSJ</given-names> </name><name name-style="western"><surname>Elangovan</surname><given-names>K</given-names> </name><name name-style="western"><surname>Gutierrez</surname><given-names>L</given-names> </name><name name-style="western"><surname>Tan</surname><given-names>TF</given-names> </name><name name-style="western"><surname>Ting</surname><given-names>DSW</given-names> </name></person-group><article-title>Large language models in medicine</article-title><source>Nat Med</source><year>2023</year><month>08</month><volume>29</volume><issue>8</issue><fpage>1930</fpage><lpage>1940</lpage><pub-id pub-id-type="doi">10.1038/s41591-023-02448-8</pub-id><pub-id pub-id-type="medline">37460753</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="confproc"><person-group person-group-type="editor"><name name-style="western"><surname>Soni</surname><given-names>S</given-names> </name><name name-style="western"><surname>Roberts</surname><given-names>K</given-names> </name></person-group><article-title>Evaluation of dataset selection for pre-training and fine-tuning transformer language models for clinical question 
answering</article-title><conf-name>Proceedings of the Twelfth Language Resources and Evaluation Conference</conf-name><conf-date>May 11-16, 2020</conf-date><conf-loc>Marseille, France</conf-loc></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ng</surname><given-names>KKY</given-names> </name><name name-style="western"><surname>Matsuba</surname><given-names>I</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>PC</given-names> </name></person-group><article-title>RAG in health care: a novel framework for improving communication and decision-making by addressing LLM limitations</article-title><source>NEJM AI</source><year>2025</year><month>01</month><volume>2</volume><issue>1</issue><fpage>AIra2400380</fpage><pub-id pub-id-type="doi">10.1056/AIra2400380</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lucas</surname><given-names>MM</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Pomeroy</surname><given-names>JK</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>CC</given-names> </name></person-group><article-title>Reasoning with large language models for medical question answering</article-title><source>J Am Med Inform Assoc</source><year>2024</year><month>09</month><day>1</day><volume>31</volume><issue>9</issue><fpage>1964</fpage><lpage>1975</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocae131</pub-id><pub-id pub-id-type="medline">38960731</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Van Veen</surname><given-names>D</given-names> 
</name><name name-style="western"><surname>Van Uden</surname><given-names>C</given-names> </name><name name-style="western"><surname>Blankemeier</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Clinical text summarization: adapting large language models can outperform human experts</article-title><source>Res Sq</source><year>2023</year><month>10</month><day>30</day><fpage>rs.3.rs-3483777</fpage><pub-id pub-id-type="doi">10.21203/rs.3.rs-3483777/v1</pub-id><pub-id pub-id-type="medline">37961377</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bose</surname><given-names>P</given-names> </name><name name-style="western"><surname>Srinivasan</surname><given-names>S</given-names> </name><name name-style="western"><surname>Sleeman</surname><given-names>WC</given-names> </name><name name-style="western"><surname>Palta</surname><given-names>J</given-names> </name><name name-style="western"><surname>Kapoor</surname><given-names>R</given-names> </name><name name-style="western"><surname>Ghosh</surname><given-names>P</given-names> </name></person-group><article-title>A survey on recent named entity recognition and relationship extraction techniques on clinical texts</article-title><source>Appl Sci (Basel)</source><year>2021</year><month>09</month><volume>11</volume><issue>18</issue><fpage>8319</fpage><pub-id pub-id-type="doi">10.3390/app11188319</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="book"><person-group person-group-type="editor"><name name-style="western"><surname>Pagad</surname><given-names>NS</given-names> </name><name name-style="western"><surname>Pradeep</surname><given-names>N</given-names> </name></person-group><article-title>Clinical named entity recognition methods: an overview</article-title><source>International Conference on Innovative Computing and 
Communications: Proceedings of ICICC</source><year>2022</year><volume>2</volume><publisher-name>Springer</publisher-name><pub-id pub-id-type="doi">10.1007/978-981-16-2597-8_13</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Jiang</surname><given-names>M</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Zhi</surname><given-names>D</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>H</given-names> </name></person-group><article-title>Clinical named entity recognition using deep learning models</article-title><source>AMIA Annu Symp Proc</source><year>2017</year><volume>2017</volume><fpage>1812</fpage><lpage>1819</lpage><pub-id pub-id-type="medline">29854252</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>M</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>H</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>R</given-names> </name></person-group><article-title>RT: a retrieving and chain-of-thought framework for few-shot medical named entity recognition</article-title><source>J Am Med Inform Assoc</source><year>2024</year><month>09</month><day>1</day><volume>31</volume><issue>9</issue><fpage>1929</fpage><lpage>1938</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocae095</pub-id><pub-id pub-id-type="medline">38708849</pub-id></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Johnson</surname><given-names>AEW</given-names> </name><name name-style="western"><surname>Pollard</surname><given-names>TJ</given-names> </name><name name-style="western"><surname>Berkowitz</surname><given-names>SJ</given-names> </name><etal/></person-group><article-title>MIMIC-CXR, a de-identified publicly available database of chest radiographs with free-text reports</article-title><source>Sci Data</source><year>2019</year><month>12</month><day>12</day><volume>6</volume><issue>1</issue><fpage>317</fpage><pub-id pub-id-type="doi">10.1038/s41597-019-0322-0</pub-id><pub-id pub-id-type="medline">31831740</pub-id></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Yao</surname><given-names>L</given-names> </name><name name-style="western"><surname>Mao</surname><given-names>C</given-names> </name><name name-style="western"><surname>Srivastava</surname><given-names>A</given-names> </name><name name-style="western"><surname>Jiang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Luo</surname><given-names>Y</given-names> </name></person-group><article-title>Early prediction of acute kidney injury in critical care setting using clinical notes</article-title><source>Proc IEEE Int Conf Bioinformatics Biomed</source><year>2018</year><pub-id pub-id-type="doi">10.1109/BIBM.2018.8621574</pub-id></nlm-citation></ref><ref id="ref49"><label>49</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Guo</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Ovadje</surname><given-names>A</given-names> </name><name name-style="western"><surname>Al-Garadi</surname><given-names>MA</given-names> 
</name><name name-style="western"><surname>Sarker</surname><given-names>A</given-names> </name></person-group><article-title>Evaluating large language models for health-related text classification tasks with public social media data</article-title><source>J Am Med Inform Assoc</source><year>2024</year><month>10</month><day>1</day><volume>31</volume><issue>10</issue><fpage>2181</fpage><lpage>2189</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocae210</pub-id><pub-id pub-id-type="medline">39121174</pub-id></nlm-citation></ref><ref id="ref50"><label>50</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mellia</surname><given-names>JA</given-names> </name><name name-style="western"><surname>Basta</surname><given-names>MN</given-names> </name><name name-style="western"><surname>Toyoda</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Natural language processing in surgery: a systematic review and meta-analysis</article-title><source>Ann Surg</source><year>2021</year><month>05</month><day>1</day><volume>273</volume><issue>5</issue><fpage>900</fpage><lpage>908</lpage><pub-id pub-id-type="doi">10.1097/SLA.0000000000004419</pub-id><pub-id pub-id-type="medline">33074901</pub-id></nlm-citation></ref><ref id="ref51"><label>51</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Selby</surname><given-names>LV</given-names> </name><name name-style="western"><surname>Narain</surname><given-names>WR</given-names> </name><name name-style="western"><surname>Russo</surname><given-names>A</given-names> </name><name name-style="western"><surname>Strong</surname><given-names>VE</given-names> </name><name name-style="western"><surname>Stetson</surname><given-names>P</given-names> </name></person-group><article-title>Autonomous detection, grading, and reporting of postoperative complications using natural language 
processing</article-title><source>Surgery</source><year>2018</year><month>12</month><volume>164</volume><issue>6</issue><fpage>1300</fpage><lpage>1305</lpage><pub-id pub-id-type="doi">10.1016/j.surg.2018.05.008</pub-id><pub-id pub-id-type="medline">30056994</pub-id></nlm-citation></ref><ref id="ref52"><label>52</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tibbo</surname><given-names>ME</given-names> </name><name name-style="western"><surname>Wyles</surname><given-names>CC</given-names> </name><name name-style="western"><surname>Fu</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Use of natural language processing tools to identify and classify periprosthetic femur fractures</article-title><source>J Arthroplasty</source><year>2019</year><month>10</month><volume>34</volume><issue>10</issue><fpage>2216</fpage><lpage>2219</lpage><pub-id pub-id-type="doi">10.1016/j.arth.2019.07.025</pub-id><pub-id pub-id-type="medline">31416741</pub-id></nlm-citation></ref><ref id="ref53"><label>53</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bombieri</surname><given-names>M</given-names> </name><name name-style="western"><surname>Rospocher</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ponzetto</surname><given-names>SP</given-names> </name><name name-style="western"><surname>Fiorini</surname><given-names>P</given-names> </name></person-group><article-title>Surgicberta: a pre-trained language model for procedural surgical language</article-title><source>Int J Data Sci Anal</source><year>2024</year><month>06</month><volume>18</volume><issue>1</issue><fpage>69</fpage><lpage>81</lpage><pub-id pub-id-type="doi">10.1007/s41060-023-00433-5</pub-id></nlm-citation></ref><ref id="ref54"><label>54</label><nlm-citation citation-type="web"><person-group 
person-group-type="author"><name name-style="western"><surname>Jurafsky</surname><given-names>D</given-names> </name><name name-style="western"><surname>Martin</surname><given-names>JH</given-names> </name></person-group><article-title>Speech and language processing: an introduction to natural language processing, computational linguistics, and speech recognition with language models</article-title><source>Stanford University</source><year>2024</year><access-date>2025-06-06</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://web.stanford.edu/~jurafsky/slp3/">https://web.stanford.edu/~jurafsky/slp3/</ext-link></comment></nlm-citation></ref><ref id="ref55"><label>55</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Alkhalaf</surname><given-names>M</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>P</given-names> </name><name name-style="western"><surname>Yin</surname><given-names>M</given-names> </name><name name-style="western"><surname>Deng</surname><given-names>C</given-names> </name></person-group><article-title>Applying generative AI with retrieval augmented generation to summarize and extract key clinical information from electronic health records</article-title><source>J Biomed Inform</source><year>2024</year><month>08</month><volume>156</volume><fpage>104662</fpage><pub-id pub-id-type="doi">10.1016/j.jbi.2024.104662</pub-id><pub-id pub-id-type="medline">38880236</pub-id></nlm-citation></ref><ref id="ref56"><label>56</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lewis</surname><given-names>P</given-names> </name><name name-style="western"><surname>Perez</surname><given-names>E</given-names> </name><name name-style="western"><surname>Piktus</surname><given-names>A</given-names> </name><name name-style="western"><surname>Petroni</surname><given-names>F</given-names> 
</name><name name-style="western"><surname>Karpukhin</surname><given-names>V</given-names> </name><name name-style="western"><surname>Goyal</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Retrieval-augmented generation for knowledge-intensive NLP tasks</article-title><source>Adv Neural Inf Process Syst</source><year>2020</year><access-date>2025-06-25</access-date><volume>33</volume><fpage>9459</fpage><lpage>9474</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://proceedings.neurips.cc/paper/2020/file/6b493230205f780e1bc26945df7481e5-Paper.pdf">https://proceedings.neurips.cc/paper/2020/file/6b493230205f780e1bc26945df7481e5-Paper.pdf</ext-link></comment></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Operative note annotation guideline.</p><media xlink:href="medinform_v13i1e71176_app1.docx" xlink:title="DOCX File, 24 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Sample operative note annotation.</p><media xlink:href="medinform_v13i1e71176_app2.docx" xlink:title="DOCX File, 1003 KB"/></supplementary-material><supplementary-material id="app3"><label>Multimedia Appendix 3</label><p>Supplemental tables.</p><media xlink:href="medinform_v13i1e71176_app3.docx" xlink:title="DOCX File, 41 KB"/></supplementary-material><supplementary-material id="app4"><label>Multimedia Appendix 4</label><p>Task prompts.</p><media xlink:href="medinform_v13i1e71176_app4.docx" xlink:title="DOCX File, 26 KB"/></supplementary-material><supplementary-material id="app5"><label>Multimedia Appendix 5</label><p>Context.</p><media xlink:href="medinform_v13i1e71176_app5.docx" xlink:title="DOCX File, 21 KB"/></supplementary-material><supplementary-material id="app6"><label>Multimedia Appendix 6</label><p>Supplemental figures.</p><media xlink:href="medinform_v13i1e71176_app6.docx" xlink:title="DOCX File, 469 
KB"/></supplementary-material><supplementary-material id="app7"><label>Checklist 1</label><p>TRIPOD+LLM checklist.</p><media xlink:href="medinform_v13i1e71176_app7.pdf" xlink:title="PDF File, 379 KB"/></supplementary-material></app-group></back></article>