<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Med Inform</journal-id><journal-id journal-id-type="publisher-id">medinform</journal-id><journal-id journal-id-type="index">7</journal-id><journal-title>JMIR Medical Informatics</journal-title><abbrev-journal-title>JMIR Med Inform</abbrev-journal-title><issn pub-type="epub">2291-9694</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v14i1e87133</article-id><article-id pub-id-type="doi">10.2196/87133</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Natural Language Processing for Automated Classification of Cleft and Craniofacial Procedures From Operative Notes: Model Development and Feasibility Study</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Cox</surname><given-names>Meredith</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Lin</surname><given-names>Elaine</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Oleck</surname><given-names>Nicholas</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Jones</surname><given-names>Carlee</given-names></name><degrees>CCC-SLP</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Li</surname><given-names>Neill Y</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Mithani</surname><given-names>Suhail K</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Allori</surname><given-names>Alexander C</given-names></name><degrees>MD, MPH</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Division of Plastic, Oral, and Maxillofacial Surgery, Duke University Hospital</institution><addr-line>2301 Erwin Road</addr-line><addr-line>Durham</addr-line><addr-line>NC</addr-line><country>United States</country></aff><aff id="aff2"><institution>Department of Orthopaedic Surgery, Duke University Hospital</institution><addr-line>Durham</addr-line><addr-line>NC</addr-line><country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Coristine</surname><given-names>Andrew</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Al-Agil</surname><given-names>Mohammad</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Lou</surname><given-names>Zhouyang</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Alexander C Allori, MD, MPH, Division of Plastic, Oral, and Maxillofacial Surgery, Duke University Hospital, 2301 Erwin Road, Durham, NC, 27710, United States, 1 919-668-3110; <email>alexander.allori@duke.edu</email></corresp></author-notes><pub-date 
pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>11</day><month>5</month><year>2026</year></pub-date><volume>14</volume><elocation-id>e87133</elocation-id><history><date date-type="received"><day>04</day><month>11</month><year>2025</year></date><date date-type="rev-recd"><day>11</day><month>04</month><year>2026</year></date><date date-type="accepted"><day>13</day><month>04</month><year>2026</year></date></history><copyright-statement>&#x00A9; Meredith Cox, Elaine Lin, Nicholas Oleck, Carlee Jones, Neill Y Li, Suhail K Mithani, Alexander C Allori. Originally published in JMIR Medical Informatics (<ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org">https://medinform.jmir.org</ext-link>), 11.5.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Medical Informatics, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://medinform.jmir.org/">https://medinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://medinform.jmir.org/2026/1/e87133"/><abstract><sec><title>Background</title><p>The accurate classification of operative notes is essential for surgical outcomes research; however, CPT code classification is notoriously nonspecific for many procedures. 
In such situations, the operative note (or &#x201C;dictation&#x201D;) must be reviewed manually, a process that is labor-intensive and unsustainable. Natural language processing demonstrates tremendous potential for improving the efficiency and accuracy of procedure classification from unstructured operative notes. To date, it remains unexplored whether natural language processing can reliably differentiate between complex, multicomponent procedures, such as those involved in the care of cleft lip or palate and craniofacial anomalies.</p></sec><sec><title>Objective</title><p>This study aims to develop and evaluate a machine learning framework for the automated classification of operative notes for cleft and craniofacial procedures.</p></sec><sec sec-type="methods"><title>Methods</title><p>This single-institution, retrospective observational study used operative notes from patients undergoing cleft and craniofacial procedures at a single academic medical center from 2016 to 2024. Each note in the database had been manually classified previously. Notes were preprocessed and vectorized using term frequency-inverse document frequency. A One-vs-Rest classification framework with random forest as the base classifier was developed to categorize procedures at 3 levels: primary procedure type (cleft lip repair, alveolar bone grafting, cleft palate repair, velopharyngeal insufficiency correction, rhinoplasty, and other), procedural subtype (primary vs revision), and specific surgical technique used (eg, Fisher, Mulliken, or rotation-advancement technique for cleft lip repair). Each hierarchical level was developed and evaluated using cross-validation. To improve procedural subtype classification for classes with few samples, synthetic notes were added to the dataset. 
Area under the receiver operating characteristic curve (AUC), area under the precision-recall curve, micro- and macro-averaged <italic>F</italic><sub>1</sub>-scores, and Hamming loss were used to assess model performance.</p></sec><sec sec-type="results"><title>Results</title><p>The dataset comprised 630 operative notes from 311 pediatric patients undergoing cleft and craniofacial procedures between 2016 and 2024, with a mean age of 3.75 (range 0&#x2010;19) years. The primary classification model achieved strong performance in distinguishing procedure types with an AUC of 0.93 (SD 0.04), area under the precision-recall curve of 0.84 (SD 0.05), micro-averaged <italic>F</italic><sub>1</sub>-score of 0.88 (SD 0.02), a macro-averaged <italic>F</italic><sub>1</sub>-score of 0.84 (SD 0.03), and a Hamming loss of 0.04 (SD 0.01). Secondary classifiers achieved AUC scores of 1.0 (SD 0.00) for cleft lip revision classification but failed to discriminate between alveolar bone grafting primary and revision procedures (AUC 0.49, SD 0.02). Tertiary classifiers for surgical technique identification showed AUC scores of 0.88 (SD 0.03), 0.89 (SD 0.03), and 0.89 (SD 0.09) for cleft lip, cleft palate, and velopharyngeal insufficiency repair techniques, respectively.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>This pilot study demonstrates that machine learning approaches can automate the classification of pediatric craniofacial operative notes across multiple levels of procedural detail. 
The implementation of such systems could significantly reduce the administrative burden related to surgical research, operations, and quality improvement.</p></sec></abstract><kwd-group><kwd>machine learning</kwd><kwd>natural language processing</kwd><kwd>cleft lip</kwd><kwd>cleft palate</kwd><kwd>craniofacial abnormalities</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Modern health care generates an overwhelming amount of data, ranging from structured information, such as demographic data and laboratory values, to unstructured content, such as clinical notes and radiographic images. While structured data may be readily analyzed using traditional statistical analysis and machine learning approaches for tasks such as disease risk stratification and outcome prediction [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref3">3</xref>], the wealth of clinical information contained in unstructured text data has remained largely untapped due to analytical challenges [<xref ref-type="bibr" rid="ref4">4</xref>]. In the past, the analysis of unstructured data required manual chart review to extract desired elements and structure them in a spreadsheet or database for later use&#x2014;a process that is labor-intensive, slow, costly, and subject to human error.</p><p>One example of where manual review of text data has been critically important is the review of operative notes (or &#x201C;dictations&#x201D;) that are recorded by a surgeon after a procedure has been completed. These notes contain essential information on the indication for the procedure (why it was performed), context or historical details about the patient leading to the decision, a description of the technical steps involved in the procedure, a summary of operative findings, and many other details. 
While standard coding systems, such as Current Procedural Terminology (CPT), can indicate with some generality the type of procedure performed, these codes are notoriously imprecise at representing the critical details about procedures, due to limitations in the codes themselves and variability in coding practices by surgeons and institutions [<xref ref-type="bibr" rid="ref5">5</xref>]. As a brief example, consider that primary cleft palate repair may be represented by CPT codes 42200, 42205, 42210, and 42235. Revision palatoplasty is ideally coded as 42215, but it may also be coded using the same 42200 and 42205 as a primary palatal repair. Oronasal fistula repair is often coded using 30600, except that it may also be coded using revision palatoplasty codes 42215 or 42235. An alveolar bone graft procedure has no specific code of its own, so it is often coded using the palatoplasty code 42210. There is nothing in the code itself that provides information about which procedure was performed, why it was performed, how it was performed, or what happened during the time it was being performed. It is not the CPT coding system that is at fault, as it was designed for billing purposes and functions adequately in that regard. The problem lies in the fact that researchers have relied upon the CPT code (a structured data field) because it is more easily accessible and analyzable than manually extracting the necessary information from operative notes. Unstructured data have historically been inaccessible, impenetrable, or impractical.</p><p>Fortunately, recent advances in natural language processing (NLP) and machine learning have demonstrated promising results in the classification of procedures across various specialties [<xref ref-type="bibr" rid="ref6">6</xref>-<xref ref-type="bibr" rid="ref10">10</xref>]. 
These technologies offer potential solutions using machine learning for automatic procedural classification in fields such as orthopedics, general surgery, neurosurgery, ophthalmology, and anesthesiology, enabling more accurate and efficient clinical data retrieval and analysis. However, the application of these techniques to more complex procedures, such as cleft and pediatric craniofacial procedures, remains largely unexplored [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref12">12</xref>]. A child with a cleft lip or palate or another craniofacial anomaly undergoes operative and nonoperative care, typically coordinated by a multidisciplinary team, in stages spanning 2 decades of the child&#x2019;s life, from birth through young adulthood. For children with some types of clefts, it is typical to undergo 4 to 6 cleft-related procedures in their lifetimes, and sadly, it is not uncommon to encounter patients who have undergone approximately 20 procedures. To decrease the number of anesthetic events for patients, many procedures are combined under the same anesthetic. The surgical techniques used vary considerably based on cleft phenotype, context, and surgeon preference. Such complexity in timing, sequence or staging, type of procedure, technique, coordination of procedures, and so forth makes cleft care the perfect testing ground for the refinement of NLP techniques for the extraction of critical details from the operative note.</p><p>In this pilot study, we developed and evaluated a machine learning framework for the automated classification of operative notes across multiple procedure types and subtypes related to cleft care. To address the &#x201C;black box&#x201D; nature of machine learning models, we used feature importance analysis to identify the specific textual elements that drive classification decisions. 
This work has the potential to enhance clinical documentation efficiency, facilitate large-scale quality improvement initiatives, and support comprehensive outcomes research in surgery.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Data Source</title><p>For this study, we extracted operative notes from all patients who underwent cleft and craniofacial procedures at a single institution from 2016 to 2024, comprising 656 operative notes from 312 patients. Operative notes were dictated by the primary surgeon immediately following the completion of the procedure. These notes were stored contemporaneously within the electronic medical record and were periodically aggregated into a condition-specific database as part of the guidelines recommended by the &#x201C;Standard Set of Outcome Metrics for the Comprehensive Appraisal of Cleft Care&#x201D; of the ICHOM (International Consortium for Health Outcomes Measurement) [<xref ref-type="bibr" rid="ref13">13</xref>] and the operational protocol of the Allied Cleft &#x0026; Craniofacial Quality-Improvement and Research Network (ACCQUIREnet) [<xref ref-type="bibr" rid="ref14">14</xref>], of which our institution is a member. Within the condition-specific databases, procedures are classified using the ACCQUIREnet Common Data Model. At our institution, this labeling has been performed by an experienced annotator (CJ), a speech pathologist (CCC-SLP), and the Team Coordinator of an American Cleft Palate Craniofacial Association&#x2013;approved Cleft &#x0026; Craniofacial Team since 2018. Any uncertainty in procedural classification was resolved by a senior surgeon (ACA) with over 20 years of clinical experience and the director of ACCQUIREnet. Procedural classification was performed for case type (eg, cleft lip repair, cleft palate repair, alveolar bone grafting), subtype or stage (primary repair or revision), and technique. 
Where operative notes contained the documentation of multiple procedures done in coordination (such as cleft lip repair being done with tip rhinoplasty, or cleft palate revision being done with sphincteroplasty for speech), each subprocedure was classified separately.</p></sec><sec id="s2-2"><title>Generation of Synthetic Notes to Augment Dataset</title><p>Some procedures were severely under-represented in our dataset. For these procedures, the dataset was augmented using synthetic notes generated by a large language model (LLM). To generate these synthetic operative notes, we used GPT-4o (OpenAI) [<xref ref-type="bibr" rid="ref15">15</xref>] with a multishot prompting strategy guided by manually anonymized, real operative notes. Example notes were drawn from a separate Duke cleft and craniofacial dataset collected outside the study period to guide the model in reproducing procedure-specific structure and terminology without introducing data leakage. To guide overall structure, 3 representative examples from an internal dataset not used in model development were provided for each procedure while encouraging variation in operative technique and language. Using this approach, we generated synthetic operative notes for cleft lip major revision, cleft lip minor revision, alveolar bone grafting revision, and velopharyngeal insufficiency repair by cleft palate revision with counts matched to the corresponding real-note datasets.</p><p>A senior cleft or craniofacial surgeon (ACA) reviewed the synthetic notes to appraise content validity. All notes were rated as being satisfactory in quality and content, similar to what would be found in clinical practice. Criticisms included unnecessary verbosity, slight disorganization in the operative sequence, limited detail in some parts of the operation, and lack of variation with regard to technical complexity or intraoperative events (some cases are harder than others). 
Notwithstanding, the limitations were actually viewed as advantageous to this project, as they might better simulate the varying quality of notes &#x201C;in the wild&#x201D;&#x2014;for example, what a junior resident or hurried attending surgeon might have dictated.</p><p>Synthetic notes were used solely for model development and not for evaluation.</p></sec><sec id="s2-3"><title>Preprocessing</title><p>Operative notes often contain content such as patient history, procedure indications, or other metadata. To isolate clinically relevant content, a custom Python script (version 3.11.9) extracted specific sections of interest (eg, &#x201C;indications,&#x201D; &#x201C;operative findings,&#x201D; and &#x201C;operative details&#x201D;) from the body of each note. This script identified and extracted content preceded by section headers, detected all candidate headers, and then split the note into segments at each detected header. Because the base template is auto-generated by the electronic health record, headings are consistently present; however, certain sections can be overwritten by the surgeon (dictated or typed as free text). The user specified desired content (eg, &#x201C;operative details&#x201D;) as input, and the script used a combination of exact and fuzzy string matching to identify synonymous headers (eg, &#x201C;operative technique&#x201D; or &#x201C;procedure description&#x201D;). To improve header recognition consistency across notes, the script optionally supported training for header deduplication using the dedupe library (version 3.0.3), which learns common aliases from the dataset and creates a mapping dictionary [<xref ref-type="bibr" rid="ref16">16</xref>]. For this process, a separate set of operative notes was used&#x2014;that is, notes that were not included in the dataset for subsequent vectorization and analysis. 
Using this mapping, operative notes were split into sections, which were saved for downstream analysis.</p><p>The procedure notes were preprocessed to prepare them for NLP analysis. The preprocessing step is crucial for standardizing the text data and reducing noise in subsequent analysis. The medium-sized English-language model from the spaCy NLP library was used for text processing (version 3.8.4). Each operative note underwent tokenization, lemmatization, and filtering to remove punctuation, stop words (such as &#x201C;the,&#x201D; &#x201C;a,&#x201D; &#x201C;and,&#x201D; &#x201C;in,&#x201D; &#x201C;of,&#x201D; and &#x201C;is&#x201D;), and nonalphabetic characters. Clinically relevant negation terms (such as &#x201C;no,&#x201D; &#x201C;not,&#x201D; and &#x201C;without&#x201D;) were preserved.</p></sec><sec id="s2-4"><title>Classifier Model Development</title><p>The classification framework used a One-vs-Rest (OvR) strategy with random forest as the base classifier. This approach was selected for several compelling reasons. First, OvR decomposes the complex multilabel problem into multiple binary classification problems (one for each procedure type), significantly reducing computational complexity while maintaining high performance. Second, this methodology preserves the interpretability of the model&#x2019;s predictions. Third, OvR classifiers handle class imbalance effectively when paired with appropriate weighting, which was particularly important given the uneven distribution of procedure types in the dataset. Class weighting of the random forest was used to address dataset imbalance, which penalizes the misclassification of minority classes more heavily during the training process. 
Given the small dataset size of 630 notes, deep learning methods were not likely to be optimal for the task, based on results from prior studies examining sample size requirements for medical NLP classification tasks with deep learning [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref18">18</xref>].</p><p>The overall architectural design implements a hierarchical classification framework with 3 levels: primary procedure-type classification, secondary subtype classification for applicable procedures, and tertiary technique classification where sufficient data exist. This hierarchical propagation yields a comprehensive procedural characterization with progressive granularity. The classification architecture is illustrated in <xref ref-type="fig" rid="figure1">Figure 1</xref>.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Classification schematic. A primary classifier categorizes an operative note by primary procedure (cleft lip, alveolar bone grafting [ABG], cleft palate, velopharyngeal insufficiency [VPI], or rhinoplasty). A secondary classifier further categorizes cleft lip and ABG procedures into primary and revision procedures. A tertiary classifier categorizes cleft lip, cleft palate, and VPI procedures by technique.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e87133_fig01.png"/></fig><p>For the primary classifier, we used patient-level grouped 5-fold cross-validation to ensure all notes from each patient remained in either the training or test set, preventing potential information leakage through patient-specific documentation patterns. For secondary and tertiary classifiers, insufficient sample sizes (particularly for minority classes) precluded patient-level grouping while maintaining adequate class representation in each fold. 
Note-level grouping with 3-fold cross-validation was used for secondary and tertiary classifiers to ensure sufficient representation of minority classes within each fold.</p><p>Term frequency&#x2013;inverse document frequency (TF-IDF) vectorization converted the preprocessed text into a format suitable for machine learning analysis. This technique weights terms based on their frequency within individual documents relative to their occurrence across the entire corpus, effectively capturing the relative importance of specific terms within each operative note. TF-IDF was used for text vectorization due to its interpretability, allowing individual words to be linked to model predictions. Explainability leads to greater physician trust in medical artificial intelligence tools, and understanding how models are making decisions is important prior to incorporating them into clinical and research workflows [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref20">20</xref>]. We also experimented with word embeddings, including Word2Vec and bidirectional encoder representations from transformers, but these approaches yielded similar predictive performance on our tasks. Thus, TF-IDF offered a practical balance of performance and interpretability. To better capture multiterm clinical concepts (such as &#x201C;water&#x201D; and &#x201C;tight&#x201D; signifying &#x201C;water-tight closure&#x201D; of the mucosa in cleft palate repair), as well as negation, we included unigrams and bigrams in our analysis. We also selected only the top 500 features for inclusion in our model. To prevent data leakage, the TF-IDF vectorizer and feature selection were fit exclusively on the training fold during each cross-validation iteration and subsequently applied to the held-out test fold. 
The resulting TF-IDF matrix served as the feature set for the classification model.</p><p>The decision threshold was chosen to optimize <italic>F</italic><sub>1</sub>-scores, as the <italic>F</italic><sub>1</sub>-score is a combination of precision and recall, both of which are helpful for different use cases, such as precision for screening cohorts and recall for registry automation. For each fold, the <italic>F</italic><sub>1</sub>-score at multiple thresholds was calculated, and the optimal threshold was chosen. The average threshold across folds was selected for the final model.</p></sec><sec id="s2-5"><title>Primary Classification: Procedure Type</title><p>For primary classification, a multilabel classification model was developed to automatically categorize operative notes into 5 distinct procedure types common in craniofacial surgery&#x2014;cleft lip repair, alveolar bone grafting (ABG), cleft palate repair, velopharyngeal insufficiency (VPI) correction, and rhinoplasty&#x2014;plus an &#x201C;other&#x201D; category. &#x201C;Other&#x201D; procedures include noncleft and craniofacial procedures, such as myringotomy and tympanostomy tube placement, as well as cleft and craniofacial procedures with only a small number of procedures recorded, that is, oronasal fistula repair (14 procedures) and orthognathic repositioning (6 procedures).</p><p>The model used multilabel classification, as a single operative note frequently described multiple procedure types performed during the same surgery, and it is important to identify combination cases. For example, it is common for a cleft lip repair to be performed alongside a limited-tip rhinoplasty. Unlike traditional multiclass classification, where each instance belongs to exactly one class, multilabel classification allows each operative note to be assigned to any number of relevant categories simultaneously, from zero to all 5 procedure types. 
This approach reflects the clinical reality of complex craniofacial surgeries, where multiple procedures are often performed during a single operation.</p></sec><sec id="s2-6"><title>Secondary Classification: Procedure Subtype or Stage</title><p>For secondary classification, OvR classifiers that were able to distinguish between subclassifications of procedures were developed. These classifiers were developed in isolation, given ground-truth primary classification. For example, given cleft lip repairs, the classifier could determine whether the procedure was a primary repair or revision. Classifiers like these were developed for cleft and craniofacial procedures in the dataset that had sufficient examples for testing (&#x003E;1 in the test set). Procedures with enough samples for subclassification include cleft lip and ABG. Procedures such as cleft lip major and minor revisions, as well as revision ABG, had low counts of 4, 5, and 9, respectively. For these procedures, the dataset was augmented using synthetic notes generated by an LLM.</p></sec><sec id="s2-7"><title>Tertiary Classification: Technique</title><p>At the most granular level of our hierarchical framework, a classifier was developed that could distinguish between specific surgical techniques used within procedural subclassifications. This classifier was developed in isolation, given ground-truth primary and secondary classifications. In the context of cleft lip repairs, for instance, the tertiary classifier discriminates between the Fisher, modified rotation-advancement, Mulliken, and other techniques. For cleft palate repairs, the classifier distinguishes between von Langenbeck, Bardach, Veau-Wardill-Kilner, Furlow, Sommerlad, and other palatoplasty techniques. For VPI repairs, the classifier distinguishes between VPI repair by cleft palate revision or sphincteroplasty or pharyngoplasty. 
Synthetic note generation was used for notes describing VPI repair by cleft palate revision, for which there were only 6 examples.</p></sec><sec id="s2-8"><title>Model Evaluation</title><p>The performance of the classification model was assessed using multiple metrics. To evaluate the model&#x2019;s discrimination ability, the area under the receiver operating characteristic curve (AUC) and the area under the precision-recall curve (AUPRC) were calculated for each procedure type individually. We also report a macro-average across all classes, which provides insight into the model&#x2019;s performance regardless of class prevalence. The balance between precision and recall was assessed by the <italic>F</italic><sub>1</sub>-score, reported as both micro and macro averages to account for potential class imbalances in the dataset. Additionally, the fraction of incorrectly predicted labels was assessed using Hamming Loss, which is particularly informative in multilabel classification scenarios where partial correctness is meaningful.</p><p>All data processing and modeling were implemented in Python 3.11, utilizing scikit-learn (version 1.5.1) for machine learning operations, pandas for data manipulation (version 2.2.3), matplotlib for visualization (version 3.9.2), and spaCy for natural language processing (version 3.8.4). We maintained a fixed random seed to ensure the reproducibility of results.</p></sec><sec id="s2-9"><title>Ethical Considerations</title><p>This work was performed under institutional review board approval (Pro00104806) for participation in the ACCQUIREnet, a multisite collaborative network. Participating sites in the ACCQUIREnet collect standardized outcomes data and variables that were defined according to the ICHOM&#x2019;s Standard Set of Outcome Metrics for the Comprehensive Appraisal of Cleft Care. The ACCQUIREnet is registered with ClinicalTrials.gov as an observational cohort study (#NCT02702869). 
All adult participants and parents of minor participants provided consent for participation in the data-collection processes of the network. In this work, operative notes and case classifications were reviewed from our single institution (not the entire network). Data processing and analysis were guided by all institutional policies and regulations regarding the protection of personal information, privacy, and human rights.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Dataset Characteristics</title><p>The condition-specific database contained 656 operative notes for analysis. Twenty-six notes could not be parsed to extract a procedure description, resulting in a final cohort of 630 notes from 311 patients. The exclusion of the 26 notes resulted in the complete removal of 1 patient from the cohort. Of the 26 notes that could not be parsed, 3 contained unrecognized headers, 8 had no identifiable header, and 15 had improperly formatted headers (eg, lowercase or indistinguishable from body text). These notes represented cleft and craniofacial procedures between 2016 and 2024. Patients had a mean age of 3.75 (SD 4.1, range 0&#x2010;19) years. In 184 instances, multiple procedures were performed during the same anesthetic event. In total, 958 procedures were documented across the 630 notes.</p><p>Procedures included the following: 100 cleft lip repair (primary, secondary major, and minor revision), 105 alveolar bone grafting (primary and secondary revision), 104 cleft palate repair, 32 velopharyngeal insufficiency correction (via cleft palate revision, sphincteroplasty, or pharyngoplasty), and 36 rhinoplasty (limited-tip). 
The 581 &#x201C;Other&#x201D; procedures included 191 (32.7%) auditory procedures (audiometry evoked-potential brain response testing, myringotomy and tympanostomy tube placement, and patch myringoplasty), 92 (15.8%) suture removal, 33 (5.8%) dental rehabilitation, 14 (2.4%) oronasal fistula repair, 6 (1%) cranial procedures (suturectomy, cranial vault expansion, and fronto-orbital advancement), 6 (1%) orthognathic repositioning, 5 (0.9%) gastrostomy, and a heterogeneous group of 234 (40.3%) less common interventions spanning multiple specialties, including ophthalmologic (eg, strabismus surgery, cataract extraction), neurosurgical (eg, ventriculoperitoneal shunt placement and revision, Chiari decompression), orthopedic (eg, fracture fixation, spinal fusion), general surgical (eg, exploratory laparotomy, bowel resection), urologic (eg, cystourethroscopy), and cardiothoracic procedures (eg, congenital heart defect repair).</p><p>Procedure descriptions averaged 244 words (median 223 words), ranging from 5 to 809 words. The corpus contained 5175 unique tokens, with &#x201C;patient,&#x201D; &#x201C;place,&#x201D; and &#x201C;suture&#x201D; being the most frequent. The type-token ratio of 3.36% indicated substantial lexical repetition and limited vocabulary diversity across operative notes.</p></sec><sec id="s3-2"><title>Primary Classification Performance</title><p>The model demonstrated strong performance in distinguishing procedure types, achieving an AUC of 0.93 (SD 0.04), an AUPRC of 0.84 (SD 0.05), a micro-averaged <italic>F</italic><sub>1</sub>-score of 0.88 (SD 0.02), a macro-averaged <italic>F</italic><sub>1</sub>-score of 0.84 (SD 0.03), and a Hamming loss of 0.04 (SD 0.01). The AUC and AUPRC curves are shown in <xref ref-type="fig" rid="figure2">Figure 2</xref>.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>(A) Receiver operating characteristic curve of the primary classifier. 
The curve demonstrates the primary classifier&#x2019;s performance for each procedure type as compared to a no-skill classifier. (B) Precision-recall curve of the primary classifier. ABG: alveolar bone grafting; AUC: area under the receiver operating characteristic curve; AUPRC: area under the precision-recall curve; VPI: velopharyngeal insufficiency.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e87133_fig02.png"/></fig></sec><sec id="s3-3"><title>Secondary Subclassification Performance</title><p>In distinguishing subclassifications, such as primary cleft lip repairs versus revisions, the model demonstrated variable performance. Performance was excellent for cleft lip revision, with an AUC score of 1.0 (SD 0.00), an AUPRC of 1.0 (SD 0.0), a micro-averaged <italic>F</italic><sub>1</sub>-score of 0.96 (SD 0.02), a macro-averaged <italic>F</italic><sub>1</sub>-score of 0.90 (SD 0.03), and a Hamming loss of 0.06 (SD 0.0). In contrast, performance for ABG was poor, with an AUC of 0.49 (SD 0.02), an AUPRC of 0.21 (SD 0.14), a micro-averaged <italic>F</italic><sub>1</sub>-score of 0.61 (SD 0.02), a macro-averaged <italic>F</italic><sub>1</sub>-score of 0.41 (SD 0.05), and a Hamming loss of 0.39 (SD 0.02), indicating that the model failed to discriminate between primary and revision ABG procedures. Receiver-operating characteristic and precision-recall curves for secondary classifiers are shown in <xref ref-type="fig" rid="figure3">Figure 3</xref>. The slight discrepancy between AUC and <italic>F</italic><sub>1</sub>-score for the cleft lip subclassification model reflects the use of a threshold averaged across folds, which may not be optimal for each individual fold. Cleft palate and rhinoplasty procedures had only 1 category represented in the data (primary cleft palate repair and limited-tip rhinoplasty), and therefore a classifier was not developed due to the lack of real examples for testing. 
VPI revision subclassification was not possible, as only the VPI technique was recorded in our dataset.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>(A) Receiver operating characteristic and precision-recall curves for the cleft lip subclassification model, distinguishing primary versus revision procedures. (B) Receiver operating characteristic and precision-recall curves for the alveolar bone grafting (ABG) subclassification model, distinguishing primary versus revision procedures. AUC: area under the receiver operating characteristic curve; AUPRC: area under the precision-recall curve.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e87133_fig03.png"/></fig></sec><sec id="s3-4"><title>Tertiary Technique Classification Performance</title><p>The model for identifying procedure techniques also exhibited good performance, with AUCs ranging from 0.87 to 0.90. The full results are shown in <xref ref-type="table" rid="table1">Table 1</xref>. Briefly, the cleft lip technique classifier distinguishes between Fisher, modified rotation-advancement, Mulliken, and other techniques. The cleft palate technique classifier distinguishes between von Langenbeck, Bardach, Furlow, Veau-Wardill-Kilner, Sommerlad, and other techniques. The VPI technique classifier distinguishes between VPI repair by cleft palate revision and sphincteroplasty or pharyngoplasty. In our dataset, only 4 ABG procedures had associated techniques recorded, which was insufficient for developing a reliable classifier. 
Receiver-operating characteristic and precision-recall curves for tertiary classifiers are shown in <xref ref-type="fig" rid="figure4">Figure 4</xref>.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Performance metrics for tertiary classifiers identifying specific surgical techniques<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup>.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Technique</td><td align="left" valign="bottom">AUC<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup></td><td align="left" valign="bottom">AUPRC<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup></td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score (micro)</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score (macro)</td><td align="left" valign="bottom">Hamming loss</td></tr></thead><tbody><tr><td align="left" valign="top">Cleft lip, mean (SD)</td><td align="left" valign="top">0.88 (0.03)</td><td align="left" valign="top">0.72 (0.08)</td><td align="left" valign="top">0.77 (0.04)</td><td align="left" valign="top">0.77 (0.06)</td><td align="left" valign="top">0.22 (0.04)</td></tr><tr><td align="left" valign="top">Cleft palate, mean (SD)</td><td align="left" valign="top">0.89 (0.03)</td><td align="left" valign="top">0.67 (0.05)</td><td align="left" valign="top">0.65 (0.05)</td><td align="left" valign="top">0.65 (0.07)</td><td align="left" valign="top">0.36 (0.05)</td></tr><tr><td align="left" valign="top">VPI<sup><xref ref-type="table-fn" rid="table1fn4">d</xref></sup>, mean (SD)</td><td align="left" valign="top">0.89 (0.09)</td><td align="left" valign="top">0.97 (0.03)</td><td align="left" valign="top">0.81 (0.01)</td><td align="left" valign="top">0.45 (0.01)</td><td align="left" valign="top">0.19 (0.01)</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>Metrics for categories with very small sample sizes (eg, 
velopharyngeal insufficiency repair by cleft palate revision, n=6) should be interpreted with caution, as limited representation in cross-validation may result in unstable estimates.</p></fn><fn id="table1fn2"><p><sup>b</sup>AUC: area under the receiver operating characteristic curve.</p></fn><fn id="table1fn3"><p><sup>c</sup>AUPRC: area under the precision-recall curve.</p></fn><fn id="table1fn4"><p><sup>d</sup>VPI: velopharyngeal insufficiency.</p></fn></table-wrap-foot></table-wrap><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>(A) Receiver operating characteristic and precision-recall curves for the cleft lip technique classifier. (B) Receiver operating characteristic and precision-recall curves for the cleft palate technique classifier. (C) Receiver operating characteristic and precision-recall curves for the velopharyngeal insufficiency technique classifier. AUC: area under the receiver operating characteristic curve; AUPRC: area under the precision-recall curve; VPI: velopharyngeal insufficiency.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e87133_fig04.png"/></fig></sec><sec id="s3-5"><title>Feature Importance</title><p>Feature importance analysis identified key terms contributing to each procedure classification for the primary classifier. The top-ranked words reflected distinct operative terminology across categories. The relative importance values of the top 10 words for each procedure type are shown in <xref ref-type="fig" rid="figure5">Figure 5</xref>.</p><fig position="float" id="figure5"><label>Figure 5.</label><caption><p>Primary classifier explanations. Top 10 most important term frequency-inverse document frequency (TF-IDF) features for each classification label, based on model coefficients. 
Bar color reflects normalized importance within each class.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="medinform_v14i1e87133_fig05.png"/></fig></sec><sec id="s3-6"><title>Overcoming Limitations of CPT Coding</title><p>To further emphasize the need for cohort definition tools beyond CPT codes, we examined the CPT codes assigned to manually labeled VPI and ABG procedures, which are frequently miscoded due to the limited granularity of CPT coding. <xref ref-type="table" rid="table2">Tables 2</xref> and <xref ref-type="table" rid="table3">3</xref> illustrate the heterogeneity of CPT coding for VPI and ABG procedures, respectively, demonstrating the multiple ways identical or highly similar procedures are coded. In total, 32 VPI procedures and 105 ABG procedures were analyzed. Of note, these are two substantially different procedures, and yet the CPT codes overlap due to the generality of the codes, arbitrariness in selecting a code, and possibly even miscoding.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Current Procedural Terminology (CPT) code variability across velopharyngeal insufficiency repair procedures (N=32).</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">CPT code</td><td align="left" valign="bottom">Description (if applicable)</td><td align="left" valign="bottom">Procedures, n (%)</td></tr></thead><tbody><tr><td align="left" valign="top">42226</td><td align="left" valign="top">Lengthening of palate and pharyngeal flap</td><td align="left" valign="top">13 (40.6)</td></tr><tr><td align="left" valign="top">42950</td><td align="left" valign="top">Pharyngoplasty (plastic or reconstructive operation on pharynx)</td><td align="left" valign="top">7 (21.9)</td></tr><tr><td align="left" valign="top">42220</td><td align="left" valign="top">Palatoplasty for cleft palate; secondary lengthening procedure</td><td 
align="left" valign="top">4 (12.5)</td></tr><tr><td align="left" valign="top">42210</td><td align="left" valign="top">Palatoplasty for cleft palate, with closure of alveolar ridge; with bone graft to alveolar ridge (includes obtaining graft)</td><td align="left" valign="top">3 (9.4)</td></tr><tr><td align="left" valign="top">42215</td><td align="left" valign="top">Palatoplasty for cleft palate; attachment pharyngeal flap</td><td align="left" valign="top">2 (6.3)</td></tr><tr><td align="left" valign="top">42225</td><td align="left" valign="top">Palatoplasty for cleft palate; major revision</td><td align="left" valign="top">2 (6.3)</td></tr><tr><td align="left" valign="top">42200</td><td align="left" valign="top">Palatoplasty for cleft palate, soft and/or hard palate only</td><td align="left" valign="top">1 (3.1)</td></tr></tbody></table></table-wrap><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Current Procedural Terminology (CPT) code variability across alveolar bone grafting procedures (N=105).</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">CPT code</td><td align="left" valign="bottom">Description (if applicable)</td><td align="left" valign="bottom">Procedures, n (%)</td></tr></thead><tbody><tr><td align="left" valign="top">42210</td><td align="left" valign="top">Palatoplasty for cleft palate, with closure of alveolar ridge; with bone graft to alveolar ridge (includes obtaining graft)</td><td align="left" valign="top">68 (64.8)</td></tr><tr><td align="left" valign="top">40700</td><td align="left" valign="top">Plastic repair of cleft lip/nasal deformity; primary, partial or complete, unilateral</td><td align="left" valign="top">23 (21.9)</td></tr><tr><td align="left" valign="top">42200</td><td align="left" valign="top">Palatoplasty for cleft palate, soft and/or hard palate only</td><td align="left" valign="top">7 (6.7)</td></tr><tr><td align="left" valign="top">42205</td><td 
align="left" valign="top">Palatoplasty for cleft palate, with closure of alveolar ridge; soft tissue only</td><td align="left" valign="top">4 (3.8)</td></tr><tr><td align="left" valign="top">40701</td><td align="left" valign="top">Plastic repair of cleft lip/nasal deformity; primary bilateral, 1-stage procedure</td><td align="left" valign="top">2 (1.9)</td></tr><tr><td align="left" valign="top">21210</td><td align="left" valign="top">Graft, bone; nasal, maxillary, or malar areas (includes obtaining graft)</td><td align="left" valign="top">1 (1)</td></tr></tbody></table></table-wrap></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>This study demonstrates that machine learning approaches can effectively classify operative notes in pediatric craniofacial surgery with high accuracy across multiple levels of granularity. The primary classification model achieved strong performance (AUC 0.93, SD 0.04) in distinguishing between 5 distinct procedure types common in craniofacial surgery. Secondary classifiers for procedure subclassifications demonstrated variable performance, with AUC scores of 1.0 (SD 0.00) for cleft lip revision and 0.49 (SD 0.02) for ABG. At the most granular level of surgical technique identification, our model maintained good discriminative ability (AUC 0.88, SD 0.03 to AUC 0.89, SD 0.09).</p><p>The model for ABG secondary subclassification demonstrated poor performance (AUC 0.49, SD 0.02). This is likely due to substantial overlap in operative and clinical language between primary and revision procedures. In many revision cases, surgeons perform repeat alveolar bone grafting using techniques similar to those used in primary repairs, leading to documentation that does not reliably distinguish initial from secondary procedures. 
As a result, the textual features available to the model may be insufficient to support accurate subclassification.</p><p>The VPI technique classifier demonstrated a notable discrepancy between the micro-averaged <italic>F</italic><sub>1</sub>-score (0.81, SD 0.01) and the macro-averaged <italic>F</italic><sub>1</sub>-score (0.45, SD 0.01), indicating a performance imbalance across technique categories. This gap suggests that while the model achieves high overall accuracy, it performs poorly on the minority technique. In our dataset, VPI repairs were categorized as either sphincteroplasty/pharyngoplasty or repair by cleft palate revision, with the latter representing only 6 real examples that required synthetic augmentation. The high micro-<italic>F</italic><sub>1</sub>-score indicates strong performance on the more prevalent sphincteroplasty/pharyngoplasty category, which dominates the overall metric due to its larger sample size. In contrast, the low macro-<italic>F</italic><sub>1</sub>-score reveals that the model struggles to correctly identify VPI repairs performed via cleft palate revision, likely due to substantial overlap in operative terminology between these approaches.</p><p>Additionally, AUC estimates for these very small classes (eg, VPI repair by cleft palate revision, n=6) are unstable and should be interpreted as exploratory until larger labeled datasets are available. The perfect AUC observed for the cleft lip secondary classifier should also be interpreted with caution. Unlike the primary classifier, this model used note-level rather than patient-level cross-validation due to limited sample size, meaning notes from the same patient or surgeon may have appeared in both training and test sets. The AUC of 1.0 may overestimate real-world generalizability.</p><p>This classification system has several important clinical implications. 
First, this study provides proof-of-concept that automated classification could substantially reduce the administrative burden on clinical teams. In many studies, traditional coding systems, such as CPT and <italic>International Classification of Diseases</italic> (<italic>ICD</italic>) codes, are used for procedural and diagnostic classification. However, CPT coding has been shown to be inconsistent for craniofacial procedures, with codes that fail to capture complex techniques [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref22">22</xref>]. The pediatric craniofacial population presents unique documentation challenges due to the longitudinal nature of care, with patients often undergoing multiple staged procedures over many years. CPT codes frequently lack the granularity necessary to distinguish between subtle but clinically significant variations in surgical technique. For instance, CPT code 42200 encompasses multiple distinct approaches to cleft palate repair that have different implications for surgical planning and outcomes research. Additionally, procedures may be coded inaccurately&#x2014;for example, the correction of VPI or ABG procedures may use a cleft palate repair CPT code. In the case of VPI, revision palatoplasty is one approach for correcting the disorder. In the case of ABG, no specific CPT code exists for bone grafting of the alveolar cleft, so it is conventional to use a &#x201C;close-enough&#x201D; CPT code for palatoplasty with bone grafting. The consequence of this &#x201C;real-world&#x201D; coding practice has been that manual review was needed to accurately classify complex procedure types. Of course, manual review and classification of operative notes are time-consuming and subject to human error. 
The substantial variability in CPT coding observed for both VPI and ABG procedures underscores the limitations of billing-based classification systems and highlights the added value of NLP-derived tertiary classifications for capturing clinically meaningful procedural detail. A major goal of this project was to provide an accurate, reliable, efficient, and sustainable method for automating case classification from the unstructured text data of the operative note, allowing for more accurate and efficient cohort identification.</p><p>Second, the analysis of feature importance provides insight into the distinctive language patterns that characterize different craniofacial procedures. The terms most strongly associated with each procedure type align with the clinical understanding of these operations, suggesting that the model has learned clinically relevant patterns. This interpretability is crucial for clinical adoption, as it allows surgeons to understand and validate the model&#x2019;s decision-making process.</p></sec><sec id="s4-2"><title>Comparison to Prior Work</title><p>While prior work applying NLP to operative notes in cleft and craniofacial surgery is limited, related efforts in surgical subspecialties and clinical text classification provide useful points of comparison. A growing body of literature has demonstrated the potential of NLP for classifying surgical procedures based on operative notes, with many studies focusing on predicting CPT codes for billing purposes in fields such as general surgery [<xref ref-type="bibr" rid="ref23">23</xref>], breast surgery [<xref ref-type="bibr" rid="ref10">10</xref>], spine surgery [<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref25">25</xref>], and pathology [<xref ref-type="bibr" rid="ref26">26</xref>]. Recently, LLMs have been evaluated for CPT coding in craniofacial surgery given operative notes [<xref ref-type="bibr" rid="ref27">27</xref>]. 
However, the accuracies of all 5 models tested ranged between 20% and 40%. These low accuracies suggest that general-purpose LLMs may be poorly suited for granular CPT code classification in highly specialized surgical domains, particularly when models are not trained on domain-specific operative language. Similar findings have been reported in other studies using generalized LLMs, which demonstrate low accuracy in classifying surgical procedures [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref29">29</xref>] and CPT modifiers in craniofacial operative notes [<xref ref-type="bibr" rid="ref30">30</xref>]. Collectively, this work indicates that procedure classification may require models trained on specialized operative notes rather than relying on generalized LLMs alone.</p><p>Our work addresses this challenge within cleft and craniofacial surgery&#x2014;a specialty characterized by complex, multicomponent procedures and longitudinal care spanning years of development&#x2014;where the precise identification of surgical techniques is crucial for patient outcomes and clinical research. The <italic>hierarchical</italic> approach to classification, moving from procedure <italic>type</italic> (eg, cleft lip repair) to <italic>subclassification</italic> (eg, subtype or stage, such as primary vs revision) to <italic>specific technique</italic> (eg, Fisher anatomic subunit technique), represents a more nuanced and clinically relevant analysis than most previous studies, which typically focus on primary procedure identification or CPT code assignment only. Moreover, our approach does not treat procedural coding as the end goal. Rather, CPT codes serve primarily as a point of comparison. 
Instead, the proposed model enables more efficient and accurate cohort identification for outcomes reporting and surgical research by capturing both procedure type and detailed operative technique.</p><p>Another study highlights the development of a tool, ChartSweep, to automate chart review, which can be used for cohort identification [<xref ref-type="bibr" rid="ref31">31</xref>]. The study highlights the significant amount of time spent on retrospective chart review compared to an automated tool (8 minutes per patient with manual chart review vs 0.3 minutes per patient with the automated tool for the identification of a radiofrequency ablation cohort). These findings underscore the inefficiency of manual chart review in plastic surgery research. Similar to ChartSweep, our study aims to reduce this burden through automation; however, we specifically address the challenge of classifying cleft and craniofacial procedures, where overlapping and reused operative terminology complicates automated extraction, an area not examined in the prior work.</p><p>We additionally apply a synthetic data generation technique to address class imbalance in rare procedure types, demonstrating a solution to a common challenge in specialty surgical fields with relatively low procedure volumes. Several studies have demonstrated the ability of LLMs to generate satisfactory medical notes. Such synthetic notes have been shown to be sufficiently realistic for downstream analytic tasks, with multiple studies proposing their use as a viable strategy to mitigate data scarcity [<xref ref-type="bibr" rid="ref32">32</xref>-<xref ref-type="bibr" rid="ref40">40</xref>]. 
In this context, our work extends the existing literature by demonstrating the use of synthetic data specifically for hierarchical surgical procedure classification, supporting its role in improving model performance for rare but clinically meaningful procedures.</p></sec><sec id="s4-3"><title>Limitations</title><p>First, our study used data from a single institution, potentially restricting the generalizability of our models to other clinical settings with different documentation practices. Operative notes can vary significantly in structure, terminology, and level of detail across institutions and individual surgeons. To address this concern, future studies should validate these models using multi-institutional data to assess their robustness across different documentation styles. This is planned as future work for the ACCQUIREnet.</p><p>Second, surgical notes often rely on templates for documentation efficiency. It is a concern that the model is overfitting to templated data. In this project, the templates for cleft and craniofacial operative notes have changed significantly in the time span encompassed by the study, and operative notes from multiple surgeons are present. This variability does diminish (but does not eliminate) the risk of overfitting. Feature importance analysis identified several highly weighted predictors (eg, phrases such as &#x201C;wish proceed&#x201D; and &#x201C;take operative&#x201D;) that are not inherently clinical and may reflect aspects of documentation style or templated language rather than surgical technique. This suggests that the model may be partially leveraging institution- or surgeon-specific phrasing patterns. Although variability in note authorship and evolving templates may reduce this effect over time, it highlights a potential limitation related to generalizability, particularly when applying the model to external datasets with different documentation conventions. 
Training on notes from other institutions will likely be most effective in resolving this issue.</p><p>Third, the augmentation of the training data using LLM-generated synthetic notes for procedure subclassification is another potential source of overfitting. Three examples each of cleft lip major revision, cleft lip minor revision, and ABG revision were provided for synthetic data generation. Although the effort was made to select diverse techniques and note styles for these examples, these synthetic notes may, nonetheless, be most similar to the examples provided. In this pilot project, synthetic notes were necessary to explore the feasibility of subclassification for rare procedures. In the future, we will solicit &#x201C;real-world&#x201D; operative notes in these categories from across the ACCQUIREnet.</p><p>Fourth, our secondary and tertiary classifiers were developed in isolation, given &#x201C;ground-truth&#x201D; labeling of preceding classifications. In reality, the hierarchical classification system will need to identify primary, secondary, and tertiary classifications in sequence. Errors in primary classification could permeate into subsequent steps. Therefore, before putting such a system into production, it will be necessary to assess cumulative system accuracy.</p><p>Fifth, we excluded a small subset of operative notes that could not be parsed due to missing or nonstandard headers. While this may introduce a minor degree of survivorship bias toward more consistently formatted notes, the proportion excluded was small and unlikely to meaningfully impact overall findings. Future work could incorporate more flexible parsing approaches to further improve robustness to formatting variability.</p><p>Sixth, the content validity of the synthetic operative notes was appraised by a senior cleft and craniofacial surgeon (ACA) who is also the corresponding author and lead investigator of this study. 
As such, this review was internal rather than independent, and the possibility of confirmation bias cannot be excluded. In the future, an independent, blinded clinical review by surgeons unaffiliated with the study would provide stronger evidence of the synthetic notes&#x2019; similarity to real-world operative documentation.</p><p>Seventh, the ground-truth labels used to train and evaluate all classifiers were generated by a single experienced annotator (CJ), with uncertain cases resolved by a senior surgeon (ACA). Although this annotator has extensive experience in cleft and craniofacial procedural classification, no independent audit was performed on a subset of notes to calculate a formal interrater reliability score. Consequently, the degree of individual annotator bias cannot be quantified, and the possibility that a second annotator would classify some notes differently cannot be excluded. Future work should include independent annotation of a representative subset of notes by a second qualified annotator, with the calculation of a formal interrater reliability statistic (such as Cohen kappa), to better characterize label quality and support confidence in the ground-truth dataset.</p></sec><sec id="s4-4"><title>Future Directions</title><p>Future directions for this research include prospective validation in a clinical setting, expansion to multi-institutional datasets, and incorporation of additional variables from structured data sources. 
In addition, it is interesting to note that while it is problematic to go from CPT code to procedural classification, as we have shown in this paper, the opposite direction is easy&#x2014;to derive an appropriate CPT code from a classified case; therefore, an impactful extension of our work may be to improve CPT coding (and thus billing) using NLP-based classification of operative notes.</p></sec><sec id="s4-5"><title>Conclusions</title><p>This study demonstrates the feasibility of automatic classification of pediatric craniofacial operative notes across multiple levels of granularity, from primary procedure identification to specific surgical technique recognition. This study paves the way for further development and deployment of such systems, which could significantly reduce the administrative burden involved in surgical research, operations, and quality improvement.</p></sec></sec></body><back><ack><p>The code used to generate the models in this study is available on GitHub [<xref ref-type="bibr" rid="ref41">41</xref>]. Generative artificial intelligence tools were not used at any stage in the preparation of this manuscript.</p></ack><notes><sec><title>Funding</title><p>The authors declared no financial support was received for this work.</p></sec><sec><title>Data Availability</title><p>The data cannot be shared publicly because they contain potentially identifying or sensitive patient information. 
Data are available from the ACCQUIREnet Committee (contact via info@accquire.net) for researchers who meet the criteria for access to the confidential data.</p></sec></notes><fn-group><fn fn-type="con"><p>Conceptualization: ACA</p><p>Data curation: CJ</p><p>Formal analysis: MC, EL (supporting)</p><p>Investigation: MC</p><p>Methodology: MC, EL (supporting)</p><p>Project administration: ACA (lead), SKM (supporting), NYL (supporting)</p><p>Resources: ACA, CJ (supporting)</p><p>Software: MC</p><p>Supervision: ACA</p><p>Validation: MC</p><p>Visualization: MC (lead)</p><p>Writing &#x2013; original draft: MC (lead), ACA (supporting)</p><p>Writing &#x2013; review &#x0026; editing: ACA (lead), EL (supporting), NO (supporting), SKM (supporting), NYL (supporting)</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">ABG</term><def><p>alveolar bone grafting</p></def></def-item><def-item><term id="abb2">ACCQUIREnet</term><def><p>Allied Cleft &#x0026; Craniofacial Quality-Improvement and Research Network</p></def></def-item><def-item><term id="abb3">AUC</term><def><p>area under the receiver operating characteristic curve</p></def></def-item><def-item><term id="abb4">AUPRC</term><def><p>area under the precision-recall curve</p></def></def-item><def-item><term id="abb5">CPT</term><def><p>Current Procedural Terminology</p></def></def-item><def-item><term id="abb6"><italic>ICD</italic></term><def><p><italic>International Classification of Diseases</italic></p></def></def-item><def-item><term id="abb7">ICHOM</term><def><p>International Consortium for Health Outcomes Measurement</p></def></def-item><def-item><term id="abb8">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb9">NLP</term><def><p>natural language processing</p></def></def-item><def-item><term id="abb10">OvR</term><def><p>One-vs-Rest</p></def></def-item><def-item><term 
id="abb11">TF-IDF</term><def><p>term frequency-inverse document frequency</p></def></def-item><def-item><term id="abb12">VPI</term><def><p>velopharyngeal insufficiency</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>NT</given-names> </name><name name-style="western"><surname>Salinas</surname><given-names>J</given-names> </name></person-group><article-title>Machine learning for predicting outcomes in trauma</article-title><source>Shock</source><year>2017</year><month>11</month><volume>48</volume><issue>5</issue><fpage>504</fpage><lpage>510</lpage><pub-id pub-id-type="doi">10.1097/SHK.0000000000000898</pub-id><pub-id pub-id-type="medline">28498299</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yao</surname><given-names>PF</given-names> </name><name name-style="western"><surname>Diao</surname><given-names>YD</given-names> </name><name name-style="western"><surname>McMullen</surname><given-names>EP</given-names> </name><name name-style="western"><surname>Manka</surname><given-names>M</given-names> </name><name name-style="western"><surname>Murphy</surname><given-names>J</given-names> </name><name name-style="western"><surname>Lin</surname><given-names>C</given-names> </name></person-group><article-title>Predicting amputation using machine learning: a systematic review</article-title><source>PLOS ONE</source><year>2023</year><volume>18</volume><issue>11</issue><fpage>e0293684</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0293684</pub-id><pub-id pub-id-type="medline">37934767</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Wang</surname><given-names>W</given-names> </name><name name-style="western"><surname>Kiik</surname><given-names>M</given-names> </name><name name-style="western"><surname>Peek</surname><given-names>N</given-names> </name><etal/></person-group><article-title>A systematic review of machine learning models for predicting outcomes of stroke with structured data</article-title><source>PLOS ONE</source><year>2020</year><volume>15</volume><issue>6</issue><fpage>e0234722</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0234722</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tayefi</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ngo</surname><given-names>P</given-names> </name><name name-style="western"><surname>Chomutare</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Challenges and opportunities beyond structured data in analysis of electronic health records</article-title><source>WIREs Computational Stats</source><year>2021</year><month>11</month><volume>13</volume><issue>6</issue><fpage>e1549</fpage><pub-id pub-id-type="doi">10.1002/wics.1549</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ewings</surname><given-names>EL</given-names> </name><name name-style="western"><surname>Konofaos</surname><given-names>P</given-names> </name><name name-style="western"><surname>Wallace</surname><given-names>RD</given-names> </name></person-group><article-title>Variations in current procedural terminology coding for craniofacial surgery: a need for review and change</article-title><source>J Craniofac Surg</source><year>2017</year><month>07</month><volume>28</volume><issue>5</issue><fpage>1224</fpage><lpage>1228</lpage><pub-id 
pub-id-type="doi">10.1097/SCS.0000000000003667</pub-id><pub-id pub-id-type="medline">28665842</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>YM</given-names> </name><name name-style="western"><surname>Bacchi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Sia</surname><given-names>D</given-names> </name><name name-style="western"><surname>Casson</surname><given-names>RJ</given-names> </name><name name-style="western"><surname>Chan</surname><given-names>W</given-names> </name></person-group><article-title>Optimising vitrectomy operation note coding with machine learning</article-title><source>Clinical Exper Ophthalmol</source><year>2023</year><month>08</month><volume>51</volume><issue>6</issue><fpage>577</fpage><lpage>584</lpage><pub-id pub-id-type="doi">10.1111/ceo.14257</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Burns</surname><given-names>ML</given-names> </name><name name-style="western"><surname>Mathis</surname><given-names>MR</given-names> </name><name name-style="western"><surname>Vandervest</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Classification of current procedural terminology codes from electronic health record data using machine learning</article-title><source>Anesthesiology</source><year>2020</year><month>04</month><volume>132</volume><issue>4</issue><fpage>738</fpage><lpage>749</lpage><pub-id pub-id-type="doi">10.1097/ALN.0000000000003150</pub-id><pub-id pub-id-type="medline">32028374</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Roy</surname><given-names>JM</given-names> 
</name><name name-style="western"><surname>Self</surname><given-names>DM</given-names> </name><name name-style="western"><surname>Isch</surname><given-names>E</given-names> </name><etal/></person-group><article-title>Evaluating large language models for automated CPT code prediction in endovascular neurosurgery</article-title><source>J Med Syst</source><year>2025</year><month>01</month><day>24</day><volume>49</volume><issue>1</issue><fpage>15</fpage><pub-id pub-id-type="doi">10.1007/s10916-025-02149-4</pub-id><pub-id pub-id-type="medline">39853605</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shost</surname><given-names>MD</given-names> </name><name name-style="western"><surname>Meade</surname><given-names>SM</given-names> </name><name name-style="western"><surname>Steinmetz</surname><given-names>MP</given-names> </name><name name-style="western"><surname>Mroz</surname><given-names>TE</given-names> </name><name name-style="western"><surname>Habboub</surname><given-names>G</given-names> </name></person-group><article-title>Surgical classification using natural language processing of informed consent forms in spine surgery</article-title><source>Neurosurg Focus</source><year>2023</year><month>06</month><volume>54</volume><issue>6</issue><fpage>E10</fpage><pub-id pub-id-type="doi">10.3171/2023.3.FOCUS2371</pub-id><pub-id pub-id-type="medline">37283446</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>El Moheb</surname><given-names>M</given-names> </name><name name-style="western"><surname>Putman</surname><given-names>K</given-names> </name><name name-style="western"><surname>Sears</surname><given-names>O</given-names> </name><etal/></person-group><article-title>An open-architecture AI model for CPT coding in breast 
surgery: development, validation, and prospective testing</article-title><source>Ann Surg</source><year>2025</year><month>09</month><day>1</day><volume>282</volume><issue>3</issue><fpage>439</fpage><lpage>448</lpage><pub-id pub-id-type="doi">10.1097/SLA.0000000000006793</pub-id><pub-id pub-id-type="medline">40518998</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Harrison</surname><given-names>LM</given-names> </name><name name-style="western"><surname>Edison</surname><given-names>RL</given-names> </name><name name-style="western"><surname>Hallac</surname><given-names>RR</given-names> </name></person-group><article-title>Artificial intelligence applications in pediatric craniofacial surgery</article-title><source>Diagnostics (Basel)</source><year>2025</year><month>03</month><day>25</day><volume>15</volume><issue>7</issue><fpage>829</fpage><pub-id pub-id-type="doi">10.3390/diagnostics15070829</pub-id><pub-id pub-id-type="medline">40218180</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zambrano</surname><given-names>CDB</given-names> </name><name name-style="western"><surname>Jim&#x00E9;nez</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Rodr&#x00ED;guez</surname><given-names>AGM</given-names> </name><name name-style="western"><surname>Rinc&#x00F3;n</surname><given-names>EHH</given-names> </name></person-group><article-title>Revolutionizing cleft lip and palate management through artificial intelligence: a scoping review</article-title><source>Oral Maxillofac Surg</source><year>2025</year><month>04</month><day>10</day><volume>29</volume><issue>1</issue><fpage>79</fpage><pub-id pub-id-type="doi">10.1007/s10006-025-01371-1</pub-id><pub-id 
pub-id-type="medline">40208434</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Allori</surname><given-names>AC</given-names> </name><name name-style="western"><surname>Kelley</surname><given-names>T</given-names> </name><name name-style="western"><surname>Meara</surname><given-names>JG</given-names> </name><etal/></person-group><article-title>A standard set of outcome measures for the comprehensive appraisal of cleft care</article-title><source>Cleft Palate Craniofac J</source><year>2017</year><month>09</month><volume>54</volume><issue>5</issue><fpage>540</fpage><lpage>554</lpage><pub-id pub-id-type="doi">10.1597/15-292</pub-id><pub-id pub-id-type="medline">27223626</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="web"><article-title>ACCQUIREnet</article-title><source>Duke University School of Medicine (Duke Surgery)</source><access-date>2025-05-13</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://surgery.duke.edu/divisions/plastic-maxillofacial-and-oral-surgery/research/clinical-research/datalab-clinical-care-and-population-health/accquirenet">https://surgery.duke.edu/divisions/plastic-maxillofacial-and-oral-surgery/research/clinical-research/datalab-clinical-care-and-population-health/accquirenet</ext-link></comment></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Hurst</surname><given-names>A</given-names> </name><name name-style="western"><surname>Lerer</surname><given-names>A</given-names> </name><name name-style="western"><surname>Goucher</surname><given-names>AP</given-names> </name><etal/></person-group><article-title>GPT-4o system card</article-title><source>arXiv</source><comment>Preprint posted online on  Oct 25, 2024</comment><pub-id 
pub-id-type="doi">10.48550/arXiv.2410.21276</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="web"><article-title>Dedupe 3.0.2 documentation</article-title><source>Dedupe.io</source><access-date>2026-03-15</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://docs.dedupe.io/en/latest/">https://docs.dedupe.io/en/latest/</ext-link></comment></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Olthof</surname><given-names>AW</given-names> </name><name name-style="western"><surname>van Ooijen</surname><given-names>PMA</given-names> </name><name name-style="western"><surname>Cornelissen</surname><given-names>LJ</given-names> </name></person-group><article-title>Deep learning-based natural language processing in radiology: the impact of report complexity, disease prevalence, dataset size, and algorithm type on model performance</article-title><source>J Med Syst</source><year>2021</year><month>09</month><day>4</day><volume>45</volume><issue>10</issue><fpage>91</fpage><pub-id pub-id-type="doi">10.1007/s10916-021-01761-4</pub-id><pub-id pub-id-type="medline">34480231</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Chaturvedi</surname><given-names>J</given-names> </name><name name-style="western"><surname>Shamsutdinova</surname><given-names>D</given-names> </name><name name-style="western"><surname>Zimmer</surname><given-names>F</given-names> </name><etal/></person-group><article-title>Sample size in natural language processing within healthcare research</article-title><source>arXiv</source><comment>Preprint posted online on  Sep 5, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2309.02237</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>C</given-names> </name><name name-style="western"><surname>Zheng</surname><given-names>J</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>C</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>D</given-names> </name></person-group><article-title>Improving explainability and integrability of medical AI to promote health care professional acceptance and use: mixed systematic review</article-title><source>J Med Internet Res</source><year>2025</year><month>08</month><day>7</day><volume>27</volume><issue>1</issue><fpage>e73374</fpage><pub-id pub-id-type="doi">10.2196/73374</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tucci</surname><given-names>V</given-names> </name><name name-style="western"><surname>Saary</surname><given-names>J</given-names> </name><name name-style="western"><surname>Doyle</surname><given-names>TE</given-names> </name></person-group><article-title>Factors influencing trust in medical artificial intelligence for healthcare professionals: a narrative review</article-title><source>J Med Artif Intell</source><year>2022</year><month>03</month><day>30</day><volume>5</volume><fpage>4</fpage><pub-id pub-id-type="doi">10.21037/jmai-21-25</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Elver</surname><given-names>AA</given-names> </name><name name-style="western"><surname>Thames</surname><given-names>C</given-names> </name><name name-style="western"><surname>Perry</surname><given-names>NK</given-names> </name><name 
name-style="western"><surname>Humphries</surname><given-names>LS</given-names> </name><name name-style="western"><surname>Hoppe</surname><given-names>IC</given-names> </name></person-group><article-title>Navigating coding challenges in craniofacial surgery: a national survey analysis on CPT variability</article-title><source>Cleft Palate Craniofac J</source><year>2025</year><month>05</month><day>8</day><fpage>10556656251338640</fpage><pub-id pub-id-type="doi">10.1177/10556656251338640</pub-id><pub-id pub-id-type="medline">40336280</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jazayeri</surname><given-names>HE</given-names> </name><name name-style="western"><surname>Khavanin</surname><given-names>N</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>JW</given-names> </name><etal/></person-group><article-title>Variability in Current Procedural Terminology codes for craniomaxillofacial trauma reconstruction: a national survey</article-title><source>J Craniofac Surg</source><year>2020</year><month>06</month><volume>31</volume><issue>4</issue><fpage>996</fpage><lpage>999</lpage><pub-id pub-id-type="doi">10.1097/SCS.0000000000006362</pub-id><pub-id pub-id-type="medline">32168130</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Balch</surname><given-names>JA</given-names> </name><name name-style="western"><surname>Desaraju</surname><given-names>SS</given-names> </name><name name-style="western"><surname>Nolan</surname><given-names>VJ</given-names> </name><etal/></person-group><article-title>Language models for multilabel document classification of surgical concepts in exploratory laparotomy operative notes: algorithm development study</article-title><source>JMIR Med 
Inform</source><year>2025</year><month>07</month><day>9</day><volume>13</volume><issue>1</issue><fpage>e71176</fpage><pub-id pub-id-type="doi">10.2196/71176</pub-id><pub-id pub-id-type="medline">40632815</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zaidat</surname><given-names>B</given-names> </name><name name-style="western"><surname>Tang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Arvind</surname><given-names>V</given-names> </name><etal/></person-group><article-title>Can a novel natural language processing model and artificial intelligence automatically generate billing codes from spine surgical operative notes?</article-title><source>Global Spine J</source><year>2024</year><month>09</month><volume>14</volume><issue>7</issue><fpage>2022</fpage><lpage>2030</lpage><pub-id pub-id-type="doi">10.1177/21925682231164935</pub-id><pub-id pub-id-type="medline">36932733</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kim</surname><given-names>JS</given-names> </name><name name-style="western"><surname>Vivas</surname><given-names>A</given-names> </name><name name-style="western"><surname>Arvind</surname><given-names>V</given-names> </name><etal/></person-group><article-title>Can natural language processing and artificial intelligence automate the generation of billing codes from operative note dictations?</article-title><source>Global Spine J</source><year>2023</year><month>09</month><volume>13</volume><issue>7</issue><fpage>1946</fpage><lpage>1955</lpage><pub-id pub-id-type="doi">10.1177/21925682211062831</pub-id><pub-id pub-id-type="medline">35225694</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Levy</surname><given-names>J</given-names> </name><name name-style="western"><surname>Vattikonda</surname><given-names>N</given-names> </name><name name-style="western"><surname>Haudenschild</surname><given-names>C</given-names> </name><name name-style="western"><surname>Christensen</surname><given-names>B</given-names> </name><name name-style="western"><surname>Vaickus</surname><given-names>L</given-names> </name></person-group><article-title>Comparison of machine-learning algorithms for the prediction of current procedural terminology (CPT) codes from pathology reports</article-title><source>J Pathol Inform</source><year>2022</year><volume>13</volume><fpage>3</fpage><pub-id pub-id-type="doi">10.4103/jpi.jpi_52_21</pub-id><pub-id pub-id-type="medline">35127232</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Isch</surname><given-names>EL</given-names> </name><name name-style="western"><surname>Sarikonda</surname><given-names>A</given-names> </name><name name-style="western"><surname>Sambangi</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Evaluating the efficacy of large language models in CPT coding for craniofacial surgery: a comparative analysis</article-title><source>J Craniofac Surg</source><year>2025</year><month>05</month><day>1</day><volume>36</volume><issue>3</issue><fpage>831</fpage><lpage>835</lpage><pub-id pub-id-type="doi">10.1097/SCS.0000000000010575</pub-id><pub-id pub-id-type="medline">39221924</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Soroush</surname><given-names>A</given-names> </name><name name-style="western"><surname>Glicksberg</surname><given-names>BS</given-names> </name><name 
name-style="western"><surname>Zimlichman</surname><given-names>E</given-names> </name><etal/></person-group><article-title>Large language models are poor medical coders&#x2014;benchmarking of medical code querying</article-title><source>NEJM AI</source><year>2024</year><month>04</month><day>25</day><volume>1</volume><issue>5</issue><fpage>AIdbp2300040</fpage><pub-id pub-id-type="doi">10.1056/AIdbp2300040</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Madris</surname><given-names>B</given-names> </name><name name-style="western"><surname>Ranjbar</surname><given-names>K</given-names> </name><name name-style="western"><surname>Critsinelis</surname><given-names>A</given-names> </name><name name-style="western"><surname>Myla</surname><given-names>K</given-names> </name><name name-style="western"><surname>Kumar</surname><given-names>S</given-names> </name><name name-style="western"><surname>Salehi</surname><given-names>P</given-names> </name></person-group><article-title>Evaluating artificial intelligence-assisted current procedural terminology coding in vascular surgery: a comparison of ChatGPT Plus and Perplexity Pro against finance department</article-title><source>Ann Vasc Surg</source><year>2026</year><month>03</month><volume>124</volume><fpage>28</fpage><lpage>36</lpage><pub-id pub-id-type="doi">10.1016/j.avsg.2025.10.045</pub-id><pub-id pub-id-type="medline">41237975</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Isch</surname><given-names>EL</given-names> </name><name name-style="western"><surname>Guler</surname><given-names>M</given-names> </name><name name-style="western"><surname>Galantini</surname><given-names>G</given-names> </name><etal/></person-group><article-title>Bridging the coding gap: assessing 
large language models for accurate modifier assignment in craniofacial operative notes</article-title><source>J Craniofac Surg</source><year>2025</year><month>04</month><day>11</day><volume>36</volume><fpage>2260</fpage><lpage>2263</lpage><pub-id pub-id-type="doi">10.1097/SCS.0000000000011390</pub-id><pub-id pub-id-type="medline">40214230</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chartier</surname><given-names>C</given-names> </name><name name-style="western"><surname>Gfrerer</surname><given-names>L</given-names> </name><name name-style="western"><surname>Austen</surname><given-names>WG</given-names> </name></person-group><article-title>ChartSweep: a HIPAA-compliant tool to automate chart review for plastic surgery research</article-title><source>Plast Reconstr Surg Glob Open</source><year>2021</year><month>06</month><volume>9</volume><issue>6</issue><fpage>e3633</fpage><pub-id pub-id-type="doi">10.1097/GOX.0000000000003633</pub-id><pub-id pub-id-type="medline">34150426</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Litake</surname><given-names>O</given-names> </name><name name-style="western"><surname>Park</surname><given-names>BH</given-names> </name><name name-style="western"><surname>Tully</surname><given-names>JL</given-names> </name><name name-style="western"><surname>Gabriel</surname><given-names>RA</given-names> </name></person-group><article-title>Constructing synthetic datasets with generative artificial intelligence to train large language models to classify acute renal failure from clinical notes</article-title><source>J Am Med Inform Assoc</source><year>2024</year><month>05</month><day>20</day><volume>31</volume><issue>6</issue><fpage>1404</fpage><lpage>1410</lpage><pub-id 
pub-id-type="doi">10.1093/jamia/ocae081</pub-id><pub-id pub-id-type="medline">38622901</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jung</surname><given-names>H</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Seo</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Clinical assessment of fine-tuned open-source LLMs in cardiology: from progress notes to discharge summary</article-title><source>J Healthc Inform Res</source><year>2025</year><month>12</month><volume>9</volume><issue>4</issue><fpage>686</fpage><lpage>702</lpage><pub-id pub-id-type="doi">10.1007/s41666-025-00203-x</pub-id><pub-id pub-id-type="medline">41230246</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Latif</surname><given-names>A</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>J</given-names> </name></person-group><article-title>Evaluation and analysis of large language models for clinical text augmentation and generation</article-title><source>IEEE Access</source><year>2024</year><volume>12</volume><fpage>48987</fpage><lpage>48996</lpage><pub-id pub-id-type="doi">10.1109/ACCESS.2024.3384496</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sufi</surname><given-names>F</given-names> </name></person-group><article-title>Addressing data scarcity in the medical domain: a GPT-based approach for synthetic data generation and feature 
extraction</article-title><source>Information</source><year>2024</year><month>05</month><volume>15</volume><issue>5</issue><fpage>264</fpage><pub-id pub-id-type="doi">10.3390/info15050264</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Benoit</surname><given-names>JRA</given-names> </name></person-group><article-title>ChatGPT for clinical vignette generation, revision, and evaluation</article-title><source>medRxiv</source><comment>Preprint posted online on  Feb 8, 2023</comment><pub-id pub-id-type="doi">10.1101/2023.02.04.23285478</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Kang</surname><given-names>A</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>JY</given-names> </name><name name-style="western"><surname>Lee-Youngzie</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Fu</surname><given-names>S</given-names> </name></person-group><article-title>Synthetic data generation with LLM for improved depression prediction</article-title><source>arXiv</source><comment>Preprint posted online on  Nov 26, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2411.17672</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Biswas</surname><given-names>A</given-names> </name><name name-style="western"><surname>Talukdar</surname><given-names>W</given-names> </name></person-group><article-title>Enhancing clinical documentation with synthetic data: leveraging generative models for improved accuracy</article-title><source>Int J Innov Sci Res Technol</source><year>2024</year><volume>9</volume><fpage>1553</fpage><lpage>1566</lpage><pub-id 
pub-id-type="doi">10.38124/ijisrt/IJISRT24MAY2085</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Alshaikhdeeb</surname><given-names>B</given-names> </name><name name-style="western"><surname>Hemedan</surname><given-names>AA</given-names> </name><name name-style="western"><surname>Ghosh</surname><given-names>S</given-names> </name><name name-style="western"><surname>Balaur</surname><given-names>I</given-names> </name><name name-style="western"><surname>Satagopam</surname><given-names>V</given-names> </name></person-group><article-title>Generation of synthetic clinical text: a systematic review</article-title><source>arXiv</source><comment>Preprint posted online on  Jul 24, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2507.18451</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Barr</surname><given-names>AA</given-names> </name><name name-style="western"><surname>Quan</surname><given-names>J</given-names> </name><name name-style="western"><surname>Guo</surname><given-names>E</given-names> </name><name name-style="western"><surname>Sezgin</surname><given-names>E</given-names> </name></person-group><article-title>Large language models generating synthetic clinical datasets: a feasibility and comparative analysis with real-world perioperative data</article-title><source>Front Artif Intell</source><year>2025</year><volume>8</volume><fpage>1533508</fpage><pub-id pub-id-type="doi">10.3389/frai.2025.1533508</pub-id><pub-id pub-id-type="medline">39974356</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="web"><article-title>Meredithcox/CleftNoteClassifier</article-title><source>GitHub</source><access-date>2026-04-28</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://github.com/meredithcox/CleftNoteClassifier">https://github.com/meredithcox/CleftNoteClassifier</ext-link></comment></nlm-citation></ref></ref-list></back></article>